## Load Packages

In [1]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import rcParams
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.model_selection import KFold, cross_validate, learning_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge, SGDRegressor
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
# NOTE(review): the model-comparison cell also references XGBRegressor (xgboost),
# which this notebook never imports; add that import here if xgboost is installed.

%matplotlib inline

In [None]:
# Location of the training CSV, relative to the notebook's working directory.
v_input_datafile = "../input/train.csv"


In [None]:
# Load data as Pandas dataframe
# Optional Parametrs: header=None, names=col_names, sep='\s+', na_values=['?']
# col_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin', 'car name']
input_df = pd.read_csv(v_input_datafile, )
input_df.info()

#Pick features
features = df.columns[(df.columns != target) & 
                      (df.columns != 'day') &
                      (df.columns != 'month')]


In [None]:
#Drop NA
#df[df.isna().any(axis=1)]

# df.isna().any(axis=1) returns True if a row has missing values
#
# mask = df.isna().any(axis=1)
# df[mask] return only the True rows (masking)


print( "Before dropping NA: ", input_df.shape )
df = input_df.dropna()
print ( "After dropping NA: ", df.shape )

#Drop Columns
drop_lst = ['casual', 'registered']
df = df.drop(drop_lst, axis=1)
df.head()

#Convert date time
df['datetime'] = pd.to_datetime(df['datetime'])
df['dow'] = df['datetime'].dt.dayofweek
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['week'] = df['datetime'].dt.week
df['hour'] = df['datetime'].dt.hour
df['day'] = df['datetime'].dt.day
df = df.set_index(df['datetime'])
df = df.drop(labels='datetime', axis=1)

In [None]:
# For each feature, list its unique values and how often each occurs
#df['Title'].value_counts()
# df['horsepower']=pd.to_numeric(df['horsepower'], errors='coerce')  #Convert non numeric values to NAN

In [None]:
# Side-by-side comparison of how often each season occurs in 2011 and 2012.
fig, ax = plt.subplots(1, 2, figsize=(12, 4))

for panel, yr in zip(ax, (2011, 2012)):
    # value_counts() orders by frequency, not by season id, so sort by index and
    # take the labels from the data; hard-coded labels could end up under the
    # wrong bars.
    values = df.loc[df['year'] == yr, 'season'].value_counts().sort_index()
    names = [str(season) for season in values.index]
    panel.bar(names, values)
    panel.set(title=str(yr), xlabel='season', ylabel='rows')

fig.suptitle('Seasons in 2011 & 2012');

# Rows per year. Drawn on its own figure: a bare plt.bar() here would paint over
# the right-hand season panel, which is still the current axes.
values = df['year'].value_counts().sort_index()
names = [str(yr) for yr in values.index]
year_fig, year_ax = plt.subplots()
year_ax.bar(names, values)
year_ax.set(title='Rows per year', xlabel='year', ylabel='rows');

In [None]:
# Overlay the 2011 and 2012 temperature histograms on a single chart;
# alpha=0.5 keeps both distributions visible where they overlap.
for yr in (2011, 2012):
    plt.hist(df.loc[df['year'] == yr, 'temp'], alpha=0.5, label=str(yr))

plt.legend(loc='upper right');

In [None]:
# Pairwise relationships between all columns, with a regression fit per panel.
sns.pairplot(df, kind='reg')

# Distribution of `count`, raw and log-transformed, on a dedicated figure.
# A bare plt.hist() here would draw into the pairplot grid's current axes, and
# the second histogram would land on top of the first.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(12, 4))
ax_raw.hist(df['count'])
ax_raw.set(title='count', xlabel='count', ylabel='frequency')

count_log = np.log(df['count'])
ax_log.hist(count_log)
ax_log.set(title='log(count)', xlabel='log(count)', ylabel='frequency')

# Box-Cox transform to bring the distribution closer to normal.
# `stats` is scipy.stats, imported in the import cell. boxcox also returns the
# fitted lambda, kept here so the transform can be inverted later.
count_boxcox, boxcox_lambda = stats.boxcox(df['count'])
count_boxcox[:5]  # preview only, not the whole array


In [None]:
# Point plot of count against temp; the figure is widened so the many
# tick labels on the x-axis stay readable.
fig = sns.pointplot(x=df['temp'], y=df['count']).figure
fig.set_size_inches(30, 12);

In [None]:
# Correlation matrix, computed once and restricted to numeric/bool columns
# (DataFrame.corr() raises on string columns in pandas >= 2.0).
cor_mat = df.select_dtypes(include=['number', 'bool']).corr()

# Heatmap 1: full matrix.
# fmt examples: 'd' integer, 'f' floating point, '.3f' three decimal places
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cor_mat, annot=True, fmt='.2f', ax=ax)

# Heatmap 2: lower triangle (with diagonal) only.
# The mask is an explicit boolean array, True = hide, covering the strict upper
# triangle. A fresh figure is created so this heatmap is not drawn over the
# first one.
mask = np.triu(np.ones(cor_mat.shape, dtype=bool), k=1)
fig, ax = plt.subplots(figsize=(30, 12))
sns.heatmap(data=cor_mat, mask=mask, square=True, annot=True, cbar=True, ax=ax);


In [None]:
# Fill every NaN by linear interpolation between neighbouring rows
# ('linear' is the pandas default; it is spelled out here for the reader).
df1 = df.interpolate(method='linear')
df1.info()


In [None]:
# Show which horsepower values were filled in by interpolation.

# Index labels of every row of df that contains at least one NaN.
index = df[df.isna().any(axis=1)].index

# `index` holds labels, not positions, so the lookup must use .loc. With .iloc
# the wrong rows are selected (or the call fails) whenever the index is not a
# gap-free 0..n-1 range.
interpolated = df1.loc[index, 'horsepower']

# Plot the original values, then overlay the interpolated ones.
fig, ax = plt.subplots()
ax.scatter(df.index, df['horsepower'], label='original')
ax.set(xlabel='index', ylabel='horsepower')

ax.scatter(index, interpolated, label='interp')
ax.legend();

In [None]:
# PCA (Principal Component Analysis)
# Collapse all features into a single principal component so the whole feature
# matrix can be drawn against the target (y) on an ordinary 2-d scatter plot.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# rows with missing values are removed from df before fitting
df.dropna(inplace=True)
X = df.loc[:, ~df.columns.isin(['mpg'])]
y = df['mpg']

# Standardize first: PCA is driven by covariance, which is sensitive to the
# scale of each column.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# One component only, because the goal is a 2-d plot of projection vs. y.
pca = PCA(n_components=1)
Z = pca.fit(X_scaled).transform(X_scaled)

print('before', X.shape, 'after', Z.shape)

# Scatter the 1-d projection against y to explore the relationship.
fig, ax = plt.subplots()
ax.scatter(Z, y)
ax.set(title='PCA of data vs. mpg',
       xlabel='1-d PCA projection of data',
       ylabel='mpg')



In [None]:
#Plot 3D
%matplotlib inline
#%matplotlib notebook

from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

pca2 = PCA(n_components=2)
pca2.fit(X_scaled)
Z2 = pca2.transform(X_scaled)

# plot Z2[0], Z2[1] vs y in 3-d
ax.scatter(Z2[:, 0], Z2[:, 1], y)
ax.set_xlabel('dimension 0')
ax.set_ylabel('dimension 1')
ax.set_zlabel('y')
plt.show()

In [None]:
# Swarm plot of Fare for each Sex category, with points coloured by Survived.
sns.swarmplot(data=df, x='Sex', y='Fare', hue='Survived')
plt.show()

In [None]:
# Impute missing Fare values with the median Fare of the row's Pclass group.
class_fares = dict(df.groupby('Pclass')['Fare'].median())

# Map each row's Pclass to its group median and use that wherever Fare is NaN.
# The result is assigned back to the column: fillna(..., inplace=True) on a
# column selection is chained assignment, which leaves df unchanged under pandas
# copy-on-write. No temporary helper column is needed.
df['Fare'] = df['Fare'].fillna(df['Pclass'].map(class_fares))

# Mean of Survived per Embarked value, plotted before Embarked is imputed.
sns.catplot(x='Embarked', y='Survived', data=df,
            kind='bar', palette='muted', ci=None)
plt.show()

# Impute missing Embarked values with the next valid value (backfill).
# Series.bfill() replaces the deprecated fillna(method='backfill').
df['Embarked'] = df['Embarked'].bfill()

In [None]:
# helper function to reduce coding
def plot_learning_curve(model, X=None, y=None, cv=3):
    """Compute and plot a cross-validated learning curve for ``model``.

    Parameters
    ----------
    model : estimator
        Unfitted estimator passed straight to ``learning_curve``.
    X, y : array-like, optional
        Training features and target. When omitted, the notebook-level
        ``Z_train`` and ``y_train`` are used, which keeps existing
        ``plot_learning_curve(model)`` calls working.
    cv : int, default 3
        Number of cross-validation folds.

    Returns
    -------
    train_sizes, train_scores, val_scores
        Exactly what ``learning_curve`` returns: the training-set sizes used,
        and one row of per-fold scores for each size.
    """
    # Fall back to the notebook globals only when the caller gave no data, so
    # the dependency on hidden state is explicit and optional.
    if X is None:
        X = Z_train
    if y is None:
        y = y_train

    # Default train_sizes; scores come from the estimator's default scorer.
    train_sizes, train_scores, val_scores = learning_curve(model, X, y, cv=cv)

    # Plot the fold-averaged train and validation scores against the number
    # of training samples used.
    fig, ax = plt.subplots()
    ax.plot(train_sizes, train_scores.mean(axis=1),
            label='train')  # mean over the cv folds
    ax.plot(train_sizes, val_scores.mean(axis=1),
            label='val')
    ax.legend()
    ax.set(title='Learning curve', xlabel='Train size', ylabel='R2')
    return train_sizes, train_scores, val_scores


In [None]:
# Experiment: SGDRegressor learning-rate schedule.
# The default schedule was 'invscaling'. Variants to try:
#   - 'constant' with eta0=1e-1
#   - 'constant' with eta0=1e-4
#   - 'optimal' with the default eta0

# 1. configure the variant under test
model = SGDRegressor(learning_rate='optimal',
                     max_iter=1000, tol=1e-3,
                     random_state=8)

# 2. cross validate (this trains one model per fold and keeps the estimators)
scores = cross_validate(model, Z_train, y_train, cv=5,
                        return_estimator=True, return_train_score=True)
# scores

# 3. learning curve, to judge overfitting / underfitting
train_sizes, train_scores, val_scores = plot_learning_curve(model)

In [None]:
def _poly_scaled(step_name, estimator):
    """Build the shared pipeline: polynomial features -> scaling -> estimator."""
    return Pipeline([('poly', PolynomialFeatures()),
                     ('Scaler', StandardScaler()),
                     (step_name, estimator)])


# Candidate models. Plain linear regression is the only one without the
# polynomial expansion step.
pipelines = [
    ('ScaledLR', Pipeline([('Scaler', StandardScaler()), ('LR', LinearRegression())])),
    ('ScaledLASSO', _poly_scaled('LASSO', Lasso(random_state=42))),
    ('ScaledRID', _poly_scaled('RID', Ridge(random_state=42))),
    ('ScaledKNN', _poly_scaled('KNN', KNeighborsRegressor(n_neighbors=2))),
    ('ScaledCART', _poly_scaled('CART', DecisionTreeRegressor(random_state=42))),
    ('ScaledGBM', _poly_scaled('GBM', GradientBoostingRegressor(random_state=42))),
    ('ScaledRFR', _poly_scaled('RFR', RandomForestRegressor(random_state=42))),
    ('ScaledSVR', _poly_scaled('SVR', SVR(kernel='linear'))),
]

# XGBRegressor belongs to xgboost, which this notebook never imports. Include the
# pipeline only when the name is in scope, so the rest of the comparison still runs.
try:
    pipelines.append(('ScaledXGBR', _poly_scaled('XGBR', XGBRegressor(random_state=42))))
except NameError:
    print('ScaledXGBR skipped: XGBRegressor is not imported')

# One splitter for every model, so all of them are scored on identical folds.
# random_state is only accepted together with shuffle=True; current scikit-learn
# raises ValueError otherwise.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): X_train / y_train are not created anywhere in this notebook.
# NOTE(review): neg_mean_squared_log_error is undefined for negative values, so a
# model that predicts below zero will fail here - confirm on real data.
results = []
names = []
for name, pipeline in pipelines:
    cv_results = -cross_val_score(pipeline, X_train, y_train, cv=kfold,
                                  scoring='neg_mean_squared_log_error')
    rmsle = np.sqrt(cv_results)
    results.append(rmsle)
    names.append(name)
    # Print the same metric that is stored in `results` (RMSLE); the printout
    # used to show the un-rooted MSLE.
    msg = "{}: {} ({})".format(name, rmsle.mean(), rmsle.std())
    print(msg)

In [None]:
def precision_recall_from_cm(cm):
    """Per-class precision and recall from a square confusion matrix.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix with true classes on the rows and predicted classes on
        the columns (the layout ``confusion_matrix`` produces).

    Returns
    -------
    precision, recall : numpy.ndarray of shape (n_classes,)
        A class that is never predicted gets precision 0.0, and a class with no
        true samples gets recall 0.0, instead of nan from a division by zero.
    """
    cm = np.asarray(cm, dtype=float)
    hits = np.diag(cm)           # correctly classified samples per class
    actual = cm.sum(axis=1)      # row totals: samples that truly belong to the class
    predicted = cm.sum(axis=0)   # column totals: samples assigned to the class
    # `where=` skips zero denominators; `out=` supplies the 0.0 used in their place.
    recall = np.divide(hits, actual, out=np.zeros_like(hits), where=actual != 0)
    precision = np.divide(hits, predicted, out=np.zeros_like(hits), where=predicted != 0)
    return precision, recall


# Template inputs: replace the placeholders with the true labels and the model's
# predictions before running.
labels = ...
predictions = ...

# confusion_matrix and np come from the import cell at the top of the notebook.
if labels is ... or predictions is ...:
    # Unfilled template: skip instead of raising inside confusion_matrix.
    print('Set `labels` and `predictions` to compute precision / recall.')
else:
    cm = confusion_matrix(labels, predictions)
    precision, recall = precision_recall_from_cm(cm)