In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, accuracy_score
import pickle
import numpy as np

%matplotlib inline

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

# Load and split the data
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

# Construct some pipelines
pipe_lr = Pipeline([('scl', StandardScaler()),
            ('clf', LogisticRegression(random_state=42))])

pipe_lr_pca = Pipeline([('scl', StandardScaler()),
            ('pca', PCA(n_components=2)),
            ('clf', LogisticRegression(random_state=42))])

pipe_rf = Pipeline([('scl', StandardScaler()),
            ('clf', RandomForestClassifier(random_state=42))])

pipe_rf_pca = Pipeline([('scl', StandardScaler()),
            ('pca', PCA(n_components=2)),
            ('clf', RandomForestClassifier(random_state=42))])

pipe_svm = Pipeline([('scl', StandardScaler()),
            ('clf', svm.SVC(random_state=42))])

pipe_svm_pca = Pipeline([('scl', StandardScaler()),
            ('pca', PCA(n_components=2)),
            ('clf', svm.SVC(random_state=42))])

# Set grid search params
param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
param_range_fl = [1.0, 0.5, 0.1]

grid_params_lr = [{'clf__penalty': ['l1', 'l2'],
        'clf__C': param_range_fl,
        'clf__solver': ['liblinear']}] 

grid_params_rf = [{'clf__criterion': ['gini', 'entropy'],
        'clf__min_samples_leaf': param_range,
        'clf__max_depth': param_range,
        'clf__min_samples_split': param_range[1:]}]

grid_params_svm = [{'clf__kernel': ['linear', 'rbf'], 
        'clf__C': param_range}]

# Construct grid searches
jobs = -1

gs_lr = GridSearchCV(estimator=pipe_lr,
            param_grid=grid_params_lr,
            scoring='accuracy',
            cv=10) 

gs_lr_pca = GridSearchCV(estimator=pipe_lr_pca,
            param_grid=grid_params_lr,
            scoring='accuracy',
            cv=10)

gs_rf = GridSearchCV(estimator=pipe_rf,
            param_grid=grid_params_rf,
            scoring='accuracy',
            cv=10, 
            n_jobs=jobs)

gs_rf_pca = GridSearchCV(estimator=pipe_rf_pca,
            param_grid=grid_params_rf,
            scoring='accuracy',
            cv=10, 
            n_jobs=jobs)

gs_svm = GridSearchCV(estimator=pipe_svm,
            param_grid=grid_params_svm,
            scoring='accuracy',
            cv=10,
            n_jobs=jobs)

gs_svm_pca = GridSearchCV(estimator=pipe_svm_pca,
            param_grid=grid_params_svm,
            scoring='accuracy',
            cv=10,
            n_jobs=jobs)

# List of pipelines for ease of iteration
grids = [gs_lr, gs_lr_pca, gs_rf, gs_rf_pca, gs_svm, gs_svm_pca]

# Dictionary of pipelines and classifier types for ease of reference
grid_dict = {0: 'Logistic Regression', 1: 'Logistic Regression w/PCA', 
        2: 'Random Forest', 3: 'Random Forest w/PCA', 
        4: 'Support Vector Machine', 5: 'Support Vector Machine w/PCA'}

# Fit the grid search objects
print('Performing model optimizations...')
best_acc = 0.0
best_clf = 0
best_gs = ''
for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])	
    # Fit grid search	
    gs.fit(X_train, y_train)
    # Best params
    print('Best params: %s' % gs.best_params_)
    # Best training data accuracy
    print('Best training accuracy: %.3f' % gs.best_score_)
    # Predict on test data with best params
    y_pred = gs.predict(X_test)
    # Test data accuracy of model with best params
    print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_pred))
    # Track best (highest test accuracy) model
    if accuracy_score(y_test, y_pred) > best_acc:
        best_acc = accuracy_score(y_test, y_pred)
        best_gs = gs
        best_clf = idx
print('\nClassifier with best test set accuracy: %s' % grid_dict[best_clf])

# Save best grid search pipeline to file
dump_file = 'best_gs_pipeline.pkl'
joblib.dump(best_gs, dump_file, compress=1)
print('\nSaved %s grid search pipeline to file: %s' % (grid_dict[best_clf], dump_file))

In [None]:
#
1. Load Data
2. Clean Data
3. Run PCA with different n_components to select the best
4. 

In [None]:
#Pipeline with PCA & GridSearch
#https://www.kaggle.com/gaborvecsei/pipeline-pca-gridsearch
    

In [None]:
#Add PCA Visualization
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)

pca_2d = PCA(n_components=2)
Z_2d = pca_2d.fit_transform(X_scaled)

fig, ax = plt.subplots(figsize=(10, 8))

ax.scatter(Z_2d[y_train==0, 0], Z_2d[y_train==0, 1], label='isFraud: 0')
ax.scatter(Z_2d[y_train==1, 0], Z_2d[y_train==1, 1], label='isFraud: 1')
ax.set(title='PCA 2-d projection', xlabel='Z[0]', ylabel='Z[1]')
ax.legend()


In [None]:
#PCA
from sklearn.model_selection import train_test_split

# 1. split into training and test sets
# 2. do PCA
# 3. fit on train set

X_train, X_test, y_train, y_test = train_test_split(X, y)

pca = PCA(n_components=150)
Z_train = pca.fit_transform(X_train)

model = LogisticRegression()
model_pca = LogisticRegression()

model.fit(X_train, y_train)
model_pca.fit(Z_train, y_train)

# 4. Compare scores on test set
Z_test = pca.transform(X_test) # not re-fitting PCA

print('Accuracy (PCA)', model_pca.score(Z_test, y_test))

print('Accuracy (no PCA)', model.score(X_test, y_test))

In [None]:
from sklearn.model_selection import learning_curve
def plot_learning_curve(model):
    # 3-fold cross validation to get learning curve R2 scores
    # using default train_sizes
    train_sizes, train_scores, val_scores = learning_curve(model,
                                                           Z_train, 
                                                           y_train, cv=3)

    # plot learning curve:
    #   plot train_scores vs. train_sizes
    #   plot val_scores vs. train_sizes
    # train_sizes is the number of training samples used for training

    fig, ax = plt.subplots()
    ax.plot(train_sizes, train_scores.mean(axis=1),
            label='train') # average for 5-folds
    ax.plot(train_sizes, val_scores.mean(axis=1),
            label='val')
    ax.legend()
    ax.set(title='Learning curve', xlabel='Train size', ylabel='R2')
    return train_sizes, train_scores, val_scores


In [3]:
from sklearn.svm import SVC, SVR, LinearSVC

# Cross Validate
# SGDRegressor.learning_rate
model = SVC(max_iter=1000, tol=1e-3,
                     random_state=8)

# - cross validate (trains models)
scores = cross_validate(model, Z_train, y_train, cv=5,
                        return_train_score=True, return_estimator=True)
# scores

# - learning curve (plot for overfit / underfit)
train_sizes, train_scores, val_scores = plot_learning_curve(model)


NameError: name 'cross_validate' is not defined

In [None]:
#Grid Search
# Use GridSearchCV to automate finding the best combination

from sklearn.model_selection import GridSearchCV

# Parameters to try:
# learning_rate='invscaling'
#  'constant' with eta0=1e-1
#  'constant' with eta0=1e-4,
#  'optimal' with default eta0
params = {
    'learning_rate' : ['invscaling', 'constant', 'optimal'],
    'eta0' : [0.01, 1e-1, 1e-4]
}

# consider setting n_jobs to run more in parallel if too slow
gs = GridSearchCV(model, params, cv=3, return_train_score=True)
gs.fit(Z_train, y_train)

print(gs.best_score_) # best score of 9 models
print(gs.best_params_) # best parameters
gs.best_estimator_ # best of 9 models (9 combinations of params)```

results_df = pd.DataFrame(gs.cv_results_)
results_df

In [None]:
#Predict
y_pred = model.predict(Z_test)
print(classification_report(y_test, y_pred))

ax = sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d')
ax.set(xlabel='Prediction', ylabel='Truth')

In [None]:
#Plot Learning Curve
fig, axes = plt.subplots(nrows=6, ncols=2, figsize=(10, 30))
axes = axes.flatten()

for i in range(len(axes)):
    params = results_df.params.values[i]
    model = SGDRegressor(max_iter=1000, tol=1e-3,
                         random_state=8,
                         learning_rate=params['learning_rate'],
                         eta0=params['eta0'])
    train_sizes, train_scores, val_scores = learning_curve(model,
                                                       Z_train, 
                                                       y_train, cv=3)

    ax = axes[i]
    ax.plot(train_sizes, train_scores.mean(axis=1), label='train')
    ax.plot(train_sizes, val_scores.mean(axis=1), label='val')
    ax.legend()
    ax.set(title=str(params), xlabel='Train size', ylabel='R2')  

In [None]:
#Plot ROC / AUC 
prob_sgd = sgd.decision_function(Z_test)
fpr_sgd, tpr_sgd, _ = roc_curve(y_test, prob_sgd)
auc_sgd = auc(fpr_sgd, tpr_sgd)

prob_svc = svc.decision_function(Z_test)
fpr_svc, tpr_svc, _ = roc_curve(y_test, prob_svc)
auc_svc = auc(fpr_svc, tpr_svc)

fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fpr_sgd, tpr_sgd, label='Logistic Regression (auc %.3f)' % auc_sgd)
ax.plot(fpr_svc, tpr_svc, label='SVC (auc %.3f)' % auc_svc)
ax.set(xlabel='False positive rate', ylabel='True positive rate', title='ROC curve')
ax.legend()

In [None]:
all_models = []

# all_models.append(('LR', (LogisticRegression(random_state=seed))))

all_models.append(('KNNC', KNeighborsClassifier()))
all_models.append(('KNNR', KNeighborsRegressor()))

for name, model in all_models:
    
    X = x_train
    scaler = StandardScaler()
    scaler.fit(X)
    
    Z = scaler.transform(X)
    
#    scaler = None
#    Z = X

#     kfold = model_selection.KFold(n_splits=3, random_state=seed)
#     cv_results = model_selection.cross_val_score(model, Z, mtr.getTarget(3), cv=kfold, scoring=scoring)
#     results.append(cv_results)
#     names.append(name)
#     msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
#     print(msg)
    
    oClassifier = MultiOutputClassifier(model, n_jobs=7)
    oClassifier.fit(Z, mtr.getTargets()) 
    print(oClassifier)
    s = oClassifier.score(Z, mtr.getTargets())
    if(oClassifier.score(Z, mtr.getTargets()) == 1.0):
        print( name, ' ', str(f), ' ', str(s))
    store_prediction(mtr, oClassifier, f, scaler=scaler, name=name)
    start = time.clock()
    print(str(f), " Time taken: ", (time.clock() - start),  " ")


