# This notebook outlines the final model tuning and set of predictions that ML Marauders have made for CS 181 Practical 1

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pickle
from sklearn.linear_model import Ridge, LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split, KFold
from sklearn.metrics import mean_squared_error

## Load and gently process the data (much of the preprocessing was done in FINAL.ipynb)

In [2]:
# df_train = pd.read_csv('sam_data/rdk_feat_eng_whole_df_train_orig_features.csv')
# df_test = pd.read_csv('sam_data/rdk_feat_eng_whole_df_test_orig_features.csv')
df_train = pd.read_csv('final_data/FINAL_train.csv')
df_test = pd.read_csv('final_data/FINAL_test.csv')
df_train.head()

Unnamed: 0,num_branches,has_benzothiophene,has_carbazole,has_fluorene,num_double_bonds,avg_molecular_weight,exact_molecular_weight,avg_molecular_weight_ignore_hydrogen,num_valence_electrons,num_radical_electrons,...,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
0,3,0,0,0,0,470.462,470.907296,461.39,130,0,...,1,0,0,0,0,0,0,0,0,1.19
1,1,0,0,0,5,352.545,352.085202,336.417,118,0,...,1,0,0,1,0,0,0,0,0,1.6
2,2,0,0,0,1,399.576,399.032016,386.472,128,0,...,1,0,0,0,1,0,0,0,0,1.49
3,1,0,0,0,4,379.567,379.084867,362.431,128,0,...,1,0,0,0,1,0,0,0,0,1.36
4,1,0,0,0,0,396.391,396.042944,388.327,136,0,...,1,0,0,0,0,0,0,0,0,1.98


In [3]:
df_test.head()

Unnamed: 0,Id,num_branches,has_benzothiophene,has_carbazole,has_fluorene,num_double_bonds,avg_molecular_weight,exact_molecular_weight,avg_molecular_weight_ignore_hydrogen,num_valence_electrons,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,1,2,0,0,0,0,409.499,409.045587,398.411,136,...,0,1,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,352.469,351.991109,344.405,110,...,0,1,0,0,0,0,0,0,0,0
2,3,1,0,0,0,2,514.569,514.948537,501.465,146,...,0,1,0,0,0,0,0,0,0,0
3,4,2,0,0,0,4,376.491,376.10319,360.363,132,...,0,1,0,0,0,0,0,0,0,0
4,5,3,0,0,0,0,569.637,569.844956,559.557,154,...,0,1,0,0,0,0,0,0,0,0


In [4]:
# Using StandardScaler to normalize non-binary columns
scaler = StandardScaler()
binary_cols = ['feat_%03d' % ii for ii in range(1,257)]
binary_cols.append('has_benzothiophene')
binary_cols.append('has_carbazole')
binary_cols.append('has_fluorene')
binary_cols.append('smiles')
binary_cols.append('Id')
non_binary_cols = np.array([col for col in df_test.columns if col not in binary_cols]).flatten()

df_train[non_binary_cols] = scaler.fit_transform(df_train[non_binary_cols])
df_test[non_binary_cols] = scaler.transform(df_test[non_binary_cols])

In [5]:
# Drop the 'smiles' and 'Id' columns
df_train = df_train.drop(['smiles'], axis=1)
df_test = df_test.drop(['Id'], axis=1)
df_test = df_test.drop(['smiles'], axis=1)

# Store gap values
Y_train = df_train.gap.values

# Delete 'gap' column
df_train = df_train.drop(['gap'], axis=1)
X_train = df_train.values
X_test = df_test.values
print "Train features:", X_train.shape, "Train gap:", Y_train.shape
print "Test features:", X_test.shape

Train features: (999997, 369) Train gap: (999997,)
Test features: (824230, 369)


In [6]:
# Split training data into training and validation sets as well as begin some k-fold CV
cross_X_train, cross_X_valid, cross_Y_train, cross_Y_valid = train_test_split(X_train, Y_train, test_size=0.3, random_state=42)

#### For classification purposes, round target values to nearest .5

In [7]:
# Round to nearest integer
Y_clf_train, Y_clf_valid = np.round(cross_Y_train), np.round(cross_Y_valid)
Y_full_clf_train = np.round(Y_train)
# Round to nearest .5
# Y_clf_train, Y_clf_valid = (((np.round(2*cross_Y_train)/2.0)-0.5)/0.5).astype(int), (((np.round(2*cross_Y_valid)/2.0)-0.5)/0.5).astype(int)
# Y_full_clf_train = (((np.round(2*Y_train)/2.0)-.5)/.5).astype(int)
# Round to nearest .25
# Y_clf_train, Y_clf_valid = (((np.round(4*cross_Y_train)/4.0)-.25)/.25).astype(int), (((np.round(4*cross_Y_valid)/4.0)-.25)/.25).astype(int)
# Y_full_clf_train = (((np.round(4*Y_train)/4.0)-.25)/.25).astype(int)

In [8]:
print "'Training' features: ", cross_X_train.shape
print "'Validate' features: ", cross_X_valid.shape

'Training' features:  (699997, 369)
'Validate' features:  (300000, 369)


# GOAL:

This notebook is set-up to chain together classification and regression methods. The thought is that we can, after we've trained the two models, to first apply a classifier to the data (in a clustering kind of sense) and then use the category or neighborhood that the sample is assigned as an additional feature to perform regression. Here the category or label will be the closest integer to the gap value. The idea behind this is to hijack the regression into a local region of the expected HOMO-LUMO gap based on the label. The hope is that this will pin the regressor closer to the right value. 

It's imperative that we get as accurate of a classifier as we can.

Fingers crossed.

### First: Let's build a classifier that will adequately label the samples

We'll start with Logistic Regression and try to fit the best model using a collection of C values

In [9]:
%%time

logReg_training_acc = 0
logReg_test_acc = 0

Cs = [0.001, 0.01, 0.1, 1.0, 10.0]

for c in Cs:
    clf_logReg=LogisticRegression(penalty="l2",C=c, solver='lbfgs')
    clf_logReg.fit(cross_X_train,Y_clf_train)
    training_acc = clf_logReg.score(cross_X_train,Y_clf_train)
    test_acc = clf_logReg.score(cross_X_valid,Y_clf_valid)
    print c, test_acc
    if logReg_test_acc < test_acc:
        logReg_test_acc = test_acc
        logReg_training_acc = training_acc
        best_logReg = clf_logReg

0.001 0.660916666667
0.01 0.669706666667
0.1 0.67299
1.0 0.67306
10.0 0.673676666667
CPU times: user 39min 47s, sys: 38.2 s, total: 40min 25s
Wall time: 23min 21s


In [10]:
logReg_training_acc = best_logReg.score(cross_X_train,Y_clf_train)
logReg_test_acc = best_logReg.score(cross_X_valid,Y_clf_valid)
print "Training Accuracy: %0.3f" % logReg_training_acc
print "Test Accuracy: %0.3f" % logReg_test_acc

Training Accuracy: 0.673
Test Accuracy: 0.674


#### Concatenate predicted labels onto test/validation set

In [15]:
Y_clf_pred = best_logReg.predict(cross_X_valid)
X_train_clf = np.vstack((cross_X_train.T,Y_clf_train)).T
X_valid_clf = np.vstack((cross_X_valid.T,Y_clf_valid)).T

## Now onto ExtraTrees, fronted with PCA

In [16]:
%%time
pcaExtraTrees_RMSE = 100

pca_components = [40, 45, 60]
num_estimators = [100, 200]

for comps in pca_components:
    pca = PCA(n_components=comps)
    X_train_tr = pca.fit_transform(X_train_clf)
    X_valid_tr = pca.transform(X_valid_clf)
    
    for n_estimators in num_estimators:
        
        extratrees_clf = ExtraTreesRegressor(n_estimators=n_estimators,n_jobs=-1)
        extratrees_clf.fit(X_train_tr,cross_Y_train)
        y_pred = extratrees_clf.predict(X_valid_tr)
        
        RMSE = np.sqrt(mean_squared_error(cross_Y_valid,y_pred))
        if RMSE < pcaExtraTrees_RMSE:
            print comps, n_estimators
            print RMSE
            pcaExtraTrees_RMSE = RMSE
            pcaExtraTrees_estimators = n_estimators
            pcaExtraTrees_components = comps
            best_pcaExtraTrees = extratrees_clf
            
print "PCA with {0} components chained ExtraTrees with {1} estimators had RMSE of {2}".format(pcaExtraTrees_components,pcaExtraTrees_estimators,pcaExtraTrees_RMSE)        

40 100
0.100909833779
40 200
0.100748042017
45 100
0.100666057888
45 200
0.100531872291
PCA with 45 components chained ExtraTrees with 200 estimators had RMSE of 0.100531872291
CPU times: user 5h 28min 18s, sys: 4min 11s, total: 5h 32min 30s
Wall time: 1h 30min 55s


## We're also going to tune a Ridge Regression to have double coverage

In [17]:
%%time
ridge_RMSE = 100
alphas = np.logspace(-4, 1, 30)

for alpha in alphas:
    ridge_clf = Ridge(alpha=alpha)
    ridge_clf.fit(X_train_clf, cross_Y_train)
    y_pred = ridge_clf.predict(X_valid_clf)
    
    RMSE = np.sqrt(mean_squared_error(cross_Y_valid,y_pred))
    if RMSE < ridge_RMSE:
        ridge_RMSE = RMSE
        ridge_alpha = alpha
        best_ridge = ridge_clf
        
print "Ridge RMSE: {0} with alpha: {1}".format(ridge_RMSE,ridge_alpha)

Ridge RMSE: 0.117605336561 with alpha: 0.0117210229753
CPU times: user 4min 25s, sys: 29.6 s, total: 4min 55s
Wall time: 2min 23s


In [20]:
with open('final_classifier_and_regressors.pkl','w') as f:
    pickle.dump((best_logReg,best_pcaExtraTrees,pcaExtraTrees_components,best_ridge),f)

In [8]:
# with open('final_classifier_and_regressors.pkl','r') as fopen:
#     best_logReg, best_pcaExtraTrees, pcaExtraTrees_components, best_ridge = pickle.load(fopen)
    
# After opening this, you may need to re-configure the test and training set, that is if you have to restart the kernel

## Train on full training set, run on full test set

Here we will train the classifier and the regressions

In [9]:
def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            f.write(str(i+1) + "," + str(p) + "\n")

### Logistic Regression

In [10]:
%%time
# Train classifier
# best_logReg.fit(X_train,Y_full_clf_train)

# Run classifier on test set
label_pred = best_logReg.predict(X_test)

# Concatenate full training labels to full test set
X_full_train_clf = np.vstack((X_train.T,Y_full_clf_train)).T
# Concatenate predicted labels onto test set as a new feature
X_test_clf = np.vstack((X_test.T,label_pred)).T

print "Completed Classification of test set"

Completed Classification of test set
CPU times: user 13min 43s, sys: 23.3 s, total: 14min 7s
Wall time: 8min 21s


### ExtraTrees

In [11]:
%%time

# Train ExtraTrees Regressor
pca = PCA(n_components=pcaExtraTrees_components)
X_full_train_tr = pca.fit_transform(X_full_train_clf)
best_pcaExtraTrees.fit(X_full_train_tr,Y_train)

# Run ExtraTrees Regressor
X_test_clf_tr = pca.transform(X_test_clf)
pcaExtraTrees_pred = best_pcaExtraTrees.predict(X_test_clf_tr)

# Save the ExtraTrees Predictions
write_to_file("pca_extraTrees_FINAL_TWK_10Feb.csv",pcaExtraTrees_pred)

print "Completed ExtraTrees Regression"

Completed ExtraTrees Regression
CPU times: user 1h 53min 31s, sys: 1min 37s, total: 1h 55min 8s
Wall time: 32min 15s


### Ridge Regression

In [12]:
# Train Ridge Regression
best_ridge.fit(X_full_train_clf,Y_train)

# Run Ridge Regressor
ridge_pred = best_ridge.predict(X_test_clf)

# Save the Ridge Predictions
write_to_file("ridge_FINAL_TWK_10Feb.csv",ridge_pred)

Save the data