# This notebook outlines the final model tuning and set of predictions that ML Marauders have made for CS 181 Practical 1

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pickle
from sklearn.linear_model import Ridge, LogisticRegression, LogisticRegressionCV, Lasso
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split, KFold
from sklearn.metrics import mean_squared_error

## Load and gently process the data (much of the preprocessing was done in FINAL.ipynb)

In [2]:
# df_train = pd.read_csv('sam_data/rdk_feat_eng_whole_df_train_orig_features.csv')
# df_test = pd.read_csv('sam_data/rdk_feat_eng_whole_df_test_orig_features.csv')
# df_train = pd.read_csv('final_data/FINAL_train.csv')
# df_test = pd.read_csv('final_data/FINAL_test.csv')
df_train = pd.read_csv('FINAL_interactions/FINAL_train_25_interactions.csv')
df_test = pd.read_csv('FINAL_interactions/FINAL_test_25_interactions.csv')
df_train.head()

Unnamed: 0,num_branches,has_benzothiophene,has_carbazole,has_fluorene,num_double_bonds,avg_molecular_weight,exact_molecular_weight,avg_molecular_weight_ignore_hydrogen,num_valence_electrons,num_radical_electrons,...,num_aromatic_heterocycles-fr_allylic_oxid,num_aromatic_heterocycles-num_aromatic_carbocycles,num_aliphatic_heterocycles-feat_251,num_aliphatic_heterocycles-fr_allylic_oxid,num_aliphatic_heterocycles-num_aromatic_carbocycles,feat_251-fr_allylic_oxid,feat_251-num_aromatic_carbocycles,fr_allylic_oxid-num_aromatic_carbocycles,smiles,gap
0,3,0,0,0,0,470.462,470.907296,461.39,130,0,...,0,0,0,0,0,0,0,0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,1.19
1,1,0,0,0,5,352.545,352.085202,336.417,118,0,...,8,2,2,8,2,4,1,4,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1.6
2,2,0,0,0,1,399.576,399.032016,386.472,128,0,...,3,6,0,1,2,0,0,2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1.49
3,1,0,0,0,4,379.567,379.084867,362.431,128,0,...,0,4,0,0,4,0,0,0,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1.36
4,1,0,0,0,0,396.391,396.042944,388.327,136,0,...,0,10,0,0,0,0,0,0,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,1.98


In [3]:
df_test.head()

Unnamed: 0,Id,num_branches,has_benzothiophene,has_carbazole,has_fluorene,num_double_bonds,avg_molecular_weight,exact_molecular_weight,avg_molecular_weight_ignore_hydrogen,num_valence_electrons,...,num_aromatic_heterocycles-num_aliphatic_heterocycles,num_aromatic_heterocycles-feat_251,num_aromatic_heterocycles-fr_allylic_oxid,num_aromatic_heterocycles-num_aromatic_carbocycles,num_aliphatic_heterocycles-feat_251,num_aliphatic_heterocycles-fr_allylic_oxid,num_aliphatic_heterocycles-num_aromatic_carbocycles,feat_251-fr_allylic_oxid,feat_251-num_aromatic_carbocycles,fr_allylic_oxid-num_aromatic_carbocycles
0,1,2,0,0,0,0,409.499,409.045587,398.411,136,...,0,0,0,10,0,0,0,0,0,0
1,2,0,0,0,0,0,352.469,351.991109,344.405,110,...,0,0,0,5,0,0,0,0,0,0
2,3,1,0,0,0,2,514.569,514.948537,501.465,146,...,4,0,0,12,0,0,3,0,0,0
3,4,2,0,0,0,4,376.491,376.10319,360.363,132,...,3,0,0,6,0,0,2,0,0,0
4,5,3,0,0,0,0,569.637,569.844956,559.557,154,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Drop the 'smiles' and 'Id' columns
df_train = df_train.drop(['smiles'], axis=1)
df_test = df_test.drop(['Id'], axis=1)
df_test = df_test.drop(['smiles'], axis=1)

# Store gap values
Y_train = df_train.gap.values

# Delete 'gap' column
df_train = df_train.drop(['gap'], axis=1)
X_train = df_train.values
X_test = df_test.values
print "Train features:", X_train.shape, "Train gap:", Y_train.shape
print "Test features:", X_test.shape

Train features: (999997, 669) Train gap: (999997,)
Test features: (824230, 669)


In [5]:
# Split training data into training and validation sets as well as begin some k-fold CV
cross_X_train, cross_X_valid, cross_Y_train, cross_Y_valid = train_test_split(X_train, Y_train, test_size=0.33, random_state=42)

#### For classification purposes, round target values to nearest .5

In [6]:
# Round to nearest integer
# cross_Y_train_labels, cross_Y_valid_labels = np.round(cross_Y_train), np.round(cross_Y_valid)
# Y_train_labels = np.round(Y_train)
# Round to nearest .5
cross_Y_train_labels, cross_Y_valid_labels = (((np.round(2*cross_Y_train)/2.0)-0.5)/0.5).astype(int), (((np.round(2*cross_Y_valid)/2.0)-0.5)/0.5).astype(int)
Y_train_labels = (((np.round(2*Y_train)/2.0)-.5)/.5).astype(int)
# Round to nearest .25
# cross_Y_train_labels, cross_Y_valid_labels = (((np.round(4*cross_Y_train)/4.0)-.25)/.25).astype(int), (((np.round(4*cross_Y_valid)/4.0)-.25)/.25).astype(int)
# Y_train_labels = (((np.round(4*Y_train)/4.0)-.25)/.25).astype(int)

In [7]:
print "'Training' features: ", cross_X_train.shape
print "'Validate' features: ", cross_X_valid.shape

'Training' features:  (669997, 669)
'Validate' features:  (330000, 669)


# GOAL:

This notebook is set-up to chain together classification and regression methods. The thought is that we can, after we've trained the two models, to first apply a classifier to the data (in a clustering kind of sense) and then use the category or neighborhood that the sample is assigned as an additional feature to perform regression. Here the category or label will be the closest integer to the gap value. The idea behind this is to hijack the regression into a local region of the expected HOMO-LUMO gap based on the label. The hope is that this will pin the regressor closer to the right value. 

It's imperative that we get as accurate of a classifier as we can.

Fingers crossed.

### First: Let's build a classifier that will adequately label the samples

We'll start with Logistic Regression and try to fit the best model using a collection of C values

In [8]:
%%time

logReg_training_acc = 0
logReg_test_acc = 0

Cs = [0.1, 1.0]

for c in Cs:
    clf_logReg=LogisticRegression(penalty="l2",C=c, solver='lbfgs')
    clf_logReg.fit(cross_X_train,cross_Y_train_labels)
    training_acc = clf_logReg.score(cross_X_train,cross_Y_train_labels)
    test_acc = clf_logReg.score(cross_X_valid,cross_Y_valid_labels)
    print c, test_acc
    if logReg_test_acc < test_acc:
        logReg_test_acc = test_acc
        logReg_training_acc = training_acc
        best_logReg = clf_logReg

0.1 0.595712121212
1.0 0.589433333333
CPU times: user 32min 28s, sys: 25.7 s, total: 32min 54s
Wall time: 17min 21s


In [9]:
logReg_training_acc = best_logReg.score(cross_X_train,cross_Y_train_labels)
logReg_test_acc = best_logReg.score(cross_X_valid,cross_Y_valid_labels)
print "Training Accuracy: %0.3f" % logReg_training_acc
print "Test Accuracy: %0.3f" % logReg_test_acc

Training Accuracy: 0.594
Test Accuracy: 0.596


#### Concatenate predicted labels onto test/validation set

In [10]:
def adding_labels(feat_matrix,labels,label_dim):
    '''Helper function that creates sparse binary array to concatenate to feat_matrix'''
    # Create empty matrix 
    added_cols = np.zeros((labels.shape[0],label_dim))
    # Increment entry that corresponds to the sample having the specified label
    for ii in xrange(labels.shape[0]):
        added_cols[ii,labels[ii]] = 1
    # Concatenate label columns to feat_matrix
    feat_matrix = np.concatenate((feat_matrix,added_cols),axis=1)
    return feat_matrix

In [11]:
if np.max(cross_Y_train_labels) >= np.max(cross_Y_valid_labels):
    max_label = np.max(cross_Y_train_labels)+1
else:
    max_label = np.max(cross_Y_valid_labels)+1

cross_X_train = adding_labels(cross_X_train,cross_Y_train_labels,max_label)

cross_X_valid = adding_labels(cross_X_valid,cross_Y_valid_labels,max_label)

## Now generating a Random Forest Regression

In [12]:
%%time
RF_RMSE = 100
num_estimators = [64, 96, 128]

    
for n_estimators in num_estimators:

    rf_reg = RandomForestRegressor(n_estimators=n_estimators,max_features='sqrt', n_jobs=3)
    rf_reg.fit(cross_X_train,cross_Y_train)
    y_pred = rf_reg.predict(cross_X_valid)

    RMSE = np.sqrt(mean_squared_error(cross_Y_valid, y_pred))
    if RMSE < RF_RMSE:
        print n_estimators, RMSE
        RF_RMSE = RMSE
        RF_estimators = n_estimators
        best_RF = rf_reg
            
print "RandomForest with {0} estimators had RMSE of {1}".format(RF_estimators,RF_RMSE)        

64 0.098313714104
96 0.0978026272387
RandomForest with 96 estimators had RMSE of 0.0978026272387
CPU times: user 58min 51s, sys: 49 s, total: 59min 40s
Wall time: 21min 29s


## We're also going to tune a typical Linear Regression to have double coverage (either Ridge or Lasso)

In [13]:
%%time
lasso_RMSE = 100
alphas = np.logspace(-4, -1, 5)

for alpha in alphas:
    lasso_reg = Lasso(alpha=alpha)
    lasso_reg.fit(cross_X_train, cross_Y_train)
    y_pred = lasso_reg.predict(cross_X_valid)
    
    RMSE = np.sqrt(mean_squared_error(cross_Y_valid,y_pred))
    if RMSE < lasso_RMSE:
        print alpha, RMSE
        lasso_RMSE = RMSE
        lasso_alpha = alpha
        best_lasso = lasso_reg
        
print "Lasso RMSE: {0} with alpha: {1}".format(lasso_RMSE,lasso_alpha)

0.0001 0.109691148413
Lasso RMSE: 0.109691148413 with alpha: 0.0001
CPU times: user 57min 53s, sys: 1min 3s, total: 58min 57s
Wall time: 59min 17s




In [None]:
# %%time
# ridge_RMSE = 100
# alphas = np.logspace(-4, 1, 30)

# for alpha in alphas:
#     ridge_clf = Ridge(alpha=alpha)
#     ridge_clf.fit(X_train_clf, cross_Y_train)
#     y_pred = ridge_clf.predict(X_valid_clf)
    
#     RMSE = np.sqrt(mean_squared_error(cross_Y_valid,y_pred))
#     if RMSE < ridge_RMSE:
#         ridge_RMSE = RMSE
#         ridge_alpha = alpha
#         best_ridge = ridge_clf
        
# print "Ridge RMSE: {0} with alpha: {1}".format(ridge_RMSE,ridge_alpha)

In [14]:
with open('final2_classifier_and_regressors.pkl','w') as f:
    pickle.dump((best_logReg, best_RF, best_lasso),f)

In [None]:
# with open('final_classifier_and_regressors.pkl','r') as fopen:
#     best_logReg, best_pcaExtraTrees, pcaExtraTrees_components, best_ridge = pickle.load(fopen)
    
# After opening this, you may need to re-configure the test and training set, that is if you have to restart the kernel

## Train on full training set, run on full test set

Here we will train the classifier and the regressions

In [15]:
def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            f.write(str(i+1) + "," + str(p) + "\n")

### Logistic Regression

In [17]:
%%time
# Train classifier
best_logReg.fit(X_train,Y_train_labels)

# Run classifier on test set
label_pred = best_logReg.predict(X_test)

# Concatenate full training labels to full test set
X_train_clf = adding_labels(X_train,Y_train_labels,max_label)

# Concatenate predicted labels onto test set as a new feature
X_test_clf = adding_labels(X_test,label_pred,max_label)

print "Completed Classification of test set"

Completed Classification of test set
CPU times: user 8.72 s, sys: 24.2 s, total: 32.9 s
Wall time: 40.5 s


### Random Forest

In [18]:
%%time
# Train ExtraTrees Regressor
best_RF.fit(X_train_clf,Y_train)

# Run ExtraTrees Regressor
RF_pred = best_RF.predict(X_test_clf)

# Save the ExtraTrees Predictions
write_to_file("RF_wlogReg_FINAL2_TWK_11Feb.csv", RF_pred)

print "Completed Random Forest Regression"

Completed Random Forest Regression
CPU times: user 34min, sys: 45.3 s, total: 34min 45s
Wall time: 13min 10s


### Lasso/Ridge Regression

In [19]:
# # Train Ridge Regression
# best_ridge.fit(X_full_train_clf,Y_train)

# # Run Ridge Regressor
# ridge_pred = best_ridge.predict(X_test_clf)

# # Save the Ridge Predictions
# write_to_file("ridge_FINAL_TWK_10Feb.csv",ridge_pred)
# print "Completed Ridge Regression"

# Train Lasso Regression
best_lasso.fit(X_train_clf,Y_train)

# Run Lasso Regressor
lasso_pred = best_lasso.predict(X_test_clf)

# Save the Lasso Predictions
write_to_file("lasso_wlogReg_FINAL2_TWK_11Feb.csv",lasso_pred)
print "Completed Lasso Regression"

Completed Lasso Regression
