In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, LassoLarsCV, LassoCV, ElasticNetCV, SGDRegressor
from sklearn.linear_model import Ridge, BayesianRidge, LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split, KFold
from sklearn.metrics import mean_squared_error

In [3]:
# Load the training table from disk and preview its first rows.
TRAIN_CSV_PATH = 'sam_data/rdk_feat_eng_whole_df_train_orig_features.csv'
df_train = pd.read_csv(TRAIN_CSV_PATH)
df_train.head()

Unnamed: 0,smiles,num_branches,has_benzothiophene,has_carbazole,has_fluorene,num_double_bonds,avg_molecular_weight,exact_molecular_weight,avg_molecular_weight_ignore_hydrogen,num_valence_electrons,...,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,3,0,0,0,0,470.462,470.907296,461.39,130,...,1,0,0,0,0,0,0,0,0,1.19
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1,0,0,0,5,352.545,352.085202,336.417,118,...,1,0,0,1,0,0,0,0,0,1.6
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,2,0,0,0,1,399.576,399.032016,386.472,128,...,1,0,0,0,1,0,0,0,0,1.49
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1,0,0,0,4,379.567,379.084867,362.431,128,...,1,0,0,0,1,0,0,0,0,1.36
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,1,0,0,0,0,396.391,396.042944,388.327,136,...,1,0,0,0,0,0,0,0,0,1.98


In [4]:
# Standardize the non-binary (count / continuous) columns with StandardScaler;
# the remaining indicator columns are left untouched.
non_binary_cols = [
    'num_branches', 'avg_molecular_weight', 'exact_molecular_weight',
    'avg_molecular_weight_ignore_hydrogen', 'num_valence_electrons',
    'num_radical_electrons', 'formal_charge', 'sssr', 'fraction_csp3',
    'num_aliphatic_carbocycles', 'num_aliphatic_heterocycles',
    'num_aliphatic_rings', 'num_aromatic_carbocycles',
    'num_aromatic_heterocycles', 'num_aromatic_rings',
    'num_saturated_heterocycles', 'num_saturated_rings',
    'num_benzene_rings', 'num_benzodiazepine', 'num_thiophene_rings',
    'num_ketones',
]

# Fit ONE scaler on the whole 2-D block of columns instead of a throw-away
# scaler per column via DataFrame.apply. StandardScaler standardizes each
# column independently, so the numbers are the same, but:
#   * the estimator receives 2-D input (a 1-D Series per column is rejected
#     by newer scikit-learn releases), and
#   * the fitted scaler is kept, so the identical transform can later be
#     applied to other data with feature_scaler.transform(...).
# NOTE(review): the scaler is fit on the full frame before the
# train/validation split performed in a later cell, so validation rows
# contribute to the scaling statistics.
feature_scaler = StandardScaler()
df_train[non_binary_cols] = feature_scaler.fit_transform(df_train[non_binary_cols])



In [5]:
# Build the model inputs from the frame loaded above: `gap` is the target and
# every remaining column except the raw `smiles` string is a feature.
# NOTE(review): the 'Unnamed: 0' column shown by df_train.head() is not
# dropped here, so it is carried into X_train as a feature. It looks like a
# saved row index -- confirm whether it should be excluded.

# Store gap values
Y_train = df_train['gap'].values

# Remove the SMILES strings and the target in a single pass; afterwards
# df_train holds only the feature columns.
df_train = df_train.drop(['smiles', 'gap'], axis=1)
X_train = df_train.values

# Parenthesised, %-formatted prints emit the same text on Python 2 and 3.
print("Train features: %s" % (X_train.shape,))
print("Train gap: %s" % (Y_train.shape,))

Train features: (999997, 282)
Train gap: (999997,)


In [6]:
# Hold out 40% of the rows as a validation set; the fixed random_state keeps
# the partition reproducible between runs.
cross_X_train, cross_X_valid, cross_Y_train, cross_Y_valid = train_test_split(
    X_train,
    Y_train,
    test_size=0.4,
    random_state=42,
)

In [7]:
# Classification labels: each gap value rounded to an integer-valued float.
# (ndarray.round sends exact halves to the nearest even value.)
Y_clf_train = cross_Y_train.round()
Y_clf_valid = cross_Y_valid.round()

In [None]:
# Report the sizes of the two partitions. The "%s %s" form reproduces the
# exact text of the old `print label, shape` statements (including the
# double space) while also being valid on Python 3.
print("%s %s" % ("'Training' features: ", cross_X_train.shape))
print("%s %s" % ("'Validate' features: ", cross_X_valid.shape))

'Training' features:  (599998, 282)
'Validate' features:  (399999, 282)


In [None]:
# Append the rounded-gap class label as one extra, final column of the
# training feature matrix. np.column_stack treats the 1-D label vector as a
# column, giving the same values and shape as the previous
# transpose -> vstack -> transpose round trip.
tester = np.column_stack((cross_X_train, Y_clf_train))
print(tester.shape)

# GOAL:

This notebook is set up to chain together classification and regression methods. The idea is that, once both models are trained, we first apply a classifier to the data (in a clustering kind of sense) and then use the category or neighborhood assigned to each sample as an additional feature for the regression. Here the category or label is the closest integer to the gap value. The idea behind this is to hijack the regression into a local region of the expected HOMO-LUMO gap based on the label. The hope is that this will pin the regressor closer to the right value.

It's imperative that we get as accurate a classifier as we can.

Fingers crossed.

### First: Let's build a classifier that will adequately label the samples

We'll start with Logistic Regression and try to fit the best model using cross-validation (`LogisticRegressionCV`).

In [None]:
%%time

clf_logReg=LogisticRegressionCV(n_jobs=2)
clf_logReg.fit(cross_X_train,Y_clf_train)

In [None]:
# Score the fitted classifier on both partitions. The value reported as
# "Test Accuracy" is computed on the validation split (cross_X_valid).
logReg_training_acc = clf_logReg.score(cross_X_train, Y_clf_train)
logReg_test_acc = clf_logReg.score(cross_X_valid, Y_clf_valid)

# Parenthesised prints produce the same text on Python 2 and 3.
print("Training Accuracy: %0.3f" % logReg_training_acc)
print("Test Accuracy: %0.3f" % logReg_test_acc)