In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA

In [2]:
"""
Read in train and test as Pandas DataFrames
"""
df_train = pd.read_csv("train.csv").sample(frac=0.65)
df_test = pd.read_csv("test.csv")

In [3]:
df_train.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
793468,c1sc(c2C=CCc12)-c1cc2cnc3cc4ccccc4cc3c2cn1,1,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,2.32
249237,c1ccc([se]1)-c1sc(-c2sc(-c3scc4occc34)c3occc23...,0,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1.62
932377,[nH]1c2ccc3c[SiH2]cc3c2c2c3c[nH]cc3c3-cccn-c3c12,1,0,0,0,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,1.9
435158,c1sc(-c2ncc(s2)-c2ccc(cc2)-c2scc3ccsc23)c2occc12,0,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,2.16
974836,c1cc2[se]c3c(ccc4cc(-c5nccc6nsnc56)c5c[nH]cc5c...,0,0,0,0,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,1.47


In [4]:
df_test.head()

Unnamed: 0,Id,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,1,c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n...,0,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
1,2,[nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1,0,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
2,3,[nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4...,1,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
3,4,[nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c...,1,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
4,5,c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c...,0,0,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,0


In [5]:
#store gap values
Y_train = df_train.gap.values
#row where testing examples start
test_idx = df_train.shape[0]
#delete 'Id' column
df_test = df_test.drop(['Id'], axis=1)
#delete 'gap' column
df_train = df_train.drop(['gap'], axis=1)

In [6]:
#DataFrame with all train and test examples so we can more easily apply feature engineering on
df_all = pd.concat((df_train, df_test), axis=0)
df_all.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
793468,c1sc(c2C=CCc12)-c1cc2cnc3cc4ccccc4cc3c2cn1,1,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
249237,c1ccc([se]1)-c1sc(-c2sc(-c3scc4occc34)c3occc23...,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
932377,[nH]1c2ccc3c[SiH2]cc3c2c2c3c[nH]cc3c3-cccn-c3c12,1,0,0,0,1,1,1,0,0,...,0,1,0,0,0,0,0,0,0,0
435158,c1sc(-c2ncc(s2)-c2ccc(cc2)-c2scc3ccsc23)c2occc12,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
974836,c1cc2[se]c3c(ccc4cc(-c5nccc6nsnc56)c5c[nH]cc5c...,0,0,0,0,1,1,1,0,0,...,0,1,0,0,0,0,0,0,0,0


In [9]:
df_train_featsOnly = df_train.drop(['smiles'], axis=1)
aggSum_feats = df_train_featsOnly.sum(axis=1)
df_

In [15]:
aggSum_feats.median()

13.0

In [16]:
aggSum = df_all.sum(axis=1)

In [17]:
aggSum.median()

13.0

In [None]:
"""
Example Feature Engineering

this calculates the length of each smile string and adds a feature column with those lengths
Note: this is NOT a good feature and will result in a lower score!
"""
#smiles_len = np.vstack(df_all.smiles.astype(str).apply(lambda x: len(x)))
#df_all['smiles_len'] = pd.DataFrame(smiles_len)


## This is a first attempt at some feature engineering. The idea is that the number of  [branches](https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system#Branching) influences the gap between HOMO and LUMO levels.

In [18]:
num_branches = np.vstack(df_all.smiles.astype(str).apply(lambda x: x.count('(')))
df_all['num_branches'] = pd.DataFrame(num_branches)

In [19]:
df_all.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,num_branches
793468,c1sc(c2C=CCc12)-c1cc2cnc3cc4ccccc4cc3c2cn1,1,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
249237,c1ccc([se]1)-c1sc(-c2sc(-c3scc4occc34)c3occc23...,0,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
932377,[nH]1c2ccc3c[SiH2]cc3c2c2c3c[nH]cc3c3-cccn-c3c12,1,0,0,0,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,3
435158,c1sc(-c2ncc(s2)-c2ccc(cc2)-c2scc3ccsc23)c2occc12,0,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
974836,c1cc2[se]c3c(ccc4cc(-c5nccc6nsnc56)c5c[nH]cc5c...,0,0,0,0,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,2


## This feature is aggregating the number of active features. The idea is that if there are more features who are active in a compound, it may result in higher orbital overlap and thus lower HOMO/LUMO gap

In [20]:
num_active_feat = df_all.sum(axis=1)
df_all['num_active_feat'] = pd.DataFrame(num_active_feat)

In [21]:
df_all.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,num_branches,num_active_feat
793468,c1sc(c2C=CCc12)-c1cc2cnc3cc4ccccc4cc3c2cn1,1,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,14
249237,c1ccc([se]1)-c1sc(-c2sc(-c3scc4occc34)c3occc23...,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,10
932377,[nH]1c2ccc3c[SiH2]cc3c2c2c3c[nH]cc3c3-cccn-c3c12,1,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,3,18
435158,c1sc(-c2ncc(s2)-c2ccc(cc2)-c2scc3ccsc23)c2occc12,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,10
974836,c1cc2[se]c3c(ccc4cc(-c5nccc6nsnc56)c5c[nH]cc5c...,0,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,2,15


## We can do the same for double, triple and quadruple bonds

In [22]:
num_double_bonds = np.vstack(df_all.smiles.astype(str).apply(lambda x: x.count('=')))
num_triple_bonds = np.vstack(df_all.smiles.astype(str).apply(lambda x: x.count('$')))
num_quad_bonds = np.vstack(df_all.smiles.astype(str).apply(lambda x: x.count('#')))
df_all['num_double_bonds'] = pd.DataFrame(num_double_bonds)
df_all['num_triple_bonds'] = pd.DataFrame(num_triple_bonds)
df_all['num_quad_bonds'] = pd.DataFrame(num_quad_bonds)

In [23]:
df_all.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_252,feat_253,feat_254,feat_255,feat_256,num_branches,num_active_feat,num_double_bonds,num_triple_bonds,num_quad_bonds
793468,c1sc(c2C=CCc12)-c1cc2cnc3cc4ccccc4cc3c2cn1,1,0,0,0,1,0,1,0,0,...,0,0,0,0,0,1,14,0,0,0
249237,c1ccc([se]1)-c1sc(-c2sc(-c3scc4occc34)c3occc23...,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,1,10,0,0,0
932377,[nH]1c2ccc3c[SiH2]cc3c2c2c3c[nH]cc3c3-cccn-c3c12,1,0,0,0,1,1,1,0,0,...,0,0,0,0,0,3,18,0,0,0
435158,c1sc(-c2ncc(s2)-c2ccc(cc2)-c2scc3ccsc23)c2occc12,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,1,10,2,0,0
974836,c1cc2[se]c3c(ccc4cc(-c5nccc6nsnc56)c5c[nH]cc5c...,0,0,0,0,1,1,1,0,0,...,0,0,0,0,0,2,15,0,0,0


In [24]:
#Drop the 'smiles' column
df_all = df_all.drop(['smiles'], axis=1)
vals = df_all.values
X_train = vals[:test_idx]
X_test = vals[test_idx:]
print "Train features:", X_train.shape
print "Train gap:", Y_train.shape
print "Test features:", X_test.shape

Train features: (650000, 261)
Train gap: (650000,)
Test features: (824230, 261)


In [25]:
LR = LinearRegression()
LR.fit(X_train, Y_train)
LR_pred = LR.predict(X_test)

In [26]:
RF = RandomForestRegressor()
RF.fit(X_train, Y_train)
RF_pred = RF.predict(X_test)

In [27]:
def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            f.write(str(i+1) + "," + str(p) + "\n")

In [28]:
write_to_file("LR_basicFeatEx_TWK_1Feb.csv", LR_pred)
write_to_file("RF_basicFeatEx_TWK_1Feb.csv", RF_pred)

## The following is simply a test to figure out which features actually contribute to the RF. The idea is to downselect the data frame to use only these features and then train an SVM or something along those lines.

In [29]:
a = RF.feature_importances_

In [30]:
feat_list = np.array(['feat_{0:03d}'.format(i) for i in range(1,257)] + ['num_branches','num_active_feat','num_double_bonds','num_triple_bonds','num_quad_bonds'])
feat_list[a > 0]

array(['feat_001', 'feat_005', 'feat_006', 'feat_007', 'feat_025',
       'feat_037', 'feat_044', 'feat_068', 'feat_069', 'feat_072',
       'feat_087', 'feat_090', 'feat_102', 'feat_119', 'feat_123',
       'feat_126', 'feat_132', 'feat_173', 'feat_176', 'feat_187',
       'feat_196', 'feat_199', 'feat_200', 'feat_208', 'feat_218',
       'feat_225', 'feat_226', 'feat_243', 'feat_248', 'feat_251',
       'feat_252', 'num_branches', 'num_active_feat', 'num_double_bonds'], 
      dtype='|S16')

In [None]:
reducedf_RFfeatimport = df_all[feat_list[a>0]]

## The idea here is that I'll train a type of SVM used for regression purposes a grid search cross-validation on top of it so as to get the best RMSE using that method.

### I'm using the set-up used in HW 3 from data science.

In [None]:
from sklearn.svm import SVR
from sklearn.grid_search import GridSearchCV


In [None]:
svr=SVR()
# Cs=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
# Cs=[0.001, 0.01, 0.1, 1.0]
Cs=[1.0]
kernels = ['linear', 'rbf']
eps = [0.05, 0.1, 0.15, 0.2]
vals = reducedf_RFfeatimport.values
X_train = vals[:test_idx]
X_test = vals[test_idx:]

In [None]:
# parameters = {"C": Cs, 'kernel': kernels, 'epsilon': eps}
parameters = {"C": Cs}
n_jobs, n_folds = 3, 5
gs = GridSearchCV(svr, param_grid=parameters, n_jobs=n_jobs, cv=n_folds)
gs.fit(X_train, Y_train)
print "BEST", gs.best_params_, gs.best_score_, gs.grid_scores_
best = gs.best_estimator_
best = best.fit(X_train, Y_train)

In [None]:
SVR_1stpass = best.predict(X_test)

In [None]:
write_to_file("SVR_1Feb16.csv", SVR_1stpass)