In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import rdkit
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import GradientBoostingRegressor
#from rdkit import Chem

In [2]:
"""
Read in train and test as Pandas DataFrames
"""
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [3]:
print df_train.head()

                                              smiles  feat_001  feat_002  \
0  c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...       0.0       0.0   
1  C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...       1.0       0.0   
2  [nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...       1.0       0.0   
3  [nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...       1.0       0.0   
4     c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1       0.0       0.0   

   feat_003  feat_004  feat_005  feat_006  feat_007  feat_008  feat_009  ...   \
0       0.0       0.0       1.0       0.0       1.0       0.0       0.0  ...    
1       0.0       0.0       1.0       0.0       1.0       0.0       0.0  ...    
2       0.0       0.0       1.0       1.0       1.0       0.0       0.0  ...    
3       0.0       0.0       1.0       1.0       1.0       0.0       0.0  ...    
4       0.0       0.0       1.0       0.0       1.0       0.0       0.0  ...    

   feat_248  feat_249  feat_250  feat_251  feat_252  fea

In [4]:
print df_test.head()

   Id                                             smiles  feat_001  feat_002  \
0   1  c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n...       0.0       0.0   
1   2         [nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1       0.0       0.0   
2   3  [nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4...       1.0       0.0   
3   4  [nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c...       1.0       0.0   
4   5  c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c...       0.0       0.0   

   feat_003  feat_004  feat_005  feat_006  feat_007  feat_008    ...     \
0       0.0       0.0       1.0       1.0       1.0       0.0    ...      
1       0.0       0.0       1.0       1.0       1.0       0.0    ...      
2       0.0       0.0       1.0       1.0       1.0       0.0    ...      
3       0.0       0.0       1.0       1.0       1.0       0.0    ...      
4       0.0       0.0       1.0       0.0       1.0       0.0    ...      

   feat_247  feat_248  feat_249  feat_250  feat_251  feat_252  feat_

In [6]:
#row where testing examples start
test_idx = df_train.shape[0]
val_len = 100000
#store gap values
yvals = df_train.gap.values
yvals = yvals.astype('float')
Y_train = yvals[:(test_idx-val_len)]
Y_val = yvals[(test_idx-val_len):test_idx]
#delete 'Id' column
df_test = df_test.drop(['Id'], axis=1)
#delete 'gap' column
df_train = df_train.drop(['gap'], axis=1)

In [7]:
#DataFrame with all train and test examples so we can more easily apply feature engineering on
df_all = pd.concat((df_train, df_test), axis=0)
df_all.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
def write_smiles(filename, smiles):
    with open(filename, "w") as f:
        f.write("smiles\n")
        for idx, element in enumerate(smiles):
            f.write(element + "\n")

In [9]:
#store smiles values
all_smiles = df_all.smiles.values
write_smiles("smiles.csv", all_smiles)

In [10]:
df_all = df_all.loc[:, (df_all != 0).any(axis=0)]
df_all.head()

Unnamed: 0,smiles,feat_001,feat_005,feat_006,feat_007,feat_025,feat_037,feat_044,feat_068,feat_069,...,feat_199,feat_200,feat_208,feat_218,feat_225,feat_226,feat_243,feat_248,feat_251,feat_252
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [107]:
"""
Example Feature Engineering

this calculates the length of each smile string and adds a feature column with those lengths
Note: this is NOT a good feature and will result in a lower score!
"""
#smiles_len = np.vstack(df_all.smiles.astype(str).apply(lambda x: len(x)))
#df_all['smiles_len'] = pd.DataFrame(smiles_len)


'\nExample Feature Engineering\n\nthis calculates the length of each smile string and adds a feature column with those lengths\nNote: this is NOT a good feature and will result in a lower score!\n'

In [11]:
#Drop the 'smiles' column
df_all = df_all.drop(['smiles'], axis=1)

In [12]:
#create the training, validation, and test sets
vals = df_all.values
X_train = vals[:(test_idx-val_len)]
X_val = vals[(test_idx-val_len):test_idx]
X_test = vals[test_idx:]
print "Train features:", X_train.shape
print "Validation features:", X_val.shape
print "Train gap:", Y_train.shape
print "Validation gap:", Y_val.shape
print "Test features:", X_test.shape

Train features: (900000, 31)
Validation features: (100000, 31)
Train gap: (900000,)
Validation gap: (100000,)
Test features: (824230, 31)


In [14]:
LR = LinearRegression()
LR.fit(X_train, Y_train)
Y_LR = LR.predict(X_val)
print mean_squared_error(Y_val,Y_LR)
LR_pred = LR.predict(X_test)

0.0905474359092


In [15]:
RF = RandomForestRegressor()
RF.fit(X_train, Y_train)
Y_RF = RF.predict(X_val)
print mean_squared_error(Y_val,Y_RF)
RF_pred = RF.predict(X_test)

0.0754185281925


In [16]:
lasso = LassoCV()
lasso.fit(X_train, Y_train)
Y_lasso = lasso.predict(X_val)
print mean_squared_error(Y_val,Y_lasso)
lasso_pred = lasso.predict(X_test)

0.0905532104886


In [17]:
rcv = RidgeCV()
rcv.fit(X_train, Y_train)
Y_rcv = rcv.predict(X_val)
print mean_squared_error(Y_val,Y_rcv)
rcv_pred = rcv.predict(X_test)

0.0905462157332


In [18]:
eln = ElasticNet()
eln.fit(X_train, Y_train)
Y_eln = rcv.predict(X_val)
print mean_squared_error(Y_val,Y_eln)
eln_pred = eln.predict(X_test)

0.0905462157332


In [19]:
mlp = MLPRegressor()
mlp.fit(X_train, Y_train)
Y_mlp = mlp.predict(X_val)
print mean_squared_error(Y_val,Y_mlp)
mlp_pred = mlp.predict(X_test)

0.0777140439608


In [20]:
br = BaggingRegressor()
br.fit(X_train, Y_train)
Y_br = br.predict(X_val)
print mean_squared_error(Y_val,Y_br)
br_pred = br.predict(X_test)

0.0754028369724


In [21]:
boost = GradientBoostingRegressor()
boost.fit(X_train, Y_train)
Y_boost = boost.predict(X_val)
print mean_squared_error(Y_val,Y_boost)
boost_pred = boost.predict(X_test)

0.0828431274974


In [None]:
#svr = SVR()
#svr.fit(X_train, Y_train)
#Y_svr = svr.predict(X_val)
#print mean_squared_error(Y_val,Y_svr)
#svr_pred = svr.predict(X_test)

In [16]:
#gnb = GaussianNB()
#gnb.fit(X_train[0:25,:], Y_train[0:25])
#Y_gnb = gnb.predict(X_val)
#print mean_squared_error(Y_val,Y_gnb)
#gnb_pred = gnb.predict(X_test)

In [21]:
#gpr = GaussianProcessRegressor()
#gpr.fit(X_train, Y_train)
#Y_gpr = gpr.predict(X_val)
#print mean_squared_error(Y_val,Y_gpr)
#gpr_pred = gpr.predict(X_test)

In [112]:
def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            f.write(str(i+1) + "," + str(p) + "\n")

In [113]:
write_to_file("LR.csv", LR_pred)
write_to_file("RF.csv", RF_pred)
write_to_file("lassocv.csv", lasso_pred)
write_to_file("rcv.csv", rcv_pred)
write_to_file("eln.csv", eln_pred)
write_to_file("mlp.csv", mlp_pred)
write_to_file("br.csv", br_pred)
write_to_file("boost.csv", boost_pred)
#write_to_file("svr.csv", svr_pred)
#write_to_file("gnb.csv", gnb_pred)
#write_to_file("gpr.csv", gpr_pred)

In [9]:
small_df_train = df_train.loc[:, (df_train != 0).any(axis=0)]
small_df_train.head()

Unnamed: 0,smiles,feat_001,feat_005,feat_006,feat_007,feat_025,feat_037,feat_044,feat_068,feat_069,...,feat_199,feat_200,feat_208,feat_218,feat_225,feat_226,feat_243,feat_248,feat_251,feat_252
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [17]:
small_df_test = df_test.loc[:, (df_test != 0).any(axis=0)]
small_df_test.head()

Unnamed: 0,smiles,feat_001,feat_005,feat_006,feat_007,feat_025,feat_037,feat_044,feat_068,feat_069,...,feat_199,feat_200,feat_208,feat_218,feat_225,feat_226,feat_243,feat_248,feat_251,feat_252
0,c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
1,[nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
2,[nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
3,[nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
4,c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [50]:
small_df_all = df_all.loc[:, (df_all != 0).any(axis=0)]
a = small_df_all.head()

In [22]:
test_idx

1000000

In [68]:
a.iloc[0:25,-25:]

Unnamed: 0,feat_044,feat_068,feat_069,feat_072,feat_087,feat_090,feat_102,feat_119,feat_123,feat_126,...,feat_199,feat_200,feat_208,feat_218,feat_225,feat_226,feat_243,feat_248,feat_251,feat_252
0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
2,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
3,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
