In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessRegressor

In [48]:
"""
Read in train and test as Pandas DataFrames
"""
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [13]:
print df_train.head()

                                              smiles  feat_001  feat_002  \
0  c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...       0.0       0.0   
1  C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...       1.0       0.0   
2  [nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...       1.0       0.0   
3  [nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...       1.0       0.0   
4     c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1       0.0       0.0   

   feat_003  feat_004  feat_005  feat_006  feat_007  feat_008  feat_009  ...   \
0       0.0       0.0       1.0       0.0       1.0       0.0       0.0  ...    
1       0.0       0.0       1.0       0.0       1.0       0.0       0.0  ...    
2       0.0       0.0       1.0       1.0       1.0       0.0       0.0  ...    
3       0.0       0.0       1.0       1.0       1.0       0.0       0.0  ...    
4       0.0       0.0       1.0       0.0       1.0       0.0       0.0  ...    

   feat_248  feat_249  feat_250  feat_251  feat_252  fea

In [14]:
print df_test.head()

   Id                                             smiles  feat_001  feat_002  \
0   1  c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n...       0.0       0.0   
1   2         [nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1       0.0       0.0   
2   3  [nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4...       1.0       0.0   
3   4  [nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c...       1.0       0.0   
4   5  c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c...       0.0       0.0   

   feat_003  feat_004  feat_005  feat_006  feat_007  feat_008    ...     \
0       0.0       0.0       1.0       1.0       1.0       0.0    ...      
1       0.0       0.0       1.0       1.0       1.0       0.0    ...      
2       0.0       0.0       1.0       1.0       1.0       0.0    ...      
3       0.0       0.0       1.0       1.0       1.0       0.0    ...      
4       0.0       0.0       1.0       0.0       1.0       0.0    ...      

   feat_247  feat_248  feat_249  feat_250  feat_251  feat_252  feat_

In [3]:
#store gap values
Y_train = df_train.gap.values
#row where testing examples start
test_idx = df_train.shape[0]
#delete 'Id' column
df_test = df_test.drop(['Id'], axis=1)
#delete 'gap' column
df_train = df_train.drop(['gap'], axis=1)

In [4]:
#DataFrame with all train and test examples so we can more easily apply feature engineering on
df_all = pd.concat((df_train, df_test), axis=0)
df_all.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
df_all = df_all.loc[:, (df_all != 0).any(axis=0)]
df_all.head()

Unnamed: 0,feat_001,feat_005,feat_006,feat_007,feat_025,feat_037,feat_044,feat_068,feat_069,feat_072,...,feat_199,feat_200,feat_208,feat_218,feat_225,feat_226,feat_243,feat_248,feat_251,feat_252
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
2,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
3,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [7]:
"""
Example Feature Engineering

this calculates the length of each smile string and adds a feature column with those lengths
Note: this is NOT a good feature and will result in a lower score!
"""
#smiles_len = np.vstack(df_all.smiles.astype(str).apply(lambda x: len(x)))
#df_all['smiles_len'] = pd.DataFrame(smiles_len)


'\nExample Feature Engineering\n\nthis calculates the length of each smile string and adds a feature column with those lengths\nNote: this is NOT a good feature and will result in a lower score!\n'

In [16]:
#Drop the 'smiles' column
#df_all = df_all.drop(['smiles'], axis=1)
vals = df_all.values
X_train = vals[:test_idx]
X_test = vals[test_idx:]
print "Train features:", X_train.shape
print "Train gap:", Y_train.shape
print "Test features:", X_test.shape

Train features: (1000000, 31)
Train gap: (1000000,)
Test features: (824230, 31)


In [34]:
LR = LinearRegression()
LR.fit(X_train, Y_train)
Y_LR = LR.predict(X_train)
print mean_squared_error(Y_train,Y_LR)
LR_pred = LR.predict(X_test)

0.0893555463894


In [35]:
RF = RandomForestRegressor()
RF.fit(X_train, Y_train)
Y_RF = RF.predict(X_train)
print mean_squared_error(Y_train,Y_RF)
RF_pred = RF.predict(X_test)

0.0739161346189


In [37]:
lasso = LassoCV()
lasso.fit(X_train, Y_train)
Y_lasso = lasso.predict(X_train)
print mean_squared_error(Y_train,Y_lasso)
lasso_pred = lasso.predict(X_test)

0.0893866426017


In [38]:
rcv = RidgeCV()
rcv.fit(X_train, Y_train)
Y_rcv = rcv.predict(X_train)
print mean_squared_error(Y_train,Y_rcv)
rcv_pred = rcv.predict(X_test)

0.08935533244


In [32]:
eln = ElasticNet()
eln.fit(X_train, Y_train)
Y_eln = rcv.predict(X_train)
print mean_squared_error(Y_train,Y_eln)
eln_pred = eln.predict(X_test)

0.08935533244


In [45]:
mlp = MLPRegressor()
mlp.fit(X_train, Y_train)
Y_mlp = mlp.predict(X_train)
print mean_squared_error(Y_train,Y_mlp)
mlp_pred = mlp.predict(X_test)

0.0755022066031


In [49]:
br = BaggingRegressor()
br.fit(X_train, Y_train)
Y_br = br.predict(X_train)
print mean_squared_error(Y_train,Y_br)
br_pred = br.predict(X_test)

0.073916602188


In [None]:
svr = SVR()
svr.fit(X_train, Y_train)
Y_svr = svr.predict(X_train)
print mean_squared_error(Y_train,Y_svr)
svr_pred = svr.predict(X_test)

In [None]:
gnb = GaussianNB()
gnb.fit(X_train, Y_train)
Y_gnb = gnb.predict(X_train)
print mean_squared_error(Y_train,Y_gnb)
gnb_pred = gnb.predict(X_test)

In [None]:
gpr = GaussianProcessRegressor()
gpr.fit(X_train, Y_train)
Y_gpr = gpr.predict(X_train)
print mean_squared_error(Y_train,Y_gpr)
gpr_pred = gpr.predict(X_test)

In [39]:
def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            f.write(str(i+1) + "," + str(p) + "\n")

In [None]:
write_to_file("LR.csv", LR_pred)
write_to_file("RF.csv", RF_pred)
write_to_file("lassocv.csv", lasso_pred)
write_to_file("rcv.csv", rcv_pred)
write_to_file("eln.csv", eln_pred)
write_to_file("mlp.csv", mlp_pred)
write_to_file("br.csv", br_pred)
write_to_file("svr.csv", svr_pred)
write_to_file("gnb.csv", gnb_pred)
write_to_file("gpr.csv", gpr_pred)

0.0757787757055
