In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, ElasticNetCV, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from rdkit import DataStructs, Chem
from rdkit.Chem import AllChem, Descriptors, Lipinski, Fragments, rdmolops
from rdkit.ML.Cluster import Butina

In [None]:
"""
Read in training data as Pandas DataFrames
"""
df_train = pd.read_csv("train.csv")


In [None]:
df_train.head()

In [None]:
plt.hist(df_train.gap.values, bins=50)

### Note: The histogram shows some outliers around -1.5. Negative values make no sense, since the gap is computed as highest - lowest, so these entries must be misrecorded. ==> Remove them

In [None]:
# Use Boxplot Rule to detect outliers
def detect_outliers(Y):
    """Flag outliers in Y with the boxplot (1.5 * IQR) rule.

    Y is expected to be a NumPy array. The return value is a boolean array of
    the same shape that is True wherever the value lies below
    Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR (a mask, not a list of positions).
    """
    lower_quartile = np.percentile(Y, 25)
    upper_quartile = np.percentile(Y, 75)
    whisker = 1.5 * (upper_quartile - lower_quartile)
    too_low = Y < (lower_quartile - whisker)
    too_high = Y > (upper_quartile + whisker)
    return too_low | too_high

In [None]:
# Detect outliers
#is_outlier = detect_outliers(df_train.gap.values)
#print "Number of outliers: {}".format(sum(is_outlier))
#is_outlier

In [None]:
#Remove outliers
#df_train = df_train[~is_outlier]

In [None]:
# Remove negative HOMO-LUMO gaps
df_train = df_train[df_train['gap'] > 0]

In [None]:
print len(df_train)

In [None]:
plt.hist(df_train.gap.values, bins=50)

In [None]:
df_train.head()

In [None]:
df_train = df_train.drop(df_train.columns[range(1,257)], axis=1)

In [None]:
df_train.head()

In [None]:
df_train.to_csv('just_smiles_and_gaps_df_train.csv', index=False)

In [None]:
#DataFrame with all train and test examples so we can more easily apply feature engineering on
#df_all = pd.concat((df_train, df_test), axis=0)
#df_all.head()

In [None]:
"""
Example Feature Engineering

this calculates the length of each smile string and adds a feature column with those lengths
Note: this is NOT a good feature and will result in a lower score!
"""
#smiles_len = np.vstack(df_all.smiles.astype(str).apply(lambda x: len(x)))
#df_all['smiles_len'] = pd.DataFrame(smiles_len)


# Additional Feature Engineering
(http://machinelearningmastery.com/discover-feature-engineering-how-to-engineer-features-and-how-to-get-good-at-it/). 

## Number of Branches
The idea is that the number of  [branches](https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system#Branching) influences the gap between HOMO and LUMO levels.

#### Note: We pull the specific compounds of interest from [this poster.](http://digitalscholarship.unlv.edu/cgi/viewcontent.cgi?article=1036&context=focs_ug_research)

## Number of Active Features 
The idea is that if more features are active in a compound, it may result in higher orbital overlap and thus a lower HOMO/LUMO gap.

## Benzene Ring
The last feature determines, from the SMILES encoding, whether or not there is a benzene ring in the compound. Benzene rings are held together with pi bonds, which are more conjugated (which means they are closer together in energy).

## Number of Double, Triple, and Quadruple Bonds

In [None]:
# Just read in the smiles and gaps
df_train = pd.read_csv('just_smiles_and_gaps_df_train.csv')

In [2]:
num_parts = 10

In [None]:
# Cut the training frame into num_parts row-wise chunks and write every chunk
# to its own CSV so the feature generation below can load one chunk at a time.
df_train_parts = np.array_split(df_train, num_parts)
for i, part in enumerate(df_train_parts):
    part_path = 'just_smiles_and_gaps_df_train_part_' + str(i) + '.csv'
    part.to_csv(part_path, index=False)

In [3]:
# Generate the engineered features one chunk at a time: each chunk is read from
# disk, extended with new columns, written back out and released again.
for i in range(num_parts):
    df_train = pd.read_csv('just_smiles_and_gaps_df_train_part_'+str(i)+'.csv')

    # ---- Features read straight off the SMILES text ----
    # Each '(' opens a branch and each '=' is a double bond.
    # The has_* columns are 1 when the literal pattern occurs in the string, else 0.
    # NOTE(review): these are plain substring tests, not substructure searches; a
    # molecule containing the ring system but written differently is not flagged.
    # Experiments left disabled: num_active_feat, has_benzene_ring ('c1ccccc1'),
    # has_thiophene ('c1ccsc1'), has_dibenzothiophene ('c1ccc2c(c1)c3ccccc3s2')
    # and triple / quadruple bond counts (in SMILES '#' is the triple bond and
    # '$' the quadruple bond; the earlier commented-out code had them swapped).
    df_train['num_branches'] = df_train.smiles.apply(lambda s: s.count('('))
    df_train['has_benzothiophene'] = df_train.smiles.apply(lambda s: int('s2c1ccccc1cc2' in s))
    df_train['has_carbazole'] = df_train.smiles.apply(lambda s: int('c1ccc2c(c1)c3ccccc3[nH]2' in s))
    df_train['has_fluorene'] = df_train.smiles.apply(lambda s: int('c1ccc-2c(c1)Cc3c2cccc3' in s))
    df_train['num_double_bonds'] = df_train.smiles.apply(lambda s: s.count('='))
    print("Part {}: finished initial feat gen".format(i))

    # ---- RDKit features ----
    # Generate molecule objects
    molecules = df_train.smiles.apply(Chem.MolFromSmiles)
    # MolFromSmiles returns None for a SMILES it cannot parse. Stop here and name
    # the offending string instead of failing inside a descriptor call later.
    unparsed = molecules.isnull()
    if unparsed.any():
        raise ValueError("Part {}: RDKit could not parse {} SMILES string(s), first: {!r}".format(
            i, int(unparsed.sum()), df_train.smiles[unparsed].iloc[0]))
    print("Part {}: finished molecule generation".format(i))

    # Weights, electron counts, charge and ring count
    df_train['avg_molecular_weight'] = molecules.apply(Descriptors.MolWt)
    print("Part {}: finished first feature".format(i))
    df_train['exact_molecular_weight'] = molecules.apply(Descriptors.ExactMolWt)
    df_train['avg_molecular_weight_ignore_hydrogen'] = molecules.apply(Descriptors.HeavyAtomMolWt)
    df_train['num_valence_electrons'] = molecules.apply(Descriptors.NumValenceElectrons)
    df_train['num_radical_electrons'] = molecules.apply(Descriptors.NumRadicalElectrons)
    df_train['formal_charge'] = molecules.apply(rdmolops.GetFormalCharge)
    # NOTE(review): assumes rdmolops.GetSSSR returns the ring count as a number
    # for the installed RDKit version -- confirm before upgrading RDKit.
    df_train['sssr'] = molecules.apply(rdmolops.GetSSSR)
    print("Part {}: finished Descriptors and rdmolops".format(i))

    # Ring-type counts and sp3 carbon fraction from the Lipinski module
    df_train['fraction_csp3'] = molecules.apply(Lipinski.FractionCSP3)
    df_train['num_aliphatic_carbocycles'] = molecules.apply(Lipinski.NumAliphaticCarbocycles)
    df_train['num_aliphatic_heterocycles'] = molecules.apply(Lipinski.NumAliphaticHeterocycles)
    df_train['num_aliphatic_rings'] = molecules.apply(Lipinski.NumAliphaticRings)
    df_train['num_aromatic_carbocycles'] = molecules.apply(Lipinski.NumAromaticCarbocycles)
    df_train['num_aromatic_heterocycles'] = molecules.apply(Lipinski.NumAromaticHeterocycles)
    df_train['num_aromatic_rings'] = molecules.apply(Lipinski.NumAromaticRings)
    df_train['num_saturated_carbocycles'] = molecules.apply(Lipinski.NumSaturatedCarbocycles)
    df_train['num_saturated_heterocycles'] = molecules.apply(Lipinski.NumSaturatedHeterocycles)
    df_train['num_saturated_rings'] = molecules.apply(Lipinski.NumSaturatedRings)
    print("Part {}: finished Lipinski".format(i))

    # Fragment counts from the Fragments module
    df_train['num_benzene_rings'] = molecules.apply(Fragments.fr_benzene)
    df_train['num_benzodiazepine'] = molecules.apply(Fragments.fr_benzodiazepine)
    df_train['num_thiophene_rings'] = molecules.apply(Fragments.fr_thiophene)
    df_train['num_ketones'] = molecules.apply(Fragments.fr_ketone_Topliss)
    print("Part {}: finished Fragments".format(i))

    df_train.to_csv('rdk_feat_eng_df_train_part_'+str(i)+'.csv', index=False)
    # Release the chunk and its molecule objects before loading the next one
    del df_train
    del molecules
    

Part 0: finished initial feat gen
Part 0: finished molecule generation
Part 0: finished first feature
Part 0: finished Descriptors and rdmolops
Part 0: finished Lipinski
Part 0: finished Fragments
Part 1: finished initial feat gen
Part 1: finished molecule generation
Part 1: finished first feature
Part 1: finished Descriptors and rdmolops
Part 1: finished Lipinski
Part 1: finished Fragments
Part 2: finished initial feat gen
Part 2: finished molecule generation
Part 2: finished first feature
Part 2: finished Descriptors and rdmolops
Part 2: finished Lipinski
Part 2: finished Fragments
Part 3: finished initial feat gen
Part 3: finished molecule generation
Part 3: finished first feature
Part 3: finished Descriptors and rdmolops
Part 3: finished Lipinski
Part 3: finished Fragments
Part 4: finished initial feat gen
Part 4: finished molecule generation
Part 4: finished first feature
Part 4: finished Descriptors and rdmolops
Part 4: finished Lipinski
Part 4: finished Fragments
Part 5: finishe

# Test Data Prep

In [76]:
# Read in test data
df_test = pd.read_csv("test.csv")
df_test.head()

Unnamed: 0,Id,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,1,c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n...,0,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
1,2,[nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1,0,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
2,3,[nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4...,1,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
3,4,[nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c...,1,0,0,0,1,1,1,0,...,0,1,0,0,0,0,0,0,0,0
4,5,c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c...,0,0,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,0


In [77]:
df_test = df_test.drop(df_test.columns[range(2,258)], axis=1)

In [78]:
df_test.head()

Unnamed: 0,Id,smiles
0,1,c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n...
1,2,[nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1
2,3,[nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4...
3,4,[nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c...
4,5,c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c...


In [79]:
# Save df with smiles and ids
df_test.to_csv('just_smiles_and_ids_df_test.csv', index=False)

In [None]:
# Just read in the smiles and ids
df_test = pd.read_csv('just_smiles_and_ids_df_test.csv')

In [80]:
num_parts = 10

In [81]:
# Cut the test frame into num_parts row-wise chunks and write every chunk to
# its own CSV so the feature generation below can load one chunk at a time.
df_test_parts = np.array_split(df_test, num_parts)
for i in range(num_parts):
    part_path = 'just_smiles_and_ids_df_test_part_' + str(i) + '.csv'
    df_test_parts[i].to_csv(part_path, index=False)

In [83]:
# Generate the engineered features for the test data one chunk at a time: each
# chunk is read from disk, extended with new columns, written back out and
# released again.
for i in range(num_parts):
    df_test = pd.read_csv('just_smiles_and_ids_df_test_part_'+str(i)+'.csv')

    # ---- Features read straight off the SMILES text (test data) ----
    # Each '(' opens a branch and each '=' is a double bond.
    # The has_* columns are 1 when the literal pattern occurs in the string, else 0.
    # NOTE(review): these are plain substring tests, not substructure searches; a
    # molecule containing the ring system but written differently is not flagged.
    # Experiments left disabled: num_active_feat, has_benzene_ring ('c1ccccc1'),
    # has_thiophene ('c1ccsc1'), has_dibenzothiophene ('c1ccc2c(c1)c3ccccc3s2')
    # and triple / quadruple bond counts (in SMILES '#' is the triple bond and
    # '$' the quadruple bond; the earlier commented-out code had them swapped).
    df_test['num_branches'] = df_test.smiles.apply(lambda s: s.count('('))
    df_test['has_benzothiophene'] = df_test.smiles.apply(lambda s: int('s2c1ccccc1cc2' in s))
    df_test['has_carbazole'] = df_test.smiles.apply(lambda s: int('c1ccc2c(c1)c3ccccc3[nH]2' in s))
    df_test['has_fluorene'] = df_test.smiles.apply(lambda s: int('c1ccc-2c(c1)Cc3c2cccc3' in s))
    df_test['num_double_bonds'] = df_test.smiles.apply(lambda s: s.count('='))
    print("Part {}: finished initial feat gen".format(i))

    # ---- RDKit features ----
    # Generate molecule objects
    molecules = df_test.smiles.apply(Chem.MolFromSmiles)
    # MolFromSmiles returns None for a SMILES it cannot parse. Stop here and name
    # the offending string instead of failing inside a descriptor call later.
    unparsed = molecules.isnull()
    if unparsed.any():
        raise ValueError("Part {}: RDKit could not parse {} SMILES string(s), first: {!r}".format(
            i, int(unparsed.sum()), df_test.smiles[unparsed].iloc[0]))
    print("Part {}: finished molecule generation".format(i))

    # Weights, electron counts, charge and ring count
    df_test['avg_molecular_weight'] = molecules.apply(Descriptors.MolWt)
    print("Part {}: finished first feature".format(i))
    df_test['exact_molecular_weight'] = molecules.apply(Descriptors.ExactMolWt)
    df_test['avg_molecular_weight_ignore_hydrogen'] = molecules.apply(Descriptors.HeavyAtomMolWt)
    df_test['num_valence_electrons'] = molecules.apply(Descriptors.NumValenceElectrons)
    df_test['num_radical_electrons'] = molecules.apply(Descriptors.NumRadicalElectrons)
    df_test['formal_charge'] = molecules.apply(rdmolops.GetFormalCharge)
    # NOTE(review): assumes rdmolops.GetSSSR returns the ring count as a number
    # for the installed RDKit version -- confirm before upgrading RDKit.
    df_test['sssr'] = molecules.apply(rdmolops.GetSSSR)
    print("Part {}: finished Descriptors and rdmolops".format(i))

    # Ring-type counts and sp3 carbon fraction from the Lipinski module
    df_test['fraction_csp3'] = molecules.apply(Lipinski.FractionCSP3)
    df_test['num_aliphatic_carbocycles'] = molecules.apply(Lipinski.NumAliphaticCarbocycles)
    df_test['num_aliphatic_heterocycles'] = molecules.apply(Lipinski.NumAliphaticHeterocycles)
    df_test['num_aliphatic_rings'] = molecules.apply(Lipinski.NumAliphaticRings)
    df_test['num_aromatic_carbocycles'] = molecules.apply(Lipinski.NumAromaticCarbocycles)
    df_test['num_aromatic_heterocycles'] = molecules.apply(Lipinski.NumAromaticHeterocycles)
    df_test['num_aromatic_rings'] = molecules.apply(Lipinski.NumAromaticRings)
    df_test['num_saturated_carbocycles'] = molecules.apply(Lipinski.NumSaturatedCarbocycles)
    df_test['num_saturated_heterocycles'] = molecules.apply(Lipinski.NumSaturatedHeterocycles)
    df_test['num_saturated_rings'] = molecules.apply(Lipinski.NumSaturatedRings)
    print("Part {}: finished Lipinski".format(i))

    # Fragment counts from the Fragments module
    df_test['num_benzene_rings'] = molecules.apply(Fragments.fr_benzene)
    df_test['num_benzodiazepine'] = molecules.apply(Fragments.fr_benzodiazepine)
    df_test['num_thiophene_rings'] = molecules.apply(Fragments.fr_thiophene)
    df_test['num_ketones'] = molecules.apply(Fragments.fr_ketone_Topliss)
    print("Part {}: finished Fragments".format(i))

    df_test.to_csv('rdk_feat_eng_df_test_part_'+str(i)+'.csv', index=False)
    # Release the chunk and its molecule objects before loading the next one
    del df_test
    del molecules

Part 0: finished initial feat gen
Part 0: finished molecule generation
Part 0: finished first feature
Part 0: finished Descriptors and rdmolops
Part 0: finished Lipinski
Part 0: finished Fragments
Part 1: finished initial feat gen
Part 1: finished molecule generation
Part 1: finished first feature
Part 1: finished Descriptors and rdmolops
Part 1: finished Lipinski
Part 1: finished Fragments
Part 2: finished initial feat gen
Part 2: finished molecule generation
Part 2: finished first feature
Part 2: finished Descriptors and rdmolops
Part 2: finished Lipinski
Part 2: finished Fragments
Part 3: finished initial feat gen
Part 3: finished molecule generation
Part 3: finished first feature
Part 3: finished Descriptors and rdmolops
Part 3: finished Lipinski
Part 3: finished Fragments
Part 4: finished initial feat gen
Part 4: finished molecule generation
Part 4: finished first feature
Part 4: finished Descriptors and rdmolops
Part 4: finished Lipinski
Part 4: finished Fragments
Part 5: finishe

In [84]:
# read in each feat eng part
df_test_parts = [pd.read_csv('rdk_feat_eng_df_test_part_'+str(i)+'.csv') for i in xrange(num_parts)]


In [85]:
# combine into one df
df_test = pd.concat(df_test_parts)

In [86]:
# Read in the original test data (it still has the feat_* columns) so it can be merged with the new features
df_test_old = pd.read_csv("test.csv")

In [87]:
df_test = df_test.merge(df_test_old, on=["smiles"])

In [88]:
del df_test_old
del df_test_parts

In [90]:
df_test

Unnamed: 0,Id_x,smiles,num_branches,has_benzothiophene,has_carbazole,has_fluorene,num_double_bonds,avg_molecular_weight,exact_molecular_weight,avg_molecular_weight_ignore_hydrogen,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,1,c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n...,2,0,0,0,0,409.499,409.045587,398.411,...,0,1,0,0,0,0,0,0,0,0
1,2,[nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1,0,0,0,0,0,352.469,351.991109,344.405,...,0,1,0,0,0,0,0,0,0,0
2,3,[nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4...,1,0,0,0,2,514.569,514.948537,501.465,...,0,1,0,0,0,0,0,0,0,0
3,4,[nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c...,2,0,0,0,4,376.491,376.103190,360.363,...,0,1,0,0,0,0,0,0,0,0
4,5,c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c...,3,0,0,0,0,569.637,569.844956,559.557,...,0,1,0,0,0,0,0,0,0,0
5,6,c1cc2sc3c4[SiH2]C(=Cc4ncc3c2s1)c1cccnc1,1,0,0,0,1,322.490,322.005467,312.410,...,0,1,0,0,1,0,0,0,0,0
6,7,[nH]1cccc1-c1cc2c3c[nH]cc3c3c4CC=Cc4[nH]c3c2c2...,0,0,0,0,1,349.393,349.121512,334.273,...,0,1,0,0,0,0,0,0,0,0
7,8,C1=CC=C([SiH2]1)c1cc2ncc3c4cnccc4c4=CCC=c4c3c2...,1,0,0,0,4,388.502,388.103190,372.374,...,0,1,0,0,1,0,0,0,0,0
8,9,c1sc(-c2sc(-c3ccc(cc3)-c3scc4nccnc34)c3cc[se]c...,3,0,0,0,0,531.549,531.938931,519.453,...,0,1,0,0,0,0,0,0,0,0
9,10,[nH]1c(cc2c3c[nH]cc3c3c4occc4ncc3c12)-c1scc2[n...,1,0,0,0,0,368.421,368.073182,356.325,...,0,1,0,0,0,0,0,0,0,0


In [91]:
# The merge above joined on "smiles" only. If the same SMILES string occurs
# under several Ids, the join emits every pairing of those Ids; only the rows
# whose two Id columns agree are genuine, so keep just those. With unique
# SMILES this filter removes nothing.
df_test = df_test[df_test['Id_x'] == df_test['Id_y']]
# Put the rows in ascending Id order (stable sort): write_to_file numbers the
# predictions 1..N by position, so row order must match the Ids.
df_test = df_test.iloc[np.argsort(df_test['Id_y'].values, kind='mergesort')]
# Keep a single Id column under its original name and save the merged frame
df_test = df_test.drop(['Id_x'],axis=1)
df_test = df_test.rename(columns = {'Id_y':'Id'})
df_test.to_csv('rdk_feat_eng_whole_df_test_orig_features.csv', index=False)

# Partition Training Data: Train model on part, validate on part

In [None]:
# Read in feature engineered data without outliers
#df_train = pd.read_csv("feat_eng_df_train_no_outliers.csv")

In [39]:
df_train_parts = [pd.read_csv('rdk_feat_eng_df_train_part_'+str(i)+'.csv') for i in xrange(num_parts)]

In [47]:
df_train = pd.concat(df_train_parts)

In [48]:
#df_train = df_train[df_train.columns[range(0,14)+range(15,29)]]

In [49]:
df_train.iloc[0]

smiles                                  c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...
gap                                                                                  1.19
num_branches                                                                            3
has_benzothiophene                                                                      0
has_carbazole                                                                           0
has_fluorene                                                                            0
num_double_bonds                                                                        0
avg_molecular_weight                                                              470.462
exact_molecular_weight                                                            470.907
avg_molecular_weight_ignore_hydrogen                                               461.39
num_valence_electrons                                                                 130
num_radica

In [43]:
df_train.to_csv('rdk_feat_eng_whole_df_train.csv', index=False)

In [53]:
len(df_train)

999997

In [52]:
#Read in old training data and merge with new features
df_train_old = pd.read_csv("train.csv")

In [62]:
df_train = df_train.merge(df_train_old, on=["smiles"])

In [63]:
df_train.head()

Unnamed: 0,smiles,gap_x,num_branches,has_benzothiophene,has_carbazole,has_fluorene,num_double_bonds,avg_molecular_weight,exact_molecular_weight,avg_molecular_weight_ignore_hydrogen,...,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap_y
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,1.19,3,0,0,0,0,470.462,470.907296,461.39,...,1,0,0,0,0,0,0,0,0,1.19
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1.6,1,0,0,0,5,352.545,352.085202,336.417,...,1,0,0,1,0,0,0,0,0,1.6
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1.49,2,0,0,0,1,399.576,399.032016,386.472,...,1,0,0,0,1,0,0,0,0,1.49
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1.36,1,0,0,0,4,379.567,379.084867,362.431,...,1,0,0,0,1,0,0,0,0,1.36
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,1.98,1,0,0,0,0,396.391,396.042944,388.327,...,1,0,0,0,0,0,0,0,0,1.98


In [64]:
# Both merged frames carried a gap column; keep the one that came from
# train.csv under the original name, then save the merged frame.
df_train = (df_train
            .drop(['gap_x'], axis=1)
            .rename(columns={'gap_y': 'gap'}))
df_train.to_csv('rdk_feat_eng_whole_df_train_orig_features.csv', index=False)

In [65]:
del df_train_old
del df_train_parts

In [93]:
df_train = pd.read_csv('rdk_feat_eng_whole_df_train_orig_features.csv')

In [94]:
df_train.head()

Unnamed: 0,smiles,num_branches,has_benzothiophene,has_carbazole,has_fluorene,num_double_bonds,avg_molecular_weight,exact_molecular_weight,avg_molecular_weight_ignore_hydrogen,num_valence_electrons,...,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,3,0,0,0,0,470.462,470.907296,461.39,130,...,1,0,0,0,0,0,0,0,0,1.19
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1,0,0,0,5,352.545,352.085202,336.417,118,...,1,0,0,1,0,0,0,0,0,1.6
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,2,0,0,0,1,399.576,399.032016,386.472,128,...,1,0,0,0,1,0,0,0,0,1.49
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1,0,0,0,4,379.567,379.084867,362.431,128,...,1,0,0,0,1,0,0,0,0,1.36
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,1,0,0,0,0,396.391,396.042944,388.327,136,...,1,0,0,0,0,0,0,0,0,1.98


In [95]:
# Pull the regression target out before its column is removed from the frame
Y_train = df_train['gap'].values

# Everything except the SMILES text and the target is a model input
df_train = df_train.drop(['smiles', 'gap'], axis=1)
X_train = df_train.values
print("Train features: {}".format(X_train.shape))
print("Train gap: {}".format(Y_train.shape))

Train features: (999997, 282)
Train gap: (999997,)


In [68]:
print X_train[0]

[   3.            0.            0.            0.            0.          470.462
  470.90729621  461.39        130.            0.            0.            6.
    0.            0.            0.            0.            0.            6.
    6.            0.            0.            0.            0.            0.
    2.            0.            0.            0.            0.            0.
    1.            0.            1.            0.            0.            0.
    0.            0.            0.            0.            0.            0.
    0.            0.            0.            0.            0.            0.
    0.            0.            0.            0.            0.            0.
    0.            0.            0.            0.            0.            0.
    0.            0.            0.            0.            0.            0.
    0.            0.            0.            0.            0.            0.
    0.            0.            0.            0.            0.           

In [96]:
# Partition Training Data into Training, Validation
cross_X_train, cross_X_valid, cross_Y_train, cross_Y_valid = train_test_split(X_train, Y_train, test_size=0.33, random_state=42)

In [97]:
# Fit Linear Regression to cross_X_train and validate it on validations set
LR = LinearRegression()
LR.fit(cross_X_train, cross_Y_train)
LR_pred = LR.predict(cross_X_valid)

In [98]:
# Collect the positions of the coefficients that the fit left at exactly zero
zero_coefs = [i for i, coef in enumerate(LR.coef_) if coef == 0]
print("Number of coefficients that are zero: {}".format(len(zero_coefs)))
print("Total number of coefficients: {}".format(len(LR.coef_)))
print(zero_coefs)

Number of coefficients that are zero: 198
Total number of coefficients: 282
[58, 59, 60, 61, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 95, 96, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 113, 114, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 145, 146, 147, 149, 150, 152, 153, 154, 155, 156, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 199, 200, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 213, 214, 215, 216, 217, 218, 219, 220, 222, 223, 226, 227, 228, 229, 230, 231, 232, 234, 235, 236, 237, 238, 239, 240, 241, 242, 244, 245, 246, 247, 248, 249, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 269, 270, 271, 272, 

In [99]:
print LR_pred

[ 1.60670809  2.18476693  1.86685035 ...,  1.8073974   1.79477373
  2.0821052 ]


In [107]:
mean_squared_error(cross_Y_valid, LR_pred)**.5

0.22342898938675251

In [101]:
lasso_est = Lasso()
lasso_est.fit(cross_X_train, cross_Y_train)
lasso_pred = lasso_est.predict(cross_X_valid)

In [106]:
mean_squared_error(cross_Y_valid, lasso_pred)**.5

0.36328567992445132

In [None]:
df_train.head()

### Results
Raw Data (no added features):

    Validation RMSE:
    
        Without filtering outliers: 0.089523360956744222

        Filtering outliers: 0.088030185268268696
    
Using Feature Engineering (from Taylor):

    Validation RMSE:
        
        Without filtering outliers: 0.073494468814734482
        
        Filtering outliers: 0.072327859031385167
        
Using RDK Feature Eng: .0499

In [108]:
# Remove cross validation datasets
del cross_X_train, cross_X_valid, cross_Y_train, cross_Y_valid

# Train LR model using entire training dataset (feat eng, without outliers)

In [None]:
# Fit Linear Regression on the entire training set (X_train, Y_train); no validation split is held out here
LR2 = LinearRegression()
LR2.fit(X_train, Y_train)

In [75]:
# Free the feature DataFrame only. X_train and Y_train must stay defined:
# the cells below still read them (the len() check, the PCA / Extra Trees
# validation split and the final PCA + Extra Trees fit).
del df_train

In [None]:
print len(X_train), len(Y_train)

## Predict on Test Data

In [109]:
# Load the saved feature engineered test data and preview the first rows
df_test = pd.read_csv('rdk_feat_eng_whole_df_test_orig_features.csv')
df_test.head()

Unnamed: 0,smiles,num_branches,has_benzothiophene,has_carbazole,has_fluorene,num_double_bonds,avg_molecular_weight,exact_molecular_weight,avg_molecular_weight_ignore_hydrogen,num_valence_electrons,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,c1sc(-c2cnc3c(c2)c2nsnc2c2cc4cccnc4cc32)c2cc[n...,2,0,0,0,0,409.499,409.045587,398.411,136,...,0,1,0,0,0,0,0,0,0,0
1,[nH]1cccc1-c1cc2c3nsnc3c3c4sccc4[nH]c3c2s1,0,0,0,0,0,352.469,351.991109,344.405,110,...,0,1,0,0,0,0,0,0,0,0
2,[nH]1c2cc(-c3ccc[se]3)c3nsnc3c2c2c3cscc3c3ccc4...,1,0,0,0,2,514.569,514.948537,501.465,146,...,0,1,0,0,0,0,0,0,0,0
3,[nH]1c(cc2cnc3c(c12)c1=C[SiH2]C=c1c1ccc2=CCC=c...,2,0,0,0,4,376.491,376.10319,360.363,132,...,0,1,0,0,0,0,0,0,0,0
4,c1sc(-c2sc(-c3sc(-c4scc5[se]ccc45)c4ccoc34)c3c...,3,0,0,0,0,569.637,569.844956,559.557,154,...,0,1,0,0,0,0,0,0,0,0


In [110]:
# Keep only the feature columns: remove the 'Id' and 'smiles' columns in one
# step, then take the remaining values as the test matrix.
df_test = df_test.drop(['Id', 'smiles'], axis=1)
X_test = df_test.values


In [None]:
X_test[-1]

In [None]:
LR_pred_test = LR2.predict(X_test)

In [None]:
print LR_pred_test[:50]

In [None]:
print LR2.coef_

In [114]:
def write_to_file(filename, predictions):
    """Write predictions to `filename` as a two-column CSV.

    The first line is the header "Id,Prediction". Each following line holds a
    1-based row number (the position of the prediction in `predictions`) and
    the prediction converted with str().
    """
    with open(filename, "w") as out:
        out.write("Id,Prediction\n")
        out.writelines(
            ",".join((str(row_id), str(value))) + "\n"
            for row_id, value in enumerate(predictions, 1)
        )

In [None]:
write_to_file("feat_eng_no_outliers_LR_pred.csv", LR_pred_test)
#write_to_file("sample2.csv", RF_pred)

# Using PCA and Extra Trees Random Forest Regressor
First, split training to check RMSE of validation set

In [None]:
# Partition Training Data into Training, Validation
# Uses the same fixed seed as the Linear Regression split earlier in the
# notebook, so the hold-out set is reproducible and the validation scores of
# the different models are computed on the same rows.
cross_X_train, cross_X_valid, cross_Y_train, cross_Y_valid = train_test_split(X_train, Y_train, test_size=0.33, random_state=42)

In [103]:
# Reduce the features to 60 principal components, then fit Extra Trees on them.
# The forest is seeded so that re-running the cell gives the same validation score.
pca = PCA(n_components=60)
extraTrees_pca = ExtraTreesRegressor(n_estimators=100, n_jobs=2, random_state=42)

# Fit the projection on the training split only and reuse it for the validation split
cross_X_train_transf = pca.fit_transform(cross_X_train)
cross_X_valid_transf = pca.transform(cross_X_valid)

tree_est_wPCA = extraTrees_pca.fit(cross_X_train_transf, cross_Y_train)

pca_exTree_pred = tree_est_wPCA.predict(cross_X_valid_transf)

In [105]:
# Calculate RMSE
mean_squared_error(cross_Y_valid, pca_exTree_pred)**.5

0.15722361839647497

In [None]:
# Extra Trees on the untransformed features (no PCA); seeded so the validation
# score can be reproduced on a re-run.
extraTrees = ExtraTreesRegressor(n_estimators=300, n_jobs=2, random_state=42)
tree_est = extraTrees.fit(cross_X_train, cross_Y_train)
exTree_pred = tree_est.predict(cross_X_valid)

In [None]:
# Calculate RMSE: mean_squared_error returns the MSE, so take its square root
# (the other validation cells in this notebook do the same).
mean_squared_error(cross_Y_valid, exTree_pred)**.5

## RMSE
No Outliers, feature eng, pca (60 components), extra trees (100 estimators): 0.052341766823921068

No Outliers, feature eng, extra trees (100 estimators):

# Train RF, PCA model with entire training dataset and predict test set


In [111]:
# Final model: project the full training set onto 60 principal components, fit
# Extra Trees on the projection and predict the test set. The forest is seeded
# so that the submission can be regenerated exactly.
pca = PCA(n_components=60)
extraTrees_pca = ExtraTreesRegressor(n_estimators=100, n_jobs=2, random_state=42)

# Fit the projection on the training data only and apply the same projection to the test data
X_transf = pca.fit_transform(X_train)
X_test_transf = pca.transform(X_test)

tree_est_wPCA = extraTrees_pca.fit(X_transf, Y_train)

pca_exTree_pred = tree_est_wPCA.predict(X_test_transf)

In [112]:
pca_exTree_pred[:50]

array([ 1.80083333,  2.00329196,  1.412     ,  1.36333333,  1.53403509,
        2.22      ,  2.05479929,  1.855     ,  1.17818182,  2.260625  ,
        2.13      ,  2.14      ,  2.205     ,  1.634     ,  2.24315789,
        2.1625    ,  2.3590566 ,  2.03833333,  1.88      ,  1.88782609,
        2.22333333,  2.034     ,  1.87875   ,  1.98      ,  1.95      ,
        2.0925    ,  1.3046988 ,  2.08416667,  2.44653846,  2.4020405 ,
        1.614     ,  2.20041667,  1.7825    ,  1.53666667,  2.29      ,
        2.34571429,  1.75790476,  1.73      ,  1.88769231,  2.045     ,
        2.05785714,  2.18833333,  2.1225    ,  1.452     ,  1.81694444,
        1.415     ,  2.10564516,  1.485     ,  2.35422535,  1.7504    ])

In [115]:
write_to_file("pcaExtraTree_60comp_sam_rdk_feats.csv", pca_exTree_pred)