In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, ElasticNetCV, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from rdkit import DataStructs, Chem
from rdkit.Chem import AllChem, Descriptors, Lipinski, Fragments, rdmolops
from rdkit.ML.Cluster import Butina

# *IF FEATURE ENGINEERING IS DONE: DO NOT START HERE*
# *SKIP TO THE START OF SECTION IV: MODEL SELECTION*

# I. Exploratory Data Analysis

In [None]:
# Read in training data
df_train = pd.read_csv("train.csv")


In [None]:
df_train.head()

In [None]:
plt.hist(df_train.gap.values, bins=50)

### Note: 
I see some outliers around -1.5.  Having negative values makes no sense, since the gap is highest - lowest -- they must be misrecorded.  ==> Remove them

In [None]:
# Remove negative HOMO-LUMO gaps
df_train = df_train[df_train['gap'] >= 0]

In [None]:
print len(df_train)

In [None]:
# Plot histogram of gaps without negative values
plt.hist(df_train.gap.values, bins=50)

# II. Training Data Processing and Feature Engineering
1. Drop all columns except smiles and gaps
2. Add "key" column, partition the data, and save those partitions
3. Remove all dataframes from memory
4. Load each partition (1 by 1), perform feature engineering, and save each partition to a csv
5. Reaggregate all partitions into one df, merge add binary-valued features from original training dataset, and save to csv

### 1. Drop all columns except smiles and gaps

In [None]:
# remove all columns except smiles and gaps
df_train = df_train.drop(df_train.columns[range(1,257)], axis=1)
df_train.head()

### 2. Add "key" column, partition the data, and save those partitions

In [None]:
# add key column
df_train['key'] = df_train.index

In [None]:
df_train.head()

In [None]:
num_parts = 10
df_train_parts = np.array_split(df_train, num_parts)
for i in xrange(len(df_train_parts)):
    df_train_parts[i].to_csv('smiles_gaps_keys_df_train_part_'+str(i)+'.csv', index=False)

### 3. Remove all dataframes from memory

In [None]:
del df_train
del df_train_parts

### 4. Load each partition (1 by 1), perform feature engineering, and save each partition to a csv

(http://machinelearningmastery.com/discover-feature-engineering-how-to-engineer-features-and-how-to-get-good-at-it/). 

#### Number of Branches
The idea is that the number of  [branches](https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system#Branching) influences the gap between HOMO and LUMO levels.

#### Note: We pull the specific compounds of intereste from [this poster.](http://digitalscholarship.unlv.edu/cgi/viewcontent.cgi?article=1036&context=focs_ug_research)

#### Benzene Ring
The last feature is determining, from the SMILES encoding whether or not there is a benzene ring in the compound. Benzene rings are held together with pi bonds which are more conjugated (which means they are closer together in energy)

#### Number of Double Bonds

#### Additional Features engineered using from RDKit library

In [None]:
for i in xrange(num_parts):
    df_train_part = pd.read_csv('smiles_gaps_keys_df_train_part_'+str(i)+'.csv')
    
    # Apply feature engineering to train data
    num_branches = df_train_part.smiles.apply(lambda x: x.count('('))
    has_benzothiophene = df_train_part.smiles.apply(lambda x: min(1,x.count('s2c1ccccc1cc2')))
    has_carbazole = df_train_part.smiles.apply(lambda x: min(1,x.count('c1ccc2c(c1)c3ccccc3[nH]2')))
    has_fluorene = df_train_part.smiles.apply(lambda x: min(1,x.count('c1ccc-2c(c1)Cc3c2cccc3')))
    num_double_bonds = df_train_part.smiles.apply(lambda x: x.count('='))
    df_train_part['num_branches'] = num_branches
    df_train_part['has_benzothiophene'] = has_benzothiophene
    df_train_part['has_carbazole'] = has_carbazole
    df_train_part['has_fluorene'] = has_fluorene
    df_train_part['num_double_bonds'] = num_double_bonds
    print "Part {}: finished initial feat gen".format(i)
    # RDKit Feature Engineering
    # Generate molecule objects
    molecules = df_train_part.smiles.apply(lambda x: Chem.MolFromSmiles(x))
    print "Part {}: finished molecule generation".format(i)
    # Generate Features
    df_train_part['avg_molecular_weight'] = molecules.apply(lambda x: Descriptors.MolWt(x))
    print "Part {}: finished first feature".format(i)
    df_train_part['exact_molecular_weight'] = molecules.apply(lambda x: Descriptors.ExactMolWt(x))
    df_train_part['avg_molecular_weight_ignore_hydrogen'] = molecules.apply(lambda x: Descriptors.HeavyAtomMolWt(x))
    df_train_part['num_valence_electrons'] = molecules.apply(lambda x: Descriptors.NumValenceElectrons(x))
    df_train_part['num_radical_electrons'] = molecules.apply(lambda x: Descriptors.NumRadicalElectrons(x))
    df_train_part['formal_charge'] = molecules.apply(lambda x: rdmolops.GetFormalCharge(x))
    df_train_part['sssr'] = molecules.apply(lambda x: rdmolops.GetSSSR(x))
    print "Part {}: finished Descriptors and rdmolops".format(i)
    df_train_part['fraction_csp3'] = molecules.apply(lambda x: Lipinski.FractionCSP3(x))
    df_train_part['num_aliphatic_carbocycles'] = molecules.apply(lambda x: Lipinski.NumAliphaticCarbocycles(x))
    df_train_part['num_aliphatic_heterocycles'] = molecules.apply(lambda x: Lipinski.NumAliphaticHeterocycles(x))
    df_train_part['num_aliphatic_rings'] = molecules.apply(lambda x: Lipinski.NumAliphaticRings(x))
    df_train_part['num_aromatic_carbocycles'] = molecules.apply(lambda x: Lipinski.NumAromaticCarbocycles(x))
    df_train_part['num_aromatic_heterocycles'] = molecules.apply(lambda x: Lipinski.NumAromaticHeterocycles(x))
    df_train_part['num_aromatic_rings'] = molecules.apply(lambda x: Lipinski.NumAromaticRings(x))
    df_train_part['num_saturated_carbocycles'] = molecules.apply(lambda x: Lipinski.NumSaturatedCarbocycles(x))
    df_train_part['num_saturated_heterocycles'] = molecules.apply(lambda x: Lipinski.NumSaturatedHeterocycles(x))
    df_train_part['num_saturated_rings'] = molecules.apply(lambda x: Lipinski.NumSaturatedRings(x))
    df_train_part['num_nh_oh'] = molecules.apply(lambda x: Lipinski.NHOHCount(x))
    df_train_part['num_num_rotatable_bonds'] = molecules.apply(lambda x: Lipinski.NumRotatableBonds(x))
    df_train_part['num_heteroatoms'] = molecules.apply(lambda x: Lipinski.NumHeteroatoms(x))
    df_train_part['num_h_acceptors'] = molecules.apply(lambda x: Lipinski.NumHAcceptors(x))
    df_train_part['num_h_donors'] = molecules.apply(lambda x: Lipinski.NumHDonors(x))
    df_train_part['ring_count'] = molecules.apply(lambda x: Lipinski.RingCount(x))
    print "Part {}: finished Lipinski".format(i)
    # See Parsing_methods_from_rdk_source.ipynb
    df_train_part['fr_Al_COO'] = molecules.apply(lambda x: Fragments.fr_Al_COO(x))
    df_train_part['fr_Al_OH'] = molecules.apply(lambda x: Fragments.fr_Al_OH(x))
    df_train_part['fr_Al_OH_noTert'] = molecules.apply(lambda x: Fragments.fr_Al_OH_noTert(x))
    df_train_part['fr_ArN'] = molecules.apply(lambda x: Fragments.fr_ArN(x))
    df_train_part['fr_Ar_COO'] = molecules.apply(lambda x: Fragments.fr_Ar_COO(x))
    df_train_part['fr_Ar_N'] = molecules.apply(lambda x: Fragments.fr_Ar_N(x))
    df_train_part['fr_Ar_NH'] = molecules.apply(lambda x: Fragments.fr_Ar_NH(x))
    df_train_part['fr_Ar_OH'] = molecules.apply(lambda x: Fragments.fr_Ar_OH(x))
    df_train_part['fr_COO'] = molecules.apply(lambda x: Fragments.fr_COO(x))
    df_train_part['fr_COO2'] = molecules.apply(lambda x: Fragments.fr_COO2(x))
    df_train_part['fr_C_O'] = molecules.apply(lambda x: Fragments.fr_C_O(x))
    df_train_part['fr_C_O_noCOO'] = molecules.apply(lambda x: Fragments.fr_C_O_noCOO(x))
    df_train_part['fr_C_S'] = molecules.apply(lambda x: Fragments.fr_C_S(x))
    df_train_part['fr_HOCCN'] = molecules.apply(lambda x: Fragments.fr_HOCCN(x))
    df_train_part['fr_Imine'] = molecules.apply(lambda x: Fragments.fr_Imine(x))
    df_train_part['fr_NH0'] = molecules.apply(lambda x: Fragments.fr_NH0(x))
    df_train_part['fr_NH1'] = molecules.apply(lambda x: Fragments.fr_NH1(x))
    df_train_part['fr_NH2'] = molecules.apply(lambda x: Fragments.fr_NH2(x))
    df_train_part['fr_N_O'] = molecules.apply(lambda x: Fragments.fr_N_O(x))
    df_train_part['fr_Ndealkylation1'] = molecules.apply(lambda x: Fragments.fr_Ndealkylation1(x))
    df_train_part['fr_Ndealkylation2'] = molecules.apply(lambda x: Fragments.fr_Ndealkylation2(x))
    df_train_part['fr_Nhpyrrole'] = molecules.apply(lambda x: Fragments.fr_Nhpyrrole(x))
    df_train_part['fr_SH'] = molecules.apply(lambda x: Fragments.fr_SH(x))
    df_train_part['fr_aldehyde'] = molecules.apply(lambda x: Fragments.fr_aldehyde(x))
    df_train_part['fr_alkyl_carbamate'] = molecules.apply(lambda x: Fragments.fr_alkyl_carbamate(x))
    df_train_part['fr_alkyl_halide'] = molecules.apply(lambda x: Fragments.fr_alkyl_halide(x))
    df_train_part['fr_allylic_oxid'] = molecules.apply(lambda x: Fragments.fr_allylic_oxid(x))
    df_train_part['fr_amide'] = molecules.apply(lambda x: Fragments.fr_amide(x))
    df_train_part['fr_amidine'] = molecules.apply(lambda x: Fragments.fr_amidine(x))
    df_train_part['fr_aniline'] = molecules.apply(lambda x: Fragments.fr_aniline(x))
    df_train_part['fr_aryl_methyl'] = molecules.apply(lambda x: Fragments.fr_aryl_methyl(x))
    df_train_part['fr_azide'] = molecules.apply(lambda x: Fragments.fr_azide(x))
    df_train_part['fr_azo'] = molecules.apply(lambda x: Fragments.fr_azo(x))
    df_train_part['fr_barbitur'] = molecules.apply(lambda x: Fragments.fr_barbitur(x))
    df_train_part['fr_benzene'] = molecules.apply(lambda x: Fragments.fr_benzene(x))
    df_train_part['fr_benzodiazepine'] = molecules.apply(lambda x: Fragments.fr_benzodiazepine(x))
    df_train_part['fr_bicyclic'] = molecules.apply(lambda x: Fragments.fr_bicyclic(x))
    df_train_part['fr_diazo'] = molecules.apply(lambda x: Fragments.fr_diazo(x))
    df_train_part['fr_dihydropyridine'] = molecules.apply(lambda x: Fragments.fr_dihydropyridine(x))
    df_train_part['fr_epoxide'] = molecules.apply(lambda x: Fragments.fr_epoxide(x))
    df_train_part['fr_ester'] = molecules.apply(lambda x: Fragments.fr_ester(x))
    df_train_part['fr_ether'] = molecules.apply(lambda x: Fragments.fr_ether(x))
    df_train_part['fr_furan'] = molecules.apply(lambda x: Fragments.fr_furan(x))
    df_train_part['fr_guanido'] = molecules.apply(lambda x: Fragments.fr_guanido(x))
    df_train_part['fr_halogen'] = molecules.apply(lambda x: Fragments.fr_halogen(x))
    df_train_part['fr_hdrzine'] = molecules.apply(lambda x: Fragments.fr_hdrzine(x))
    df_train_part['fr_hdrzone'] = molecules.apply(lambda x: Fragments.fr_hdrzone(x))
    df_train_part['fr_imidazole'] = molecules.apply(lambda x: Fragments.fr_imidazole(x))
    df_train_part['fr_imide'] = molecules.apply(lambda x: Fragments.fr_imide(x))
    df_train_part['fr_isocyan'] = molecules.apply(lambda x: Fragments.fr_isocyan(x))
    df_train_part['fr_isothiocyan'] = molecules.apply(lambda x: Fragments.fr_isothiocyan(x))
    df_train_part['fr_ketone'] = molecules.apply(lambda x: Fragments.fr_ketone(x))
    df_train_part['fr_lactam'] = molecules.apply(lambda x: Fragments.fr_lactam(x))
    df_train_part['fr_lactone'] = molecules.apply(lambda x: Fragments.fr_lactone(x))
    df_train_part['fr_methoxy'] = molecules.apply(lambda x: Fragments.fr_methoxy(x))
    df_train_part['fr_morpholine'] = molecules.apply(lambda x: Fragments.fr_morpholine(x))
    df_train_part['fr_nitrile'] = molecules.apply(lambda x: Fragments.fr_nitrile(x))
    df_train_part['fr_nitro'] = molecules.apply(lambda x: Fragments.fr_nitro(x))
    df_train_part['fr_nitro_arom'] = molecules.apply(lambda x: Fragments.fr_nitro_arom(x))
    df_train_part['fr_nitro_arom_nonortho'] = molecules.apply(lambda x: Fragments.fr_nitro_arom_nonortho(x))
    df_train_part['fr_nitroso'] = molecules.apply(lambda x: Fragments.fr_nitroso(x))
    df_train_part['fr_oxazole'] = molecules.apply(lambda x: Fragments.fr_oxazole(x))
    df_train_part['fr_oxime'] = molecules.apply(lambda x: Fragments.fr_oxime(x))
    df_train_part['fr_para_hydroxylation'] = molecules.apply(lambda x: Fragments.fr_para_hydroxylation(x))
    df_train_part['fr_phenol'] = molecules.apply(lambda x: Fragments.fr_phenol(x))
    df_train_part['fr_phenol_noOrthoHbond'] = molecules.apply(lambda x: Fragments.fr_phenol_noOrthoHbond(x))
    df_train_part['fr_phos_acid'] = molecules.apply(lambda x: Fragments.fr_phos_acid(x))
    df_train_part['fr_phos_ester'] = molecules.apply(lambda x: Fragments.fr_phos_ester(x))
    df_train_part['fr_piperdine'] = molecules.apply(lambda x: Fragments.fr_piperdine(x))
    df_train_part['fr_piperzine'] = molecules.apply(lambda x: Fragments.fr_piperzine(x))
    df_train_part['fr_priamide'] = molecules.apply(lambda x: Fragments.fr_priamide(x))
    df_train_part['fr_prisulfonamd'] = molecules.apply(lambda x: Fragments.fr_prisulfonamd(x))
    df_train_part['fr_pyridine'] = molecules.apply(lambda x: Fragments.fr_pyridine(x))
    df_train_part['fr_quatN'] = molecules.apply(lambda x: Fragments.fr_quatN(x))
    df_train_part['fr_sulfide'] = molecules.apply(lambda x: Fragments.fr_sulfide(x))
    df_train_part['fr_sulfonamd'] = molecules.apply(lambda x: Fragments.fr_sulfonamd(x))
    df_train_part['fr_sulfone'] = molecules.apply(lambda x: Fragments.fr_sulfone(x))
    df_train_part['fr_term_acetylene'] = molecules.apply(lambda x: Fragments.fr_term_acetylene(x))
    df_train_part['fr_tetrazole'] = molecules.apply(lambda x: Fragments.fr_tetrazole(x))
    df_train_part['fr_thiazole'] = molecules.apply(lambda x: Fragments.fr_thiazole(x))
    df_train_part['fr_thiocyan'] = molecules.apply(lambda x: Fragments.fr_thiocyan(x))
    df_train_part['fr_thiophene'] = molecules.apply(lambda x: Fragments.fr_thiophene(x))
    df_train_part['fr_unbrch_alkane'] = molecules.apply(lambda x: Fragments.fr_unbrch_alkane(x))
    df_train_part['fr_urea'] = molecules.apply(lambda x: Fragments.fr_urea(x))
    df_train_part['fr_ketone_Topliss'] = molecules.apply(lambda x: Fragments.fr_ketone_Topliss(x))
    
    print "Part {}: finished Fragments".format(i)
    df_train_part.to_csv('rdk_feat_eng_df_train_part_'+str(i)+'.csv', index=False)
    del df_train_part
    del molecules

### 5. Reaggregate all partitions into one df, merge add binary-valued features from original training dataset, and save to csv

In [None]:
# Read in each feature engineered partition
df_train_parts = [pd.read_csv('rdk_feat_eng_df_train_part_'+str(i)+'.csv') for i in xrange(num_parts)]
# concatenate them into one df
df_train = pd.concat(df_train_parts)

In [None]:
# Read in old training
# Note: df_train_old has same as before partitioning
df_train_old = pd.read_csv("train.csv")
# Add key
df_train_old['key'] = df_train_old.index
# merge dataframes on 'key'
df_train = df_train.merge(df_train_old, on=["key"])
# Fix columns
df_train = df_train.drop(['gap_x'],axis=1)
df_train = df_train.drop(['key'],axis=1)
df_train = df_train.rename(columns = {'gap_y':'gap'})
df_train = df_train.drop(['smiles_x'],axis=1)
df_train = df_train.rename(columns = {'smiles_y':'smiles'})

In [None]:
df_train.head()

In [None]:
# Save df_train
df_train.to_csv('FINAL_train.csv', index=False)

In [None]:
# Remove df_train for now
del df_train_parts
del df_train_old
del df_train

# III. Test Data Processing and Feature Engineering
1. Drop all columns except smiles and ids
2. Partition the data and save those partitions
3. Remove all dataframes from memory
4. Load each partition (1 by 1), perform feature engineering, and save each partition to a csv
5. Reaggregate all partitions into one df, merge add binary-valued features from original test dataset, and save to csv

In [None]:
# Read in test data
df_test = pd.read_csv("test.csv")
df_test.head()

### 1. Drop all columns except smiles and ids

In [None]:
df_test = df_test.drop(df_test.columns[range(2,258)], axis=1)

In [None]:
df_test.head()

### 2. Partition the data and save those partitions

In [None]:
num_parts = 10

In [None]:
df_test_parts = np.array_split(df_test, num_parts)
for i in xrange(num_parts):
    df_test_parts[i].to_csv('smiles_and_ids_df_test_part_'+str(i)+'.csv', index=False)

### 3. Remove all dataframes from memory

In [None]:
del df_test
del df_test_parts

### 4. Load each partition (1 by 1), perform feature engineering, and save each partition to a csv

In [None]:
for i in xrange(num_parts):
    df_test_part = pd.read_csv('smiles_and_ids_df_test_part_'+str(i)+'.csv')
  

    # Apply feature engineering to train data
    num_branches = df_test_part.smiles.apply(lambda x: x.count('('))
    has_benzothiophene = df_test_part.smiles.apply(lambda x: min(1,x.count('s2c1ccccc1cc2')))
    has_carbazole = df_test_part.smiles.apply(lambda x: min(1,x.count('c1ccc2c(c1)c3ccccc3[nH]2')))
    has_fluorene = df_test_part.smiles.apply(lambda x: min(1,x.count('c1ccc-2c(c1)Cc3c2cccc3')))
    num_double_bonds = df_test_part.smiles.apply(lambda x: x.count('='))
    df_test_part['num_branches'] = num_branches
    df_test_part['has_benzothiophene'] = has_benzothiophene
    df_test_part['has_carbazole'] = has_carbazole
    df_test_part['has_fluorene'] = has_fluorene
    df_test_part['num_double_bonds'] = num_double_bonds
    print "Part {}: finished initial feat gen".format(i)
    # RDKit Feature Engineering
    # Generate molecule objects
    molecules = df_test_part.smiles.apply(lambda x: Chem.MolFromSmiles(x))
    print "Part {}: finished molecule generation".format(i)
    # Generate Features
    df_test_part['avg_molecular_weight'] = molecules.apply(lambda x: Descriptors.MolWt(x))
    print "Part {}: finished first feature".format(i)
    df_test_part['exact_molecular_weight'] = molecules.apply(lambda x: Descriptors.ExactMolWt(x))
    df_test_part['avg_molecular_weight_ignore_hydrogen'] = molecules.apply(lambda x: Descriptors.HeavyAtomMolWt(x))
    df_test_part['num_valence_electrons'] = molecules.apply(lambda x: Descriptors.NumValenceElectrons(x))
    df_test_part['num_radical_electrons'] = molecules.apply(lambda x: Descriptors.NumRadicalElectrons(x))
    df_test_part['formal_charge'] = molecules.apply(lambda x: rdmolops.GetFormalCharge(x))
    df_test_part['sssr'] = molecules.apply(lambda x: rdmolops.GetSSSR(x))
    print "Part {}: finished Descriptors and rdmolops".format(i)
    df_test_part['fraction_csp3'] = molecules.apply(lambda x: Lipinski.FractionCSP3(x))
    df_test_part['num_aliphatic_carbocycles'] = molecules.apply(lambda x: Lipinski.NumAliphaticCarbocycles(x))
    df_test_part['num_aliphatic_heterocycles'] = molecules.apply(lambda x: Lipinski.NumAliphaticHeterocycles(x))
    df_test_part['num_aliphatic_rings'] = molecules.apply(lambda x: Lipinski.NumAliphaticRings(x))
    df_test_part['num_aromatic_carbocycles'] = molecules.apply(lambda x: Lipinski.NumAromaticCarbocycles(x))
    df_test_part['num_aromatic_heterocycles'] = molecules.apply(lambda x: Lipinski.NumAromaticHeterocycles(x))
    df_test_part['num_aromatic_rings'] = molecules.apply(lambda x: Lipinski.NumAromaticRings(x))
    df_test_part['num_saturated_carbocycles'] = molecules.apply(lambda x: Lipinski.NumSaturatedCarbocycles(x))
    df_test_part['num_saturated_heterocycles'] = molecules.apply(lambda x: Lipinski.NumSaturatedHeterocycles(x))
    df_test_part['num_saturated_rings'] = molecules.apply(lambda x: Lipinski.NumSaturatedRings(x))
    df_test_part['num_nh_oh'] = molecules.apply(lambda x: Lipinski.NHOHCount(x))
    df_test_part['num_num_rotatable_bonds'] = molecules.apply(lambda x: Lipinski.NumRotatableBonds(x))
    df_test_part['num_heteroatoms'] = molecules.apply(lambda x: Lipinski.NumHeteroatoms(x))
    df_test_part['num_h_acceptors'] = molecules.apply(lambda x: Lipinski.NumHAcceptors(x))
    df_test_part['num_h_donors'] = molecules.apply(lambda x: Lipinski.NumHDonors(x))
    df_test_part['ring_count'] = molecules.apply(lambda x: Lipinski.RingCount(x))
    print "Part {}: finished Lipinski".format(i)
    # See Parsing_methods_from_rdk_source.ipynb
    df_test_part['fr_Al_COO'] = molecules.apply(lambda x: Fragments.fr_Al_COO(x))
    df_test_part['fr_Al_OH'] = molecules.apply(lambda x: Fragments.fr_Al_OH(x))
    df_test_part['fr_Al_OH_noTert'] = molecules.apply(lambda x: Fragments.fr_Al_OH_noTert(x))
    df_test_part['fr_ArN'] = molecules.apply(lambda x: Fragments.fr_ArN(x))
    df_test_part['fr_Ar_COO'] = molecules.apply(lambda x: Fragments.fr_Ar_COO(x))
    df_test_part['fr_Ar_N'] = molecules.apply(lambda x: Fragments.fr_Ar_N(x))
    df_test_part['fr_Ar_NH'] = molecules.apply(lambda x: Fragments.fr_Ar_NH(x))
    df_test_part['fr_Ar_OH'] = molecules.apply(lambda x: Fragments.fr_Ar_OH(x))
    df_test_part['fr_COO'] = molecules.apply(lambda x: Fragments.fr_COO(x))
    df_test_part['fr_COO2'] = molecules.apply(lambda x: Fragments.fr_COO2(x))
    df_test_part['fr_C_O'] = molecules.apply(lambda x: Fragments.fr_C_O(x))
    df_test_part['fr_C_O_noCOO'] = molecules.apply(lambda x: Fragments.fr_C_O_noCOO(x))
    df_test_part['fr_C_S'] = molecules.apply(lambda x: Fragments.fr_C_S(x))
    df_test_part['fr_HOCCN'] = molecules.apply(lambda x: Fragments.fr_HOCCN(x))
    df_test_part['fr_Imine'] = molecules.apply(lambda x: Fragments.fr_Imine(x))
    df_test_part['fr_NH0'] = molecules.apply(lambda x: Fragments.fr_NH0(x))
    df_test_part['fr_NH1'] = molecules.apply(lambda x: Fragments.fr_NH1(x))
    df_test_part['fr_NH2'] = molecules.apply(lambda x: Fragments.fr_NH2(x))
    df_test_part['fr_N_O'] = molecules.apply(lambda x: Fragments.fr_N_O(x))
    df_test_part['fr_Ndealkylation1'] = molecules.apply(lambda x: Fragments.fr_Ndealkylation1(x))
    df_test_part['fr_Ndealkylation2'] = molecules.apply(lambda x: Fragments.fr_Ndealkylation2(x))
    df_test_part['fr_Nhpyrrole'] = molecules.apply(lambda x: Fragments.fr_Nhpyrrole(x))
    df_test_part['fr_SH'] = molecules.apply(lambda x: Fragments.fr_SH(x))
    df_test_part['fr_aldehyde'] = molecules.apply(lambda x: Fragments.fr_aldehyde(x))
    df_test_part['fr_alkyl_carbamate'] = molecules.apply(lambda x: Fragments.fr_alkyl_carbamate(x))
    df_test_part['fr_alkyl_halide'] = molecules.apply(lambda x: Fragments.fr_alkyl_halide(x))
    df_test_part['fr_allylic_oxid'] = molecules.apply(lambda x: Fragments.fr_allylic_oxid(x))
    df_test_part['fr_amide'] = molecules.apply(lambda x: Fragments.fr_amide(x))
    df_test_part['fr_amidine'] = molecules.apply(lambda x: Fragments.fr_amidine(x))
    df_test_part['fr_aniline'] = molecules.apply(lambda x: Fragments.fr_aniline(x))
    df_test_part['fr_aryl_methyl'] = molecules.apply(lambda x: Fragments.fr_aryl_methyl(x))
    df_test_part['fr_azide'] = molecules.apply(lambda x: Fragments.fr_azide(x))
    df_test_part['fr_azo'] = molecules.apply(lambda x: Fragments.fr_azo(x))
    df_test_part['fr_barbitur'] = molecules.apply(lambda x: Fragments.fr_barbitur(x))
    df_test_part['fr_benzene'] = molecules.apply(lambda x: Fragments.fr_benzene(x))
    df_test_part['fr_benzodiazepine'] = molecules.apply(lambda x: Fragments.fr_benzodiazepine(x))
    df_test_part['fr_bicyclic'] = molecules.apply(lambda x: Fragments.fr_bicyclic(x))
    df_test_part['fr_diazo'] = molecules.apply(lambda x: Fragments.fr_diazo(x))
    df_test_part['fr_dihydropyridine'] = molecules.apply(lambda x: Fragments.fr_dihydropyridine(x))
    df_test_part['fr_epoxide'] = molecules.apply(lambda x: Fragments.fr_epoxide(x))
    df_test_part['fr_ester'] = molecules.apply(lambda x: Fragments.fr_ester(x))
    df_test_part['fr_ether'] = molecules.apply(lambda x: Fragments.fr_ether(x))
    df_test_part['fr_furan'] = molecules.apply(lambda x: Fragments.fr_furan(x))
    df_test_part['fr_guanido'] = molecules.apply(lambda x: Fragments.fr_guanido(x))
    df_test_part['fr_halogen'] = molecules.apply(lambda x: Fragments.fr_halogen(x))
    df_test_part['fr_hdrzine'] = molecules.apply(lambda x: Fragments.fr_hdrzine(x))
    df_test_part['fr_hdrzone'] = molecules.apply(lambda x: Fragments.fr_hdrzone(x))
    df_test_part['fr_imidazole'] = molecules.apply(lambda x: Fragments.fr_imidazole(x))
    df_test_part['fr_imide'] = molecules.apply(lambda x: Fragments.fr_imide(x))
    df_test_part['fr_isocyan'] = molecules.apply(lambda x: Fragments.fr_isocyan(x))
    df_test_part['fr_isothiocyan'] = molecules.apply(lambda x: Fragments.fr_isothiocyan(x))
    df_test_part['fr_ketone'] = molecules.apply(lambda x: Fragments.fr_ketone(x))
    df_test_part['fr_lactam'] = molecules.apply(lambda x: Fragments.fr_lactam(x))
    df_test_part['fr_lactone'] = molecules.apply(lambda x: Fragments.fr_lactone(x))
    df_test_part['fr_methoxy'] = molecules.apply(lambda x: Fragments.fr_methoxy(x))
    df_test_part['fr_morpholine'] = molecules.apply(lambda x: Fragments.fr_morpholine(x))
    df_test_part['fr_nitrile'] = molecules.apply(lambda x: Fragments.fr_nitrile(x))
    df_test_part['fr_nitro'] = molecules.apply(lambda x: Fragments.fr_nitro(x))
    df_test_part['fr_nitro_arom'] = molecules.apply(lambda x: Fragments.fr_nitro_arom(x))
    df_test_part['fr_nitro_arom_nonortho'] = molecules.apply(lambda x: Fragments.fr_nitro_arom_nonortho(x))
    df_test_part['fr_nitroso'] = molecules.apply(lambda x: Fragments.fr_nitroso(x))
    df_test_part['fr_oxazole'] = molecules.apply(lambda x: Fragments.fr_oxazole(x))
    df_test_part['fr_oxime'] = molecules.apply(lambda x: Fragments.fr_oxime(x))
    df_test_part['fr_para_hydroxylation'] = molecules.apply(lambda x: Fragments.fr_para_hydroxylation(x))
    df_test_part['fr_phenol'] = molecules.apply(lambda x: Fragments.fr_phenol(x))
    df_test_part['fr_phenol_noOrthoHbond'] = molecules.apply(lambda x: Fragments.fr_phenol_noOrthoHbond(x))
    df_test_part['fr_phos_acid'] = molecules.apply(lambda x: Fragments.fr_phos_acid(x))
    df_test_part['fr_phos_ester'] = molecules.apply(lambda x: Fragments.fr_phos_ester(x))
    df_test_part['fr_piperdine'] = molecules.apply(lambda x: Fragments.fr_piperdine(x))
    df_test_part['fr_piperzine'] = molecules.apply(lambda x: Fragments.fr_piperzine(x))
    df_test_part['fr_priamide'] = molecules.apply(lambda x: Fragments.fr_priamide(x))
    df_test_part['fr_prisulfonamd'] = molecules.apply(lambda x: Fragments.fr_prisulfonamd(x))
    df_test_part['fr_pyridine'] = molecules.apply(lambda x: Fragments.fr_pyridine(x))
    df_test_part['fr_quatN'] = molecules.apply(lambda x: Fragments.fr_quatN(x))
    df_test_part['fr_sulfide'] = molecules.apply(lambda x: Fragments.fr_sulfide(x))
    df_test_part['fr_sulfonamd'] = molecules.apply(lambda x: Fragments.fr_sulfonamd(x))
    df_test_part['fr_sulfone'] = molecules.apply(lambda x: Fragments.fr_sulfone(x))
    df_test_part['fr_term_acetylene'] = molecules.apply(lambda x: Fragments.fr_term_acetylene(x))
    df_test_part['fr_tetrazole'] = molecules.apply(lambda x: Fragments.fr_tetrazole(x))
    df_test_part['fr_thiazole'] = molecules.apply(lambda x: Fragments.fr_thiazole(x))
    df_test_part['fr_thiocyan'] = molecules.apply(lambda x: Fragments.fr_thiocyan(x))
    df_test_part['fr_thiophene'] = molecules.apply(lambda x: Fragments.fr_thiophene(x))
    df_test_part['fr_unbrch_alkane'] = molecules.apply(lambda x: Fragments.fr_unbrch_alkane(x))
    df_test_part['fr_urea'] = molecules.apply(lambda x: Fragments.fr_urea(x))
    df_test_part['fr_ketone_Topliss'] = molecules.apply(lambda x: Fragments.fr_ketone_Topliss(x))
    
    print "Part {}: finished Fragments".format(i)
    df_test_part.to_csv('rdk_feat_eng_df_test_part_'+str(i)+'.csv', index=False)
    del df_test_part
    del molecules

### 5. Reaggregate all partitions into one df, merge add binary-valued features from original test dataset, and save to csv

In [None]:
# read in each feat eng part
df_test_parts = [pd.read_csv('rdk_feat_eng_df_test_part_'+str(i)+'.csv') for i in xrange(num_parts)]

In [None]:
# combine into one df
df_test = pd.concat(df_test_parts)

In [None]:
# Read in old test data and merge with new features
df_test_old = pd.read_csv("test.csv")
df_test = df_test.merge(df_test_old, on=["Id"])
# Fix columns
df_test = df_test.drop(['smiles_x'],axis=1)
df_test = df_test.rename(columns = {'smiles_y':'smiles'})
df_test.head()

In [None]:
# Save to csv
df_test.to_csv('FINAL_test.csv', index=False)

In [None]:
# Delete old dfs
del df_test_old
del df_test_parts
del df_test

# *IF FEATURE ENGINEERING IS DONE: START HERE*
# IV. Model Selection 
1. Read training data, store output values, and remove output values from df_train
2. Split training data into training set and validation set
3. Model Selection -  Test various models
    1. Linear Regression
    2. Transform data using PCA
    3. Random Forest (extra trees) using PCA
    4. ... (TAYLOR AND ANDREW)

### 1. Read training data, store output values, and remove output values from df_train

In [2]:
df_train = pd.read_csv('FINAL_train.csv')

In [3]:
df_train.head()

Unnamed: 0,num_branches,has_benzothiophene,has_carbazole,has_fluorene,num_double_bonds,avg_molecular_weight,exact_molecular_weight,avg_molecular_weight_ignore_hydrogen,num_valence_electrons,num_radical_electrons,...,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256,gap
0,3,0,0,0,0,470.462,470.907296,461.39,130,0,...,1,0,0,0,0,0,0,0,0,1.19
1,1,0,0,0,5,352.545,352.085202,336.417,118,0,...,1,0,0,1,0,0,0,0,0,1.6
2,2,0,0,0,1,399.576,399.032016,386.472,128,0,...,1,0,0,0,1,0,0,0,0,1.49
3,1,0,0,0,4,379.567,379.084867,362.431,128,0,...,1,0,0,0,1,0,0,0,0,1.36
4,1,0,0,0,0,396.391,396.042944,388.327,136,0,...,1,0,0,0,0,0,0,0,0,1.98


In [4]:
# Drop the 'smiles' column 
smiles = df_train.smiles.values
df_train = df_train.drop(['smiles'], axis=1)

# Store gap values
Y_train = df_train.gap.values

# Delete 'gap' column
df_train = df_train.drop(['gap'], axis=1)
X_train = df_train.values
print "Train features:", X_train.shape
print "Train gap:", Y_train.shape

Train features: (999997, 369)
Train gap: (999997,)


In [8]:
# train rf regressor
rf_est = RandomForestRegressor(n_estimators=128,n_jobs=2, max_features='sqrt')
rf_est.fit(X_train, Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=128, n_jobs=2, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

## Feature Selection

In [9]:
# get feature names and importances
feats = zip(df_train.columns, rf_est.feature_importances_)
feats = sorted(feats, key=lambda x: x[1], reverse=True)
feats

[('fr_bicyclic', 0.066837312084785411),
 ('num_heteroatoms', 0.06621279132514378),
 ('feat_025', 0.057977357415829621),
 ('num_valence_electrons', 0.052716855768966224),
 ('feat_102', 0.038866604365385138),
 ('exact_molecular_weight', 0.038202059146884862),
 ('avg_molecular_weight', 0.038101911284794905),
 ('feat_119', 0.03696882049255118),
 ('num_branches', 0.036261569684482159),
 ('avg_molecular_weight_ignore_hydrogen', 0.034743792157084168),
 ('num_num_rotatable_bonds', 0.034155054188435945),
 ('num_h_acceptors', 0.032015032435348818),
 ('fr_NH0', 0.031493181331001853),
 ('feat_218', 0.029912043569035811),
 ('num_double_bonds', 0.026077931847574641),
 ('sssr', 0.025781859746211867),
 ('num_aliphatic_rings', 0.025159472420247332),
 ('fr_Ar_N', 0.024987301878706864),
 ('ring_count', 0.023287298561908011),
 ('fr_pyridine', 0.021056239368878817),
 ('num_aromatic_heterocycles', 0.019672691905820893),
 ('num_aliphatic_heterocycles', 0.018903139748309455),
 ('feat_251', 0.01617755578829347

In [10]:
# select top 25 ranked features
top_feats = zip(*feats[:25])[0]
top_feats

('fr_bicyclic',
 'num_heteroatoms',
 'feat_025',
 'num_valence_electrons',
 'feat_102',
 'exact_molecular_weight',
 'avg_molecular_weight',
 'feat_119',
 'num_branches',
 'avg_molecular_weight_ignore_hydrogen',
 'num_num_rotatable_bonds',
 'num_h_acceptors',
 'fr_NH0',
 'feat_218',
 'num_double_bonds',
 'sssr',
 'num_aliphatic_rings',
 'fr_Ar_N',
 'ring_count',
 'fr_pyridine',
 'num_aromatic_heterocycles',
 'num_aliphatic_heterocycles',
 'feat_251',
 'fr_allylic_oxid',
 'num_aromatic_carbocycles')

In [11]:
# create interaction terms (e.g. x_1 * x_2) for all top features
for i in range(len(top_feats)):
    for j in range(i+1,len(top_feats)):
        feat_i = top_feats[i]
        feat_j = top_feats[j]
        df_train[feat_i+'-'+feat_j] = df_train[feat_i]*df_train[feat_j]

In [12]:
df_train['smiles'] = smiles
df_train['gap'] = Y_train

In [13]:
df_train.to_csv('FINAL_train_25_interactions.csv', index=False)

In [14]:
del df_train

####  Add features for test data

In [15]:
df_test = pd.read_csv("FINAL_test.csv")

In [17]:
# create interaction terms (e.g. x_1 * x_2) for all top features
for i in range(len(top_feats)):
    for j in range(i+1,len(top_feats)):
        feat_i = top_feats[i]
        feat_j = top_feats[j]
        df_test[feat_i+'-'+feat_j] = df_test[feat_i]*df_test[feat_j]

In [18]:
df_test.head()

Unnamed: 0,Id,num_branches,has_benzothiophene,has_carbazole,has_fluorene,num_double_bonds,avg_molecular_weight,exact_molecular_weight,avg_molecular_weight_ignore_hydrogen,num_valence_electrons,...,num_aromatic_heterocycles-num_aliphatic_heterocycles,num_aromatic_heterocycles-feat_251,num_aromatic_heterocycles-fr_allylic_oxid,num_aromatic_heterocycles-num_aromatic_carbocycles,num_aliphatic_heterocycles-feat_251,num_aliphatic_heterocycles-fr_allylic_oxid,num_aliphatic_heterocycles-num_aromatic_carbocycles,feat_251-fr_allylic_oxid,feat_251-num_aromatic_carbocycles,fr_allylic_oxid-num_aromatic_carbocycles
0,1,2,0,0,0,0,409.499,409.045587,398.411,136,...,0,0,0,10,0,0,0,0,0,0
1,2,0,0,0,0,0,352.469,351.991109,344.405,110,...,0,0,0,5,0,0,0,0,0,0
2,3,1,0,0,0,2,514.569,514.948537,501.465,146,...,4,0,0,12,0,0,3,0,0,0
3,4,2,0,0,0,4,376.491,376.10319,360.363,132,...,3,0,0,6,0,0,2,0,0,0
4,5,3,0,0,0,0,569.637,569.844956,559.557,154,...,0,0,0,0,0,0,0,0,0,0


In [19]:
df_test.to_csv('FINAL_test_25_interactions.csv', index=False)

In [20]:
del df_test

### 2. Split training data into training set and validation set

In [21]:
df_train = pd.read_csv('FINAL_train_25_interactions.csv')

In [22]:
# Drop the 'smiles' column 
df_train = df_train.drop(['smiles'], axis=1)

# Store gap values
Y_train = df_train.gap.values

# Delete 'gap' column
df_train = df_train.drop(['gap'], axis=1)
X_train = df_train.values
print "Train features:", X_train.shape
print "Train gap:", Y_train.shape

Train features: (999997, 669)
Train gap: (999997,)


In [23]:
# Partition Training Data into Training, Validation
cross_X_train, cross_X_valid, cross_Y_train, cross_Y_valid = train_test_split(X_train, Y_train, test_size=0.33, random_state=42)

### 3. Model Selection -  Test various models
#### A. Linear Regression

In [24]:
# Fit Linear Regression to cross_X_train and validate it on validations set
LR = LinearRegression()
LR.fit(cross_X_train, cross_Y_train)
LR_pred = LR.predict(cross_X_valid)

In [25]:
LR.coef_
zero_coefs = []
for i in xrange(len(LR.coef_)):
    if LR.coef_[i] == 0:
        zero_coefs.append(i)
print "Number of coefficients that are zero:", len(zero_coefs)
print "Total number of coefficients:", len(LR.coef_)
print zero_coefs

Number of coefficients that are zero: 79
Total number of coefficients: 669
[276, 277, 278, 279, 280, 281, 282, 283, 284, 286, 287, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 300, 301, 302, 303, 304, 305, 306, 307, 309, 310, 313, 314, 315, 316, 317, 318, 319, 321, 322, 323, 324, 325, 326, 327, 328, 329, 331, 332, 333, 334, 335, 336, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 356, 357, 358, 359, 361, 362, 365, 366, 367, 368]


In [26]:
print LR_pred

[ 1.41847547  1.97750314  1.77424319 ...,  1.88201698  1.78367074
  2.14235671]


In [27]:
mean_squared_error(cross_Y_valid, LR_pred)**.5

0.16509869307012576

#### B. Transform data using PCA

In [None]:
pca = PCA(n_components=60)
cross_X_train_transf = pca.fit_transform(cross_X_train)
cross_X_valid_transf = pca.transform(cross_X_valid)

#### No PCA, Random Forest, feature selection

In [28]:
# train rf regressor
rf_est = RandomForestRegressor(n_estimators=128,n_jobs=2, max_features='sqrt')
rf_est.fit(cross_X_train, cross_Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=128, n_jobs=2, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [29]:
rf_pred = rf_est.predict(cross_X_valid)

In [30]:
#RMSE
mean_squared_error(cross_Y_valid, rf_pred)**.5

0.14326511933024585

#### C. Random Forest (extra trees) using PCA

In [None]:
extraTrees_pca = ExtraTreesRegressor(n_estimators=100,n_jobs=2)
tree_est_wPCA = extraTrees_pca.fit(cross_X_train_transf, cross_Y_train)
pca_exTree_pred = tree_est_wPCA.predict(cross_X_valid_transf)

In [None]:
# Calculate RMSE
mean_squared_error(cross_Y_valid, pca_exTree_pred)**.5

#### D1. Lasso

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

# Set parameters to test
# alphas = np.logspace(-4, -.5, 30) alpha = 0.0001 was best with RMSE of 0.25
alphas = [.5,.1,.01,.001,.0001,.00001]

# Initialize minimums 

minimum_alpha = 100
minimum_RMSE = 100
for alpha in alphas:
    
    # Fit model and predict on validation
    clf = linear_model.Lasso(alpha=alpha)
    clf.fit(cross_X_train,cross_Y_train)
    y_pred = clf.predict(cross_X_valid) 
    
    # Calculate RMSE and update minimum RMSE if possible
    RMSE = np.sqrt(mean_squared_error(cross_Y_valid, y_pred))
    if RMSE < minimum_RMSE:
        minimum_RMSE = RMSE
        minimum_alpha = alpha
    
print "minimum RMSE is", minimum_RMSE
print "minimum alpha is",minimum_alpha

#### D2. Lasso using PCA

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

# Set parameters to test
# alphas = np.logspace(-4, -.5, 30) alpha = 0.0001 was best with RMSE of 0.25
alphas = [.5,.1,.01,.001,.0001,.00001]

# Initialize minimums 

minimum_alpha = 100
minimum_RMSE = 100
for alpha in alphas:
    
    # Fit model and predict on validation
    clf = linear_model.Lasso(alpha=alpha)
    clf.fit(cross_X_train_transf,cross_Y_train)
    y_pred = clf.predict(cross_X_valid_transf) 
    
    # Calculate RMSE and update minimum RMSE if possible
    RMSE = np.sqrt(mean_squared_error(cross_Y_valid, y_pred))
    if RMSE < minimum_RMSE:
        minimum_RMSE = RMSE
        minimum_alpha = alpha
    
print "minimum RMSE is", minimum_RMSE
print "minimum alpha is",minimum_alpha

#### D1. Elastic Net

In [None]:
# Set parameters to test
alphas = [.5,.1,.01,.001,.0001,.00001] 
ratios = [.5,.1,.01,.001,.0001,.00001]
counter = 0

# Initialize minimums
minimum_alpha = 100
minimum_ratio = 100
minimum_RMSE = 100
for alpha in alphas:
    for ratio in ratios:
        
        # Fit model and predict on validation
        clf = linear_model.ElasticNet(alpha=alpha, l1_ratio=ratio)
        clf.fit(cross_X_train,cross_Y_train)
        y_pred = clf.predict(cross_X_valid) 
        
        # Calculate RMSE and update minimum RMSE if possible
        RMSE = np.sqrt(mean_squared_error(cross_Y_valid, y_pred))
        if RMSE < minimum_RMSE:
            minimum_RMSE = RMSE
            minimum_alpha = alpha
            minimum_ratio = ratio
    counter +=1
    print counter
print "minimum RMSE is", minimum_RMSE
print "minimum alpha is",minimum_alpha
print "minimum ratio is",minimum_ratio

#### D2. Elastic Net using PCA

In [None]:
# Set parameters to test
alphas = [.5,.1,.01,.001,.0001,.00001] 
ratios = [.5,.1,.01,.001,.0001,.00001]

# Initialize minimums
minimum_alpha = 100
minimum_ratio = 100
minimum_RMSE = 100
for alpha in alphas:
    for ratio in ratios:
        
        # Fit model and predict on validation
        clf = linear_model.ElasticNet(alpha=alpha, l1_ratio=ratio)
        clf.fit(cross_X_train_transf,cross_Y_train)
        y_pred = clf.predict(cross_X_valid_transf) 
        
        # Calculate RMSE and update minimum RMSE if possible
        RMSE = np.sqrt(mean_squared_error(cross_Y_valid, y_pred))
        if RMSE < minimum_RMSE:
            minimum_RMSE = RMSE
            minimum_alpha = alpha
            minimum_ratio = ratio

print "minimum RMSE is", minimum_RMSE
print "minimum alpha is",minimum_alpha
print "minimum ratio is",minimum_ratio

In [None]:
# Remove cross validation datasets
del cross_X_train, cross_X_valid, cross_Y_train, cross_Y_valid

# V. Final Model Construction and Test Set Prediction 
1. Read in training data
2. Read in test data
3. If using PCA, transform training and test data
4. Train model using training data
5. Predict output using test data
6. Write output to csv

In [None]:
df_test = pd.read_csv('FINAL_test.csv')

In [None]:
#delete 'Id' column
df_test = df_test.drop(['Id'], axis=1)
# Drop the 'smiles' column
df_test= df_test.drop(['smiles'], axis=1)
X_test = df_test.values

In [None]:
def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            f.write(str(i+1) + "," + str(p) + "\n")

In [None]:
pca = PCA(n_components=60)
extraTrees_pca = ExtraTreesRegressor(n_estimators=100,n_jobs=2)
# extraTrees = ExtraTreesRegressor(n_estimators=25,n_jobs=2)

X_transf = pca.fit_transform(X_train)
X_test_transf = pca.transform(X_test)

tree_est_wPCA = extraTrees_pca.fit(X_transf, Y_train)
# tree_estimator = extraTrees.fit(X_train, Y_train)

pca_exTree_pred = tree_est_wPCA.predict(X_test_transf)
# exTree_pred = tree_estimator.predict(X_test)

In [None]:
#write to file