In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
# Master switch for the optional cells below: every `if RUNALL:` cell only makes its
# select_best_model call when this is True.
RUNALL = False

In [2]:
"""
Read in train and test as Pandas DataFrames
"""
# Load both CSV files (train first, then test) from the working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [3]:
# Optional: uncomment to iterate on a random subset of the training rows.
# df_train = df_train.sample(n=10000)

# Target values: the 'gap' column of the training frame as a NumPy array.
Y_train = df_train["gap"].values
# Number of training rows, i.e. the row offset where test examples begin
# once train and test are stacked on top of each other.
test_idx = len(df_train)
# Remove the target from the training frame and the 'Id' column from the test frame.
df_train = df_train.drop(columns=["gap"])
df_test = df_test.drop(columns=["Id"])

In [4]:
# Stack train and test rows into one frame so feature engineering is applied to both at once.
# ignore_index=True gives the combined frame a fresh 0..N-1 index. Without it the row labels
# of df_train and df_test are kept and overlap, so any later index-aligned assignment
# (e.g. df_all[col] = pd.DataFrame(...)) would silently take train-row values for test rows.
# Row order is unchanged, so slicing at test_idx still separates train from test.
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
df_all.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# adding features
from __future__ import print_function
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors

# store smiles (kept as a standalone Series because the column is dropped from df_all below)
smiles = df_all.smiles

# (new column name, substring) pairs. Each feature is the number of times the substring
# occurs in the SMILES string, counted with str.count (case-sensitive, non-overlapping).
SUBSTRING_FEATURES = [
    ('carbons', 'c'),
    ('double', '='),
    ('single', '-'),
    ('nitrogen', 'n'),
    ('oxygen', 'o'),
    ('ccccc', 'ccccc'),
    ('ccc', 'ccc'),
    ('Si', 'Si'),
    ('se', 'se'),
]

# Set to True to also compute the RDKit-based features (parses every SMILES string).
USE_RDKIT_FEATURES = False


def _count_substring(smiles_column, token):
    """Return a 1-D array holding the number of occurrences of `token` in each SMILES string.

    The result is a plain NumPy array, so assigning it to a DataFrame column is positional
    and does not depend on the frame's index labels.
    """
    return smiles_column.apply(lambda s: s.count(token)).values


start = time.time()

for column, token in SUBSTRING_FEATURES:
    df_all[column] = _count_substring(smiles, token)
    # The printed time is cumulative since `start`, not the cost of this one feature.
    print("how long", (time.time() - start))

if USE_RDKIT_FEATURES:
    # Parse every SMILES string once. The Mol objects are kept in a standalone Series rather
    # than in a df_all column, so they can never end up in the matrix taken from df_all.values.
    mols = smiles.astype(str).apply(lambda s: Chem.MolFromSmiles(s))

    # aromaticity: number of atoms flagged aromatic in each molecule.
    # .values makes the assignment positional instead of index-aligned.
    df_all['aro'] = mols.apply(
        lambda m: sum(int(m.GetAtomWithIdx(i).GetIsAromatic()) for i in range(m.GetNumAtoms()))
    ).values

    # NOTE: the original author also tried TPSA (Descriptors.TPSA) here and marked it
    # "BAAAAAD"; the reason was not recorded, so it is intentionally left out.

    # sp3 hybridization
    df_all['sp3'] = mols.apply(lambda m: rdMolDescriptors.CalcFractionCSP3(m)).values

# Drop the raw 'smiles' strings before converting the frame to a matrix.
df_all = df_all.drop(['smiles'], axis=1)
vals = df_all.values
# The first test_idx rows are the training examples; the rest are the test examples.
X_train = vals[:test_idx]
X_test = vals[test_idx:]
# Cheap invariant: one target value per training row.
assert X_train.shape[0] == Y_train.shape[0], "X_train and Y_train row counts differ"
print("Train features:", X_train.shape)
print("Train gap:", Y_train.shape)
print("Test features:", X_test.shape)

how long 11.985354900360107
how long 22.383240938186646
how long 33.414939880371094
how long 45.98497676849365
how long 57.08114981651306
how long 68.65900707244873
how long 79.48260879516602
how long 90.68582081794739
how long 103.41392111778259
Train features: (1000000, 265)
Train gap: (1000000,)
Test features: (824230, 265)


In [None]:
# (new df_all column name, rdkit.Chem.Descriptors function) pairs.
RDKIT_DESCRIPTOR_FEATURES = [
    ('hd', Descriptors.NumHDonors),
    ('ha', Descriptors.NumHAcceptors),
    ('aliphcarbo', Descriptors.NumAliphaticCarbocycles),
    ('aliphhetero', Descriptors.NumAliphaticHeterocycles),
    ('aliphrings', Descriptors.NumAliphaticRings),
    ('arocarbos', Descriptors.NumAromaticCarbocycles),
    ('aroheteros', Descriptors.NumAromaticHeterocycles),
    ('arorings', Descriptors.NumAromaticRings),
    ('heteros', Descriptors.NumHeteroatoms),
    ('radelecs', Descriptors.NumRadicalElectrons),
    ('rotbonds', Descriptors.NumRotatableBonds),
    ('satcarbos', Descriptors.NumSaturatedCarbocycles),
    ('satheteros', Descriptors.NumSaturatedHeterocycles),
    ('satrings', Descriptors.NumSaturatedRings),
    ('valelecs', Descriptors.NumValenceElectrons),
]

# `mols` (one parsed RDKit molecule per row) only exists if the optional RDKit branch of the
# feature cell ran. Use it when present; otherwise build it from the saved `smiles` Series,
# but only when RUNALL is set, because parsing every SMILES string is expensive.
_mols_available = 'mols' in globals()
if RUNALL and not _mols_available:
    mols = smiles.astype(str).apply(lambda s: Chem.MolFromSmiles(s))
    _mols_available = True

if _mols_available:
    for column, descriptor in RDKIT_DESCRIPTOR_FEATURES:
        # .values makes the assignment positional; wrapping the result in pd.DataFrame would
        # align on index labels, which is wrong whenever df_all's labels are not unique.
        df_all[column] = mols.apply(descriptor).values
    # NOTE(review): X_train / X_test were already sliced from df_all.values in the feature
    # cell above, so these columns do not reach the models unless the matrices are re-sliced
    # after this cell. Confirm whether that is intended.

In [6]:
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor


# Ordinary least squares: fit, then hand back the fitted model's predictor.
def train_and_test_linear_model(X_train, Y_train):
    """Fit a LinearRegression on (X_train, Y_train) and return its bound `predict` method."""
    # fit() returns the estimator itself, so the call can be chained.
    return LinearRegression().fit(X_train, Y_train).predict



# RIDGE REG
def train_and_test_ridge_model(alpha):
    """Return a trainer for ridge regression with the given `alpha`.

    The returned function takes (X_train, Y_train), fits linear_model.Ridge(alpha=alpha)
    on them and returns the fitted model's bound `predict` method.
    """
    def helper(X_train, Y_train):
        ridge = linear_model.Ridge(alpha=alpha)
        # fit() returns the estimator itself, so the predictor can be taken directly.
        return ridge.fit(X_train, Y_train).predict
    return helper


# LASSO REG
def train_and_test_lasso_model(alpha):
    """Return a trainer for lasso regression with the given `alpha`.

    The returned function takes (X_train, Y_train), fits linear_model.Lasso(alpha=alpha)
    on them and returns the fitted model's bound `predict` method.
    """
    def helper(X_train, Y_train):
        lasso = linear_model.Lasso(alpha=alpha)
        # fit() returns the estimator itself, so the predictor can be taken directly.
        return lasso.fit(X_train, Y_train).predict
    return helper


# ELASTICNET REG
def train_and_test_elastic_net_model(alpha):
    """Return a trainer for elastic-net regression with the given `alpha`.

    The returned function takes (X_train, Y_train), fits linear_model.ElasticNet(alpha=alpha)
    on them and returns the fitted model's bound `predict` method.
    """
    def helper(X_train, Y_train):
        elastic_net = linear_model.ElasticNet(alpha=alpha)
        # fit() returns the estimator itself, so the predictor can be taken directly.
        return elastic_net.fit(X_train, Y_train).predict
    return helper

# SVM REG
def train_and_test_svm_model(kernel='rbf', C=1.0, epsilon=0.2):
    """Return a trainer for support vector regression.

    Parameters
    ----------
    kernel, C, epsilon
        Passed straight to sklearn.svm.SVR. The defaults ('rbf', 1.0, 0.2) are the values
        that were previously hard-coded, so calling with no arguments behaves as before.

    Returns
    -------
    callable
        helper(X_train, Y_train): fits a fresh SVR on the data and returns the fitted
        model's bound `predict` method.
    """
    def helper(X_train, Y_train):
        reg = SVR(kernel=kernel, C=C, epsilon=epsilon)
        reg.fit(X_train, Y_train)
        return reg.predict
    return helper

# RF pred
# fn that returns trained Random Forest to make predictions
def train_and_test_RF_model(X_train, Y_train, n_estimators=20, random_state=None):
    """Fit a RandomForestRegressor and return the fitted model's bound `predict` method.

    Parameters
    ----------
    X_train, Y_train
        Training features and targets, passed unchanged to `fit`.
    n_estimators
        Number of trees; defaults to 20, the value that was previously hard-coded.
    random_state
        Forwarded to RandomForestRegressor. The default None keeps the previous
        (unseeded) behaviour; pass an int to make repeated runs reproducible.
    """
    RF = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    RF.fit(X_train, Y_train)
    # return the prediction function of the forest we've trained
    return RF.predict

In [7]:
from training_and_validation import select_best_model

In [None]:
# Optional (only when RUNALL is True): run select_best_model with the plain linear-regression trainer.
# NOTE(review): the third argument (4) is presumably the number of validation splits — confirm
# against training_and_validation.select_best_model.
if RUNALL:
    select_best_model(X_train, Y_train, 4, train_and_test_linear_model)

In [None]:
# Run select_best_model with the ridge trainer (alpha=0.1). Unlike the neighbouring
# model-selection cells this call has no RUNALL guard, so it runs whenever the cell is executed.
# NOTE(review): the third argument (2) is presumably the number of validation splits — confirm.
select_best_model(X_train, Y_train, 2, train_and_test_ridge_model(0.1))

In [None]:
# Optional (only when RUNALL is True): run select_best_model with the lasso trainer (alpha=0.1).
if RUNALL:
    select_best_model(X_train, Y_train, 4, train_and_test_lasso_model(0.1))

In [None]:
# Optional (only when RUNALL is True): run select_best_model with the elastic-net trainer (alpha=0.1).
if RUNALL:
    select_best_model(X_train, Y_train, 4, train_and_test_elastic_net_model(0.1))

In [None]:
# Optional (only when RUNALL is True): run select_best_model with the SVR trainer.
if RUNALL:
    select_best_model(X_train, Y_train, 4, train_and_test_svm_model())

In [8]:
# Run select_best_model with the random-forest trainer.
# `or True` makes the condition always true, so this cell runs even when RUNALL is False.
if RUNALL or True:
    select_best_model(X_train, Y_train, 2, train_and_test_RF_model)

next validation set
training
got a prediction!
next validation set
training
got a prediction!
Final error 0.17542562939217032


In [None]:
# Optional (only when RUNALL is True): linear regression on X_train_red.
# NOTE(review): X_train_red is not defined in any cell visible in this notebook; with
# RUNALL=True this raises NameError unless it is created elsewhere — confirm or remove.
if RUNALL:
    select_best_model(X_train_red, Y_train, 4, train_and_test_linear_model)

In [None]:
# Optional (only when RUNALL is True): random forest on X_train_red.
# NOTE(review): X_train_red is not defined in any cell visible in this notebook; with
# RUNALL=True this raises NameError unless it is created elsewhere — confirm or remove.
if RUNALL:
    select_best_model(X_train_red, Y_train, 4, train_and_test_RF_model)

In [None]:
# Optional (only when RUNALL is True): SVR on X_train_red.
# NOTE(review): X_train_red is not defined in any cell visible in this notebook; with
# RUNALL=True this raises NameError unless it is created elsewhere — confirm or remove.
if RUNALL:
    select_best_model(X_train_red, Y_train, 4, train_and_test_svm_model())

In [9]:
def write_to_file(filename, predictions):
    """Write `predictions` to `filename` as a two-column CSV.

    The file starts with the header line "Id,Prediction". Each prediction follows on its
    own line as "<id>,<value>", where ids count up from 1 in iteration order and the value
    is rendered with str(). An existing file is overwritten.
    """
    with open(filename, "w") as out:
        out.write("Id,Prediction\n")
        out.writelines(
            # !s forces str(value), matching plain string concatenation for any value type.
            "{},{!s}\n".format(row_id, value)
            for row_id, value in enumerate(predictions, start=1)
        )

In [10]:
# Ridge regression (alpha=0.1): fit on the full training set, predict the test set,
# and write the predictions to rid_reg_correct.csv.
rid_reg = linear_model.Ridge(alpha=0.1).fit(X_train, Y_train)  # fit() returns the estimator
rid_reg_pred = rid_reg.predict(X_test)
write_to_file("rid_reg_correct.csv", rid_reg_pred)

print("starting RF")

# Random forest with 20 trees: same procedure, written to rf_correct.csv.
rf = RandomForestRegressor(n_estimators=20).fit(X_train, Y_train)
rf_pred = rf.predict(X_test)
write_to_file("rf_correct.csv", rf_pred)

starting RF
