In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem import MACCSkeys
from rdkit.Chem import AllChem

In [None]:
"""
Read in train and test as Pandas DataFrames
"""
df_train = pd.read_csv("train.csv", nrows=100)
df_test = pd.read_csv("test.csv")
print("ok")

In [None]:
# Keep the regression target (the 'gap' column) as a NumPy array.
Y_train = df_train["gap"].values
# Number of training rows, i.e. the row at which the test examples start
# once train and test are stacked.
test_idx = len(df_train)
# Strip the non-feature columns: 'Id' from the test frame, 'gap' from the train frame.
df_test = df_test.drop(columns=["Id"])
df_train = df_train.drop(columns=["gap"])

In [None]:
# Stack train and test so features are engineered once for both.
# ignore_index=True gives the combined frame a unique 0..N-1 index. Without it
# the test rows restart at label 0, and any index-aligned column assignment
# (e.g. df_all[col] = pd.DataFrame(values)) silently writes the values of the
# first rows into the test rows instead of their own.
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
smiles_len = len(df_all)

# Parse every SMILES string into an RDKit molecule, in row order.
mol_all = [Chem.MolFromSmiles(x) for x in df_all.smiles.astype(str)]

# MolFromSmiles returns None for input it cannot parse; fail here with a clear
# message rather than with an opaque error in a later feature cell.
bad_rows = [pos for pos, mol in enumerate(mol_all) if mol is None]
if bad_rows:
    raise ValueError(
        "%d SMILES strings could not be parsed (first row positions: %s)"
        % (len(bad_rows), bad_rows[:10])
    )
print("ok")

In [5]:
# Ring count per molecule from RDKit's SSSR perception.
# Chem.GetSSSR is handled for both return styles: an integer count, or a
# collection of rings whose length is the count (assumed to be what newer
# RDKit releases return -- TODO confirm against the installed version).
_sssr_raw = [Chem.GetSSSR(x) for x in mol_all]
SSSR_len = np.array(
    [r if isinstance(r, int) else len(r) for r in _sssr_raw],
    dtype=int,
).reshape(-1, 1)  # keep the original (n_molecules, 1) column shape

# Assign positionally (plain 1-D array). Wrapping the values in a DataFrame
# would align on index labels, which mis-assigns rows whenever df_all's index
# is not a unique 0..N-1 range.
df_all['SSSR_len'] = SSSR_len.ravel()
print(SSSR_len)

In [None]:
# SMARTS patterns whose per-molecule match counts are used as features.
# Strings are kept exactly as written (including trailing spaces on a few
# entries) because each pattern text also becomes part of a column name.
func_group_list = [
    "[CX4]",
    "[$([CX2](=C)=C)]",
    "[$([CX3]=[CX3])]",
    "[CX3]=[OX1]",
    "[OX1]=CN",
    "[CX3](=[OX1])O",
    "[CX3](=[OX1])[F,Cl,Br,I]",
    "[NX3][CX3](=[OX1])[#6]",
    "[CX3H1](=O)[#6]",
    "[CX3](=[OX1])(O)O",
    "[CX3](=O)[OX2H1]",
    "[#6][CX3](=O)[#6]",
    "[CX3](=O)[OX1H0-,OX2H1]",
    "[OD2]([#6])[#6]",
    "[H]",
    "[H+]",
    "[+H]",
    "[NX3;H2,H1;!$(NC=O)]",
    "[NX3,NX4+][CX4H]([*])[CX3](=[OX1])[O,N]",
    "[NX3][CX3]=[CX3]",
    "[$(*-[NX2-]-[NX2+]#[NX1]),$(*-[NX2]=[NX2+]=[NX1-])]",
    "[$([NX3](=[OX1])(=[OX1])O),$([NX3+]([OX1-])(=[OX1])O)] ",
    "[NX1]#[CX2]",
    "[OX2H]",
    "[OX2H][#6X3]=[#6]",
    "[#6][OX2H]",
    "[OX2H][CX3]=[OX1]",
    "[OX2,OX1-][OX2,OX1-]",
    "[S-][CX3](=S)[#6]",
    "[#16X2H]",
    "[SX2]",
    "[NX3][CX3]=[SX1]",
    "[#16X2H0]",
    "[$([#16X3](=[OX1])[OX2H0]),$([#16X3+]([OX1-])[OX2H0])]",
    "[$([#16X4](=[OX1])(=[OX1])([OX2H,OX1H0-])[OX2][#6]),$([#16X4+2]([OX1-])([OX1-])([OX2H,OX1H0-])[OX2][#6])]",
    "[#6][F,Cl,Br,I]",
    "[F,Cl,Br,I]",
    "[F,Cl,Br,I].[F,Cl,Br,I].[F,Cl,Br,I]",
    "[CX3](=[OX1])[F,Cl,Br,I]",
    "[$([cX2+](:*):*)]",
    "[$([cX3](:*):*),$([cX2+](:*):*)] ",
    "[$([cX3](:*):*),$([cX2+](:*):*),$([CX3]=*),$([CX2+]=*)] ",
    "[$([nX3](:*):*),$([nX2](:*):*),$([#7X2]=*),$([NX3](=*)=*),$([#7X3+](-*)=*),$([#7X3+H]=*)]",
    "[$([NX4+]),$([NX3]);!$(*=*)&!$(*:*)]",
    "[$([#1X1][$([NX4+]),$([NX3]);!$(*=*)&!$(*:*)])]",
    "[$([SX3]=N)]",
    "[$([NX1]#*)]",
    "[R0;D2][R0;D2][R0;D2][R0;D2]",
    "[cR1]1[cR1][cR1][cR1][cR1][cR1]1",
    "[sX2r5]",
    # Raw string: the pattern contains literal backslashes. Written as a normal
    # string, "\[" and "\*" are invalid escape sequences (SyntaxWarning on
    # recent Python versions); the resulting value is unchanged.
    r"*/,\[R]=;@[R]/,\*",
    "[cR1]1[cR1][cR1][cR1][cR1][cR1]1.[cR1]1[cR1][cR1][cR1][cR1][cR1]1",
    "c12ccccc1cccc2",
    "[!H0;F,Cl,Br,I,N+,$([OH]-*=[!#6]),+]",
    "[CX3](=O)[OX2H1]",
    "[$([OH]-*=[!#6])]",
    "[$([#16X4](=[OX1])(=[OX1])([#6])[OX2H,OX1H0-]),$([#16X4+2]([OX1-])([OX1-])([#6])[OX2H,OX1H0-])]",
    "[CX3](=[OX1])[F,Cl,Br,I]",
    "[NX2-]",
    "[OX2H+]=*",
    "[OX3H2+]",
    "[$([cX2+](:*):*)]",
    "[$([NX1-]=[NX2+]=[NX1-]),$([NX1]#[NX2+]-[NX1-2])]",
    "[+1]~*~*~[-1]",
    "[#6,#7;R0]=[#8]",
    "[!$([#6,H0,-,-2,-3])]",
    "[!H0;#7,#8,#9]",
    "[O,N;!H0]-*~*-*=[$([C,N;R0]=O)]",
    "[#6;X3v3+0]",
    "[#7;X2v4+0]",
]
# A few patterns are listed more than once above. Each pattern maps to a single
# column name, so a repeat would only recompute and overwrite the same column.
# Drop repeats while keeping first-occurrence order.
func_group_list = list(dict.fromkeys(func_group_list))

In [7]:
# One feature column per SMARTS pattern: the number of unique substructure
# matches of that pattern in each molecule.
for term in func_group_list:
    smarts = Chem.MolFromSmarts(term)
    # MolFromSmarts returns None for a pattern it cannot parse; report which one.
    if smarts is None:
        raise ValueError("Invalid SMARTS pattern: %r" % term)
    rel_count = np.array(
        [len(x.GetSubstructMatches(smarts, uniquify=True)) for x in mol_all],
        dtype=int,
    )
    # Assign the plain array so values are placed by row position. Wrapping it
    # in pd.DataFrame would align on index labels and mis-assign rows whenever
    # df_all's index is not a unique 0..N-1 range.
    df_all['rel_count_' + term] = rel_count

'\nExample Feature Engineering\n\nthis calculates the length of each smile string and adds a feature column with those lengths\nNote: this is NOT a good feature and will result in a lower score!\n'

In [8]:
# Morgan fingerprint bit features: one 0/1 column per fingerprint bit.
MORGAN_RADIUS = 2
MORGAN_NBITS = 1024

morgan = [
    AllChem.GetMorganFingerprintAsBitVect(x, MORGAN_RADIUS, nBits=MORGAN_NBITS)
    for x in mol_all
]

# Build the whole (n_molecules, n_bits) matrix once instead of indexing every
# bit of every fingerprint from Python. uint8 is enough for 0/1 values and keeps
# the matrix small.
morgan_bits = np.zeros((len(morgan), MORGAN_NBITS), dtype=np.uint8)
for row, fp in enumerate(morgan):
    # GetOnBits() yields the positions of the set bits of this fingerprint.
    morgan_bits[row, list(fp.GetOnBits())] = 1

morgan_cols = ['morgVec' + str(i) for i in range(MORGAN_NBITS)]
# Reuse df_all's own index so the column-wise concat is a pure positional join.
morgan_df = pd.DataFrame(morgan_bits, columns=morgan_cols, index=df_all.index)
# Drop any morgVec columns left by a previous run so re-running the cell
# replaces them instead of duplicating them, then attach all bits in one concat
# (avoids 1024 separate column insertions).
df_all = pd.concat(
    [df_all.drop(columns=morgan_cols, errors='ignore'), morgan_df],
    axis=1,
)

Train features: (1000000, 256)
Train gap: (1000000,)
Test features: (824230, 256)
