### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            f.write(str(i+1) + "," + str(p) + "\n")

### Training

In [2]:
""" 
Read in train data as Pandas DataFrames
"""
df = pd.read_csv("train_2048.csv")

In [3]:
"""
Obtain training features and labels
"""

Y = df.gap.values
X = df.drop(['gap', 'smiles'], axis=1).values

In [4]:
"""
Check data size
"""
print ("Train features:", X.shape)
print ("Train gap:", Y.shape)

Train features: (1000000, 2048)
Train gap: (1000000,)


In [5]:
"""
Train regressor
"""
regressor = RandomForestRegressor(n_jobs=4, verbose=1)
regressor.fit(X, Y)

[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed: 23.4min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=4, oob_score=False, random_state=None,
           verbose=1, warm_start=False)

### Predicting

In [6]:
"""
Read in test data as Pandas dataframe
"""
df = pd.read_csv("test_2048.csv")

In [7]:
"""
Obtain testing data features
"""
X = df.drop(['Id', 'smiles'], axis=1).values

In [8]:
"""
Check data size
"""
print ("Test features:", X.shape)

Test features: (824230, 2048)


In [9]:
"""
Predict and write to file
"""
pred = regressor.predict(X)
write_to_file("2048RF.csv", pred)

[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    5.2s finished
