In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [173]:
from sklearn import ensemble, tree, linear_model
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.utils import shuffle

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [82]:
def get_score(prediction, labels):
    print("R2: {}".format(r2_score(prediction,labels)))
    print('RMSE: {}'.format(np.sqrt(mean_squared_error(prediction, labels))))
    print("RMSLE: {}".format(np.sqrt(np.square(np.log(prediction + 1) - np.log(labels + 1)).mean())))

def train_test(estimator, x_train, x_test, y_train, y_test):
    prediction_train = estimator.predict(x_train)
    print(estimator)
    get_score(prediction_train,y_train)
    
    prediction_test = estimator.predict(x_test)
    print("Test")
    get_score(prediction_test,y_test)

In [129]:
train_data = train.copy()
train_labels = train_data.pop('formation_energy_ev_natom')
bandgap_labels = train_data.pop('bandgap_energy_ev')

features = pd.concat([train_data, test], keys = ['train','test'])

In [130]:
features['spacegroup'] = features['spacegroup'].astype(str)

In [178]:
process_features = features.copy()

for col in process_features.dtypes[process_features.dtypes == 'object'].index:
    for_dummy = process_features.pop(col)
    process_features = pd.concat([process_features,pd.get_dummies(for_dummy,prefix = col)],axis = 1)

numeric_features = features.loc[:,['number_of_total_atoms', 'percent_atom_al',
       'percent_atom_ga', 'percent_atom_in', 'lattice_vector_1_ang',
       'lattice_vector_2_ang', 'lattice_vector_3_ang',
       'lattice_angle_alpha_degree', 'lattice_angle_beta_degree',
       'lattice_angle_gamma_degree']]
numeric_features_standardized = (numeric_features - numeric_features.mean())/numeric_features.std()

#process_features.update(numeric_features_standardized)

In [179]:
train_features = process_features.loc['train'].drop('id', axis=1).select_dtypes(include=[np.number]).values
test_features = process_features.loc['test'].drop('id', axis=1).select_dtypes(include=[np.number]).values

In [180]:
x_train, x_test, y_train, y_test = train_test_split(train_features,train_labels,test_size = 0.1,random_state = 50)

In [181]:
BenchMark = linear_model.Lasso(0.1).fit(x_train,y_train)
train_test(BenchMark,x_train,x_test,y_train,y_test)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
R2: -7.827363851992125
RMSE: 0.09274980760222595
RMSLE: 0.07719163568965441
Test
R2: -7.638773161776379
RMSE: 0.09257861982364421
RMSLE: 0.07740985644178801


In [193]:
RF = ensemble.GradientBoostingRegressor(n_estimators=1000, learning_rate=0.01, max_depth=5, max_features='sqrt',
                                               min_samples_leaf=15, min_samples_split=10, loss='ls').fit(x_train,y_train)
train_test(RF,x_train,x_test,y_train,y_test)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=5,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=15,
             min_samples_split=10, min_weight_fraction_leaf=0.0,
             n_estimators=1000, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)
R2: 0.8875809133935136
RMSE: 0.032538603711023475
RMSLE: 0.02578214855706344
Test
R2: 0.8121793980834351
RMSE: 0.041001079355878364
RMSLE: 0.03299590714808735


In [191]:
# param_grid = {'n_estimators':[100,500,1000,5000,10000],'max_depth':[1,5,10,20,25],'learning_rate':[1,0.1,0.01,0.01]}
# RFs = GridSearchCV(estimator = RF,param_grid = param_grid,scoring='neg_mean_squared_error')
# RFs.fit(x_train,y_train)
# RFs.best_params_

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=5,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=15,
             min_samples_split=10, min_weight_fraction_leaf=0.0,
             n_estimators=1000, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [100, 500, 1000, 5000, 10000], 'max_depth': [1, 5, 10, 20, 25], 'learning_rate': [1, 0.1, 0.01, 0.01]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [194]:
x_train, x_test, y_train, y_test = train_test_split(train_features,bandgap_labels,test_size = 0.1,random_state = 50)

In [226]:
RFb = ensemble.GradientBoostingRegressor(alpha = 0.5, n_estimators=1000, learning_rate=0.01, max_depth=5, max_features='sqrt',
                                               min_samples_leaf=15, min_samples_split=10, loss='ls').fit(x_train,y_train)
train_test(RFb,x_train,x_test,y_train,y_test)

GradientBoostingRegressor(alpha=0.5, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=5,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=15,
             min_samples_split=10, min_weight_fraction_leaf=0.0,
             n_estimators=2000, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)
R2: 0.9779049237426481
RMSE: 0.1475422736125267
RMSLE: 0.05984173690263628
Test
R2: 0.9437146620950401
RMSE: 0.22905332155988833
RMSLE: 0.09108974043026252


In [227]:
param_grid = {'n_estimators':[500,1000,2000],'alpha':[0.5,0.7,0.9],max_depth = [3,5,7]}
RFs = GridSearchCV(estimator = RFb,param_grid = param_grid,scoring='neg_mean_squared_error')
RFs.fit(x_train,y_train)
RFs.best_params_

SyntaxError: invalid syntax (<ipython-input-227-867bf3f310b8>, line 1)

In [209]:
RF_model = RF.fit(train_features,train_labels)
ans_labels = RF_model.predict(test_features)
ans_labels[ans_labels < 0 ] = 0

In [210]:
RFb_model = RFb.fit(train_features,bandgap_labels)
bandgap_labels = RFb_model.predict(test_features)
bandgap_labels[bandgap_labels < 0] = 0

In [211]:
pd.DataFrame({'id': test.id, 'formation_energy_ev_natom': ans_labels,'bandgap_energy_ev':bandgap_labels}).to_csv('result.csv', index = False)    