In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [26]:
from sklearn import ensemble, tree, linear_model, svm
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.utils import shuffle

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
def get_score(prediction, labels):
    print("R2: {}".format(r2_score(prediction,labels)))
    print('RMSE: {}'.format(np.sqrt(mean_squared_error(prediction, labels))))
    print("RMSLE: {}".format(np.sqrt(np.square(np.log(prediction + 1) - np.log(labels + 1)).mean())))

def train_test(estimator, x_train, x_test, y_train, y_test):
    prediction_train = estimator.predict(x_train)
    print(estimator)
    get_score(prediction_train,y_train)
    
    prediction_test = estimator.predict(x_test)
    print("Test")
    get_score(prediction_test,y_test)

In [5]:
train_data = train.copy()
train_labels = train_data.pop('formation_energy_ev_natom')
bandgap_labels = train_data.pop('bandgap_energy_ev')

features = pd.concat([train_data, test], keys = ['train','test'])

In [6]:
features['spacegroup'] = features['spacegroup'].astype(str)

In [7]:
process_features = features.copy()

for col in process_features.dtypes[process_features.dtypes == 'object'].index:
    for_dummy = process_features.pop(col)
    process_features = pd.concat([process_features,pd.get_dummies(for_dummy,prefix = col)],axis = 1)

numeric_features = features.loc[:,['number_of_total_atoms', 'percent_atom_al',
       'percent_atom_ga', 'percent_atom_in', 'lattice_vector_1_ang',
       'lattice_vector_2_ang', 'lattice_vector_3_ang',
       'lattice_angle_alpha_degree', 'lattice_angle_beta_degree',
       'lattice_angle_gamma_degree']]
numeric_features_standardized = (numeric_features - numeric_features.mean())/numeric_features.std()

#process_features.update(numeric_features_standardized)

In [8]:
train_features = process_features.loc['train'].drop('id', axis=1).select_dtypes(include=[np.number]).values
test_features = process_features.loc['test'].drop('id', axis=1).select_dtypes(include=[np.number]).values

In [55]:
x_train, x_test, y_train, y_test = train_test_split(train_features,train_labels,test_size = 0.1,random_state = 50)

In [56]:
RF = ensemble.GradientBoostingRegressor(alpha = 0.9, n_estimators=1000, learning_rate=0.01, max_depth=5, max_features='sqrt',
                                               min_samples_leaf=15, min_samples_split=10, loss='ls').fit(x_train,y_train)
train_test(RF,x_train,x_test,y_train,y_test)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=5,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=15, min_samples_split=10,
             min_weight_fraction_leaf=0.0, n_estimators=1000,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)
R2: 0.8881046813832899
RMSE: 0.0324682528285982
RMSLE: 0.025724706037962744
Test
R2: 0.8103864529808296
RMSE: 0.04119242203416187
RMSLE: 0.033124506666555725


In [191]:
# param_grid = {'n_estimators':[100,500,1000,5000,10000],'max_depth':[1,5,10,20,25],'learning_rate':[1,0.1,0.01,0.01]}
# RFs = GridSearchCV(estimator = RF,param_grid = param_grid,scoring='neg_mean_squared_error')
# RFs.fit(x_train,y_train)
# RFs.best_params_

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=5,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=15,
             min_samples_split=10, min_weight_fraction_leaf=0.0,
             n_estimators=1000, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [100, 500, 1000, 5000, 10000], 'max_depth': [1, 5, 10, 20, 25], 'learning_rate': [1, 0.1, 0.01, 0.01]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [61]:
new_train_features = np.concatenate((train_features,np.array([train_labels]).T),axis = 1)
x_train, x_test, y_train, y_test = train_test_split(new_train_features,bandgap_labels,test_size = 0.1,random_state = 50)

In [62]:
RFb = ensemble.GradientBoostingRegressor(alpha = 0.9, n_estimators=1000, learning_rate=0.01, max_depth=5, max_features='sqrt',
                                               min_samples_leaf=15, min_samples_split=10, loss='ls').fit(x_train,y_train)
train_test(RFb,x_train,x_test,y_train,y_test)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=5,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=15, min_samples_split=10,
             min_weight_fraction_leaf=0.0, n_estimators=1000,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)
R2: 0.9826910183289524
RMSE: 0.13081040222772028
RMSLE: 0.053387586742498126
Test
R2: 0.9612110786137484
RMSE: 0.19118835059370004
RMSLE: 0.07588509516235904


In [63]:
SVR = svm.SVR(C = 50.0).fit(x_train,y_train)
train_test(SVR,x_train,x_test,y_train,y_test)

SVR(C=50.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
R2: 0.9701894155656449
RMSE: 0.1716588460239592
RMSLE: 0.06864588168516995
Test
R2: 0.9556059941941366
RMSE: 0.2059955899039201
RMSLE: 0.0798838855147776


In [228]:
# param_grid = {'n_estimators':[500,1000,2000],'alpha':[0.5,0.7,0.9],"max_depth":[3,5,7]}
# RFs = GridSearchCV(estimator = RFb,param_grid = param_grid,scoring='neg_mean_squared_error')
# RFs.fit(x_train,y_train)
# RFs.best_params_

{'alpha': 0.9, 'max_depth': 3, 'n_estimators': 2000}

In [64]:
RF_model = RF.fit(train_features,train_labels)
ans_labels = RF_model.predict(test_features)
ans_labels[ans_labels < 0 ] = 0

In [66]:
RFb_model = RFb.fit(train_features,bandgap_labels)
SVR_model = SVR.fit(train_features,bandgap_labels)
ans_bandgap_labels = (RFb_model.predict(test_features)+SVR_model.predict(test_features))/2
ans_bandgap_labels[ans_bandgap_labels < 0] = 0

In [68]:
pd.DataFrame({'id': test.id, 'formation_energy_ev_natom': ans_labels,'bandgap_energy_ev':ans_bandgap_labels}).to_csv('result.csv', index = False)    