In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn import ensemble, tree, linear_model, svm, neural_network
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import r2_score,mean_squared_error

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
train.columns

Index(['id', 'spacegroup', 'number_of_total_atoms', 'percent_atom_al',
       'percent_atom_ga', 'percent_atom_in', 'lattice_vector_1_ang',
       'lattice_vector_2_ang', 'lattice_vector_3_ang',
       'lattice_angle_alpha_degree', 'lattice_angle_beta_degree',
       'lattice_angle_gamma_degree', 'formation_energy_ev_natom',
       'bandgap_energy_ev'],
      dtype='object')

In [5]:
def get_score(prediction, labels):
    print("R2: {}".format(r2_score(prediction,labels)))
    print('RMSE: {}'.format(np.sqrt(mean_squared_error(prediction, labels))))
    print("RMSLE: {}".format(np.sqrt(np.square(np.log(prediction + 1) - np.log(labels + 1)).mean())))

def train_test(estimator, x_train, x_test, y_train, y_test):
    prediction_train = estimator.predict(x_train)
    print(estimator)
    get_score(prediction_train,y_train)
    
    prediction_test = estimator.predict(x_test)
    print("Test")
    get_score(prediction_test,y_test)

In [63]:
train_data = train.copy()
train_labels = train_data.pop('formation_energy_ev_natom')
bandgap_labels = train_data.pop('bandgap_energy_ev')

features = pd.concat([train_data, test], keys = ['train','test'])

In [64]:
features['spacegroup'] = features['spacegroup'].astype(str)
# features['lattice_vector_1_ang'] = 1 / features['lattice_vector_1_ang']
# features['lattice_vector_2_ang'] = 1 / features['lattice_vector_2_ang']
# features['lattice_vector_3_ang'] = 1 / features['lattice_vector_3_ang']
# features['lattice_angle_alpha_degree'] = np.cos(features['lattice_angle_alpha_degree'])
# features['lattice_angle_beta_degree'] = np.cos(features['lattice_angle_beta_degree'])
# features['lattice_angle_gamma_degree'] = np.cos(features['lattice_angle_gamma_degree'])

In [65]:
process_features = features.copy()

for col in process_features.dtypes[process_features.dtypes == 'object'].index:
    for_dummy = process_features.pop(col)
    process_features = pd.concat([process_features,pd.get_dummies(for_dummy,prefix = col)],axis = 1)

numeric_features = features.loc[:,['number_of_total_atoms', 'percent_atom_al',
       'percent_atom_ga', 'percent_atom_in', 'lattice_vector_1_ang',
       'lattice_vector_2_ang', 'lattice_vector_3_ang',
       'lattice_angle_alpha_degree', 'lattice_angle_beta_degree',
       'lattice_angle_gamma_degree']]
numeric_features_standardized = (numeric_features - numeric_features.mean())/numeric_features.std()

process_features.update(numeric_features_standardized)

In [66]:
features.spacegroup.unique()

array(['33', '194', '227', '167', '206', '12'], dtype=object)

In [67]:
process_features = features.copy()
process_features.update(numeric_features_standardized)
s33 = process_features['spacegroup'] == '227'
process_features = process_features[s33].drop('spacegroup',axis = 1)
s33 = np.array(s33)
train_labels = train_labels[s33[:2400]]
bandgap_labels = bandgap_labels[s33[:2400]]

In [68]:
train_features = process_features.loc['train'].drop('id', axis=1).select_dtypes(include=[np.number]).values
test_features = process_features.loc['test'].drop('id', axis=1).select_dtypes(include=[np.number]).values

In [69]:
x_train, x_test, y_train, y_test = train_test_split(train_features,train_labels,test_size = 0.1,random_state = 50)

In [70]:
GB = ensemble.GradientBoostingRegressor(alpha = 0.9, n_estimators=1000, learning_rate=0.01, max_depth=5, max_features='sqrt',
                                               min_samples_leaf=15, min_samples_split=10, loss='ls').fit(x_train,y_train)
train_test(GB,x_train,x_test,y_train,y_test)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=5,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=15, min_samples_split=10,
             min_weight_fraction_leaf=0.0, n_estimators=1000,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)
R2: 0.7766099678450804
RMSE: 0.0406163655714419
RMSLE: 0.03146478072221556
Test
R2: 0.5458467789540637
RMSE: 0.05061660859982347
RMSLE: 0.03976410775345464


In [26]:
param_grid = {'n_estimators':[500,1000,2000],'max_depth':[5,10,20],'alpha':[0.5,0.7,0.9]}
RFs = GridSearchCV(estimator = GB,param_grid = param_grid,scoring='neg_mean_squared_error')
RFs.fit(x_train,y_train)
RFs.best_params_

{'alpha': 0.7, 'max_depth': 5, 'n_estimators': 500}

In [71]:
#new_train_features = np.concatenate((train_features,np.array([train_labels]).T),axis = 1)
new_train_features = train_features
x_train, x_test, y_train, y_test = train_test_split(new_train_features,bandgap_labels,test_size = 0.1,random_state = 50)

In [77]:
GBb = ensemble.GradientBoostingRegressor(alpha = 0.9, n_estimators=1000, learning_rate=0.01, max_depth=5, max_features='sqrt',
                                               min_samples_leaf=15, min_samples_split=10, loss='ls').fit(x_train,y_train)
train_test(GBb,x_train,x_test,y_train,y_test)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=5,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=15, min_samples_split=10,
             min_weight_fraction_leaf=0.0, n_estimators=1000,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)
R2: 0.8904170436570508
RMSE: 0.24371976840240797
RMSLE: 0.11196456196254775
Test
R2: 0.6911819324888169
RMSE: 0.40775962366667284
RMSLE: 0.17505468358124765


In [59]:
param_grid = {'alpha':[0.3,0.5,0.7,0.9],'n_estimators':[500,700,1000,2000]}
RFs = GridSearchCV(estimator = GBb,param_grid = param_grid,scoring='neg_mean_squared_error')
RFs.fit(x_train,y_train)
RFs.best_params_

{'alpha': 0.5, 'n_estimators': 500}

In [73]:
SVR = svm.SVR(C = 50.0, kernel = 'rbf').fit(x_train,y_train)
train_test(SVR,x_train,x_test,y_train,y_test)

SVR(C=50.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
R2: 0.7809201955201973
RMSE: 0.3483040235430147
RMSLE: 0.15587883599421154
Test
R2: 0.7583670026446518
RMSE: 0.3764829567971585
RMSLE: 0.1684983704067724


In [74]:
RF = ensemble.RandomForestRegressor(n_estimators = 500,n_jobs = -1).fit(x_train,y_train)
train_test(RF,x_train,x_test,y_train,y_test)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
R2: 0.9252650048773761
RMSE: 0.20402335789382883
RMSLE: 0.09305574348937486
Test
R2: 0.652086246753931
RMSE: 0.4344044638438465
RMSLE: 0.1864158067095383


In [22]:
param_grid = {'n_estimators':[500,1000,2000]}
RFs = GridSearchCV(estimator = RF,param_grid = param_grid,scoring='neg_mean_squared_error')
RFs.fit(x_train,y_train)
RFs.best_params_

{'n_estimators': 500}

In [117]:
GB_model = GB.fit(train_features,train_labels)
ans_labels = GB_model.predict(test_features)
ans_labels[ans_labels < 0 ] = 0

In [118]:
GBb_model = GBb.fit(train_features,bandgap_labels)
SVR_model = SVR.fit(train_features,bandgap_labels)
RF_model = RF.fit(train_features,bandgap_labels)
ans_bandgap_labels = (GBb_model.predict(test_features)+SVR_model.predict(test_features))/2
ans_bandgap_labels[ans_bandgap_labels < 0] = 0

In [119]:
pd.DataFrame({'id': test.id, 'formation_energy_ev_natom': ans_labels,'bandgap_energy_ev':ans_bandgap_labels}).to_csv('result.csv', index = False)    