## 1. Import necessary libraries ##

In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import chemparse as cp
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

## 2. Preprocessing ##

* Import data

In [12]:
# Read the dataset from the CSV file (path is relative to the notebook's working directory).
raw_data = pd.read_csv(filepath_or_buffer='data/lithium-ion batteries.csv')

In [13]:
# Preview the first rows of the freshly loaded table.
raw_data.head()

Unnamed: 0,Materials Id,Formula,Spacegroup,Formation Energy (eV),E Above Hull (eV),Band Gap (eV),Nsites,Density (gm/cc),Volume,Has Bandstructure,Crystal System
0,mp-849394,Li2MnSiO4,Pc,-2.699,0.006,3.462,16,2.993,178.513,True,monoclinic
1,mp-783909,Li2MnSiO4,P21/c,-2.696,0.008,2.879,32,2.926,365.272,True,monoclinic
2,mp-761311,Li4MnSi2O7,Cc,-2.775,0.012,3.653,28,2.761,301.775,True,monoclinic
3,mp-761598,Li4Mn2Si3O10,C2/c,-2.783,0.013,3.015,38,2.908,436.183,True,monoclinic
4,mp-767709,Li2Mn3Si3O10,C2/c,-2.747,0.016,2.578,36,3.334,421.286,True,monoclinic


* Drop unused columns and label-encode the crystal system (each class becomes an integer code)

In [14]:
# Work on a copy without the identifier and space-group columns; raw_data is left untouched.
data = raw_data.drop(columns=["Materials Id", "Spacegroup"])

In [15]:
# Label-encode the target: every distinct crystal system gets an integer code,
# numbered in order of first appearance in the column.
chrystal_sys_list = data["Crystal System"].unique()
crystal_system_codes = {name: code for code, name in enumerate(chrystal_sys_list)}
for name, code in crystal_system_codes.items():
    print("Encode {} as {}".format(name, code))

# Assign the mapped column back explicitly. The previous
# `data["Crystal System"].replace(..., inplace=True)` acted on a column
# selection, which raises a FutureWarning on recent pandas and no longer
# modifies `data` once Copy-on-Write is enabled; it also relied on the
# deprecated silent object -> int downcast to end up with an integer target.
data["Crystal System"] = data["Crystal System"].map(crystal_system_codes)
data.head()

Encode monoclinic as 0
Encode orthorhombic as 1
Encode triclinic as 2


Unnamed: 0,Formula,Formation Energy (eV),E Above Hull (eV),Band Gap (eV),Nsites,Density (gm/cc),Volume,Has Bandstructure,Crystal System
0,Li2MnSiO4,-2.699,0.006,3.462,16,2.993,178.513,True,0
1,Li2MnSiO4,-2.696,0.008,2.879,32,2.926,365.272,True,0
2,Li4MnSi2O7,-2.775,0.012,3.653,28,2.761,301.775,True,0
3,Li4Mn2Si3O10,-2.783,0.013,3.015,38,2.908,436.183,True,0
4,Li2Mn3Si3O10,-2.747,0.016,2.578,36,3.334,421.286,True,0


In [16]:
# Parse every formula string with chemparse, then spread the parsed results
# into one column per element; elements absent from a formula are filled with 0.
parsed_formulas = data["Formula"].apply(cp.parse_formula)
nn = pd.json_normalize(parsed_formulas).fillna(0)
nn.head()

Unnamed: 0,Li,Mn,Si,O,Fe,Co
0,2.0,1.0,1.0,4.0,0.0,0.0
1,2.0,1.0,1.0,4.0,0.0,0.0
2,4.0,1.0,2.0,7.0,0.0,0.0
3,4.0,2.0,3.0,10.0,0.0,0.0
4,2.0,3.0,3.0,10.0,0.0,0.0


In [17]:
# Append the per-element count columns to the feature table (aligned on the row index).
# Any element columns left over from a previous execution of this cell are dropped
# first, so re-running the cell does not fail with an "columns overlap" error.
data = data.drop(columns=nn.columns, errors="ignore").join(nn)
data.head()

Unnamed: 0,Formula,Formation Energy (eV),E Above Hull (eV),Band Gap (eV),Nsites,Density (gm/cc),Volume,Has Bandstructure,Crystal System,Li,Mn,Si,O,Fe,Co
0,Li2MnSiO4,-2.699,0.006,3.462,16,2.993,178.513,True,0,2.0,1.0,1.0,4.0,0.0,0.0
1,Li2MnSiO4,-2.696,0.008,2.879,32,2.926,365.272,True,0,2.0,1.0,1.0,4.0,0.0,0.0
2,Li4MnSi2O7,-2.775,0.012,3.653,28,2.761,301.775,True,0,4.0,1.0,2.0,7.0,0.0,0.0
3,Li4Mn2Si3O10,-2.783,0.013,3.015,38,2.908,436.183,True,0,4.0,2.0,3.0,10.0,0.0,0.0
4,Li2Mn3Si3O10,-2.747,0.016,2.578,36,3.334,421.286,True,0,2.0,3.0,3.0,10.0,0.0,0.0


## 3. Gradient Boosting ##

In [18]:
# Keep the formula strings in their own variable; they are excluded from the feature matrix.
chem_formula = data["Formula"]
chem_formula

0            Li2MnSiO4
1            Li2MnSiO4
2           Li4MnSi2O7
3         Li4Mn2Si3O10
4         Li2Mn3Si3O10
            ...       
334       Li6Co(SiO4)2
335       LiCo3(SiO4)2
336    Li5Co4(Si3O10)2
337           LiCoSiO4
338      Li3Co2(SiO4)2
Name: Formula, Length: 339, dtype: object

In [19]:
# Feature matrix: every column except the target label and the raw formula string.
X = data.drop(columns=["Crystal System", "Formula"])
X

Unnamed: 0,Formation Energy (eV),E Above Hull (eV),Band Gap (eV),Nsites,Density (gm/cc),Volume,Has Bandstructure,Li,Mn,Si,O,Fe,Co
0,-2.699,0.006,3.462,16,2.993,178.513,True,2.0,1.0,1.0,4.0,0.0,0.0
1,-2.696,0.008,2.879,32,2.926,365.272,True,2.0,1.0,1.0,4.0,0.0,0.0
2,-2.775,0.012,3.653,28,2.761,301.775,True,4.0,1.0,2.0,7.0,0.0,0.0
3,-2.783,0.013,3.015,38,2.908,436.183,True,4.0,2.0,3.0,10.0,0.0,0.0
4,-2.747,0.016,2.578,36,3.334,421.286,True,2.0,3.0,3.0,10.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
334,-2.545,0.071,2.685,17,2.753,171.772,True,6.0,0.0,2.0,8.0,0.0,1.0
335,-2.250,0.076,0.005,42,3.318,552.402,True,1.0,0.0,2.0,8.0,0.0,3.0
336,-2.529,0.082,0.176,35,2.940,428.648,True,5.0,0.0,6.0,20.0,0.0,4.0
337,-2.348,0.087,1.333,14,2.451,214.044,True,1.0,0.0,1.0,4.0,0.0,1.0


In [20]:
# Target vector: the integer-encoded crystal system.
y = data["Crystal System"]
y

0      0
1      0
2      0
3      0
4      0
      ..
334    2
335    2
336    2
337    2
338    2
Name: Crystal System, Length: 339, dtype: int64

In [29]:
# Hyperparameter search space: every combination of the values below is evaluated.
grid = {
    'n_estimators': list(range(10, 100, 2)),
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.15, 0.2, 0.25, 0.3, 0.5],
    'subsample': [0.5, 0.7, 1.0],
    'max_depth': [3, 4, 5],
}



In [30]:
# Base estimator with default hyperparameters; the grid search overrides the tuned ones.
# random_state is fixed so that the estimator's own randomness (e.g. the row
# subsampling used when subsample < 1.0) is repeatable between runs, matching
# the seeded CV splitter below.
model = GradientBoostingClassifier(random_state=12)
# 10-fold stratified cross-validation repeated 3 times (30 fits per parameter combination).
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=12)

In [27]:
# Exhaustive search over `grid`, scored by macro-averaged F1 and run on all available cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='f1_macro',
    cv=cv,
    n_jobs=-1,
)
# Fit every parameter combination on the full data set; fit() returns the fitted search object.
grid_result = grid_search.fit(X, y)

In [28]:
# NOTE(review): the saved output of this cell reports parameter values
# (learning_rate 1.0, max_depth 9, n_estimators 500) that are not part of the
# grid defined earlier in this notebook, and the execution counts are out of
# order -- restart the kernel and run all cells to refresh the output.

# Report the winning configuration and its mean cross-validated score.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Then list the mean score, its standard deviation and the parameters of every candidate.
cv_results = grid_result.cv_results_
means, stds, params = (
    cv_results['mean_test_score'],
    cv_results['std_test_score'],
    cv_results['params'],
)
for row in zip(means, stds, params):
    print("%f (%f) with: %r" % row)

Best: 0.709186 using {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 500, 'subsample': 1.0}
0.193841 (0.001812) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.5}
0.193841 (0.001812) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.7}
0.193841 (0.001812) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 1.0}
0.193841 (0.001812) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.5}
0.193841 (0.001812) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.7}
0.193841 (0.001812) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 50, 'subsample': 1.0}
0.193841 (0.001812) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.5}
0.193841 (0.001812) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.7}
0.193841 (0.001812) with: {'learning_rate': 0.0001,