In [3]:
#https://github.com/wolverton-research-group/qmpy/blob/master/qmpy/data/thermodata/ssub.dat
import pandas as pd
df=pd.read_csv('ssub.dat',sep=' ',header=None)
df.columns=['formula','formula_energy']

In [4]:
df

Unnamed: 0,formula,formula_energy
0,AlKO6Si2,-3.144727e+00
1,Al2H4O9Si2,-2.500142e+00
2,Al2H4O9Si2,-2.499380e+00
3,AlHO2,-2.589914e+00
4,AlHO2,-2.566077e+00
...,...,...
2085,AsS,-1.763221e-01
2086,As7Re3,-9.887016e-02
2087,As8Ni11,-4.222273e-01
2088,Au,1.605726e-09


In [5]:
#https://stackoverflow.com/questions/12497402/remove-duplicates-by-columns-a-keeping-the-row-with-the-highest-value-in-column
df=df.sort_values('formula_energy', ascending=True).drop_duplicates('formula').sort_index()
df=df.sample(frac=1)
df['id']=df.index

In [6]:
df

Unnamed: 0,formula,formula_energy,id
465,CaO4W,-2.841796e+00,465
1197,Ho,1.766051e-09,1197
1759,PbTe,-3.555857e-01,1759
1454,MoS2,-9.540103e-01,1454
218,BeH2,-6.564038e-02,218
...,...,...,...
1482,NZr,-1.892842e+00,1482
773,CrLiO2,-2.404511e+00,773
251,BiNa3,-4.946426e-01,251
328,Br4Ge,-7.213532e-01,328


In [7]:
df.to_csv('ssub.csv',index=False)

In [8]:
from jarvis.db.jsonutils import dumpjson
#for jarvis-tools
mem=[]
for i,ii in df.iterrows():
    info={}
    info['id']=ii['id']
    info['formula_energy']=ii['formula_energy']
    info['formula']=ii['formula']
    mem.append(info)
dumpjson(filename='ssub.json',data=mem)

In [9]:
test_size = 0.2
n_train = int(len(df)*(1-test_size))
n_test = int(len(df)*test_size)
#for leaderboard
mem={}
train={}
test={}
train_df = df[:n_train]
test_df=df[-n_test:]
for i,ii in train_df.iterrows():
    train[str(ii['id'])]=ii['formula_energy']
for i,ii in test_df.iterrows():
    test[str(ii['id'])]=ii['formula_energy']
mem['train']=train
mem['test']=test
fname='ssub_formula_energy.json'
dumpjson(data=mem,filename=fname)

In [10]:
len(train_df)

1381

In [11]:
from jarvis.ai.descriptors.elemental import get_element_fraction_desc
from jarvis.ai.descriptors.cfid import CFID,get_chem_only_descriptors
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

lgbm = lgb.LGBMRegressor(
    # device="gpu",
    n_estimators=1170,
    learning_rate=0.15,
    num_leaves=273,
)

X_train=[]
y_train=[]
X_test=[]
y_test=[]
train_ids=[]
test_ids=[]
for i,ii in train_df.iterrows():
    desc=get_element_fraction_desc(ii['formula'])
    #desc=get_chem_only_descriptors(ii['composition'])[0]
    X_train.append(desc)
    y_train.append(ii['formula_energy'])
    train_ids.append(ii['id'])

    
for i,ii in test_df.iterrows():
    desc=get_element_fraction_desc(ii['formula'])
    #desc=get_chem_only_descriptors(ii['composition'])[0]
    X_test.append(desc)
    y_test.append(ii['formula_energy'])
    test_ids.append(ii['id'])
    
X_train=np.array(X_train,dtype='float')
y_train=np.array(y_train,dtype='float')
X_test=np.array(X_test,dtype='float')
y_test=np.array(y_test,dtype='float')
lgbm.fit(X_train,y_train)
pred=lgbm.predict(X_test)
print (mean_absolute_error(y_test,pred))

0.38072016830809796


In [12]:
f=open('SinglePropertyPrediction-test-formula_energy-ssub-AI-mae.csv','w')
f.write('id,target,prediction\n')
for i,j,k in zip(test_ids,y_test,pred):
    line=str(i)+','+str(j)+','+str(k)+'\n'
    f.write(line)
f.close()