In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.model_selection import KFold
from sklearn import model_selection
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn import linear_model
from sklearn.linear_model import LinearRegression

In [2]:
train = pd.read_csv('../data/train/train.csv')
test = pd.read_csv('../data/test/test.csv')
print(train.shape,test.shape)

(2400, 14) (600, 12)


In [3]:
def rmsle(m,n):
    return np.sqrt(np.square(np.log(m+1)-np.log(n+1)).mean())

In [4]:
test.head()

Unnamed: 0,id,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree
0,1,33,80.0,0.1875,0.4688,0.3438,10.5381,9.0141,9.6361,89.9997,90.0003,90.0006
1,2,33,80.0,0.75,0.25,0.0,9.8938,8.5014,9.1298,90.0038,90.0023,90.0015
2,3,167,30.0,0.6667,0.1667,0.1667,4.9811,4.9808,13.4799,89.99,90.0109,120.0014
3,4,12,80.0,0.5625,0.4375,0.0,24.337,6.0091,5.762,89.9995,103.8581,90.0002
4,5,12,80.0,0.1875,0.5,0.3125,24.6443,6.2906,6.1589,90.0,104.5929,90.0001


In [5]:
train['alpha_rad'] = np.radians(train['lattice_angle_alpha_degree'])
train['beta_rad'] = np.radians(train['lattice_angle_beta_degree'])
train['gamma_rad'] = np.radians(train['lattice_angle_gamma_degree'])

test['alpha_rad'] = np.radians(test['lattice_angle_alpha_degree'])
test['beta_rad'] = np.radians(test['lattice_angle_beta_degree'])
test['gamma_rad'] = np.radians(test['lattice_angle_gamma_degree'])

def density(df):
    volume = df['lattice_vector_1_ang']*df['lattice_vector_2_ang']*df['lattice_vector_3_ang']*np.sqrt(1 + 2*np.cos(df['alpha_rad'])*np.cos(df['beta_rad'])*np.cos(df['gamma_rad'])-np.cos(df['alpha_rad'])**2-np.cos(df['beta_rad'])**2-np.cos(df['gamma_rad'])**2)
    density = df['number_of_total_atoms']/(df['lattice_vector_1_ang']*df['lattice_vector_2_ang']*df['lattice_vector_3_ang']*np.sqrt(1 + 2*np.cos(df['alpha_rad'])*np.cos(df['beta_rad'])*np.cos(df['gamma_rad'])-np.cos(df['alpha_rad'])**2-np.cos(df['beta_rad'])**2-np.cos(df['gamma_rad'])**2))
    df['volumn'] = volume
    df['density'] = density

In [6]:
v = np.vstack((train[['lattice_vector_1_ang','lattice_vector_2_ang','lattice_vector_3_ang']].values,test[['lattice_vector_1_ang','lattice_vector_2_ang','lattice_vector_3_ang']].values))
pca = PCA().fit(v)
train['PCA'] = pca.transform(train[['lattice_vector_1_ang','lattice_vector_2_ang','lattice_vector_3_ang']])[:, 0]
test['PCA'] = pca.transform(test[['lattice_vector_1_ang','lattice_vector_2_ang','lattice_vector_3_ang']])[:, 0]

In [7]:
y = ['formation_energy_ev_natom','bandgap_energy_ev']
density(train)
density(test)
x1 = train.drop(['id','number_of_total_atoms']+y,axis=1)
y1 = train[y]
x2 = test.drop(['id','number_of_total_atoms'],axis=1)
print(x1.shape)
x1.head()

(2400, 16)


Unnamed: 0,spacegroup,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree,alpha_rad,beta_rad,gamma_rad,PCA,volumn,density
0,33,0.625,0.375,0.0,9.9523,8.5513,9.1775,90.0026,90.0023,90.0017,1.570842,1.570836,1.570826,2.303089,781.052081,0.102426
1,194,0.625,0.375,0.0,6.184,6.1838,23.6287,90.0186,89.998,120.0025,1.571121,1.570761,2.094439,-10.413997,782.50011,0.102236
2,227,0.8125,0.1875,0.0,9.751,5.6595,13.963,90.9688,91.1228,30.5185,1.587705,1.590393,0.532648,-1.223099,391.227531,0.102242
3,167,0.75,0.0,0.25,5.0036,5.0034,13.5318,89.9888,90.0119,120.0017,1.570601,1.571004,2.094425,-4.410966,293.377334,0.102257
4,194,0.0,0.625,0.375,6.6614,6.6612,24.5813,89.996,90.0006,119.9893,1.570727,1.570807,2.094208,-10.70024,944.713843,0.084682


In [8]:
kf = KFold(n_splits=10,shuffle=True,random_state=30)
predict1 = np.zeros((x2.shape[0],kf.n_splits))
predict2 = np.zeros((x2.shape[0],kf.n_splits))
lr = linear_model.RidgeCV(alphas=[0.1,1.0,10.0])

In [9]:
cv = []
t1 = []
t2 = []
te = []
for train_index,valid_index in kf.split(x1):
    xtrain,xvalid = x1.loc[train_index],x1.loc[valid_index]
    ytrain,yvalid = y1.loc[train_index],y1.loc[valid_index]
    lr.fit(xtrain,ytrain[y[0]])
    predict1 = lr.predict(x2)
    p1 = lr.predict(xtrain)
    predicty = lr.predict(xvalid)
    er1 = rmsle(ytrain[y[0]],p1)    
    error = rmsle(yvalid[y[0]],predicty)
    cv.append(error)
    
    xtrain,xvalid = x1.loc[train_index],x1.loc[valid_index]
    ytrain,yvalid = y1.loc[train_index],y1.loc[valid_index]
    lr.fit(xtrain,ytrain[y[1]])
    predict2 = lr.predict(x2)
    p2 = lr.predict(xtrain)
    er2 = rmsle(ytrain[y[1]],p2)
    predicty = lr.predict(xvalid)
    error = rmsle(yvalid[y[1]],predicty)
    cv.append(error)
    te.append(er2)


In [10]:
t = np.mean(te)
print(t)

0.16817905271504552


In [11]:
e1 = np.mean(cv[0])
e2 = np.mean(cv[1])
print(e1,e2)

0.06441856840816385 0.1712376585410556


In [15]:
pre = np.zeros((x2.shape[0],len(y)))
pre[:,0],pre[:,1] = predict1.mean(axis=1),predict2.mean(axis=1)
submit = pd.DataFrame({'id':test['id'],'formation_energy_ev_natom':pre[:,0],'bandgap_energy_ev':pre[:,1]})
submit.to_csv('lr.csv',index=False)