In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import copy
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import statsmodels.api as sm
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [2]:
data = pd.read_csv("Datasets/earthquake_data.csv")
print(data.columns)

Index(['ISO', 'Country', 'latitude', 'longitude', 'depth', 'mag', 'year',
       'month', 'day', 'Total affected', 'Total deaths',
       'Total damage ('000 US$)', '2017'],
      dtype='object')


In [3]:
data = data[['ISO','latitude','longitude','depth','mag','year','month','day',
                   'Total affected', 'Total deaths',"Total damage ('000 US$)",'2017']]
data.head(3)

Unnamed: 0,ISO,latitude,longitude,depth,mag,year,month,day,Total affected,Total deaths,Total damage ('000 US$),2017
0,AFG,35.169,69.389,62.1,5.0,2001.0,6.0,1.0,270,4,0,0.498
1,AFG,36.429,70.438,209.0,6.3,2002.0,3.0,3.0,3513,150,0,0.498
2,AFG,33.426,69.524,10.0,5.2,2004.0,7.0,18.0,1040,2,0,0.498


In [4]:
X = data[data.columns.difference(['ISO','Total deaths',"Total damage ('000 US$)"])]
y = data['Total deaths'].apply(lambda x: math.log(0.0001+x))

In [5]:
#X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [6]:
#regressor = LinearRegression()
#regressor.fit(X_train,y_train)

In [7]:
model = sm.OLS(y,X).fit()
model.summary()

0,1,2,3
Dep. Variable:,Total deaths,R-squared:,0.289
Model:,OLS,Adj. R-squared:,0.276
Method:,Least Squares,F-statistic:,21.7
Date:,"Wed, 01 Jan 2020",Prob (F-statistic):,6.83e-31
Time:,19:01:59,Log-Likelihood:,-1486.7
No. Observations:,489,AIC:,2991.0
Df Residuals:,480,BIC:,3029.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
2017,-12.5362,2.333,-5.373,0.000,-17.120,-7.952
Total affected,3.151e-07,1.09e-07,2.894,0.004,1.01e-07,5.29e-07
day,0.0128,0.027,0.476,0.634,-0.040,0.066
depth,-0.0243,0.007,-3.541,0.000,-0.038,-0.011
latitude,0.0089,0.013,0.714,0.476,-0.016,0.034
longitude,-0.0028,0.003,-0.861,0.389,-0.009,0.004
mag,2.4935,0.258,9.671,0.000,1.987,3.000
month,-0.0182,0.066,-0.273,0.785,-0.149,0.112
year,-0.0036,0.001,-2.999,0.003,-0.006,-0.001

0,1,2,3
Omnibus:,125.47,Durbin-Watson:,1.936
Prob(Omnibus):,0.0,Jarque-Bera (JB):,35.143
Skew:,-0.408,Prob(JB):,2.34e-08
Kurtosis:,1.972,Cond. No.,21800000.0


## Recursive Feature Elimination #1

In [8]:
X = X[X.columns.difference(['month','day','latitude','longitude'])]

In [9]:
model2 = sm.OLS(y,X).fit()
model2.summary()

0,1,2,3
Dep. Variable:,Total deaths,R-squared:,0.287
Model:,OLS,Adj. R-squared:,0.28
Method:,Least Squares,F-statistic:,38.96
Date:,"Wed, 01 Jan 2020",Prob (F-statistic):,1.27e-33
Time:,19:01:59,Log-Likelihood:,-1487.4
No. Observations:,489,AIC:,2985.0
Df Residuals:,484,BIC:,3006.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
2017,-11.7543,2.216,-5.304,0.000,-16.109,-7.400
Total affected,3.184e-07,1.08e-07,2.944,0.003,1.06e-07,5.31e-07
depth,-0.0229,0.007,-3.401,0.001,-0.036,-0.010
mag,2.4496,0.237,10.326,0.000,1.983,2.916
year,-0.0037,0.001,-3.218,0.001,-0.006,-0.001

0,1,2,3
Omnibus:,114.222,Durbin-Watson:,1.943
Prob(Omnibus):,0.0,Jarque-Bera (JB):,34.387
Skew:,-0.41,Prob(JB):,3.41e-08
Kurtosis:,1.993,Cond. No.,20800000.0


In [10]:
y_pred = model2.predict(X)

In [11]:
results = pd.DataFrame({'Actual':math.e*y, 'Predicted':math.e*y_pred})
results['Predicted'].astype(int)
results
#neg_index = results[results['Predicted']<=0].index
#results.iloc[neg_index,] = 0

Unnamed: 0,Actual,Predicted
0,3.768,-6.620
1,13.620,-7.130
2,1.884,-2.070
3,0.000,12.903
4,0.000,-0.935
5,6.518,0.347
6,21.001,0.471
7,5.973,-6.120
8,-25.036,-14.911
9,-25.036,-14.911
