In [1]:
import numpy as np
import pandas as pd 

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings from the
# modelling libraries used below -- consider filtering specific categories.
warnings.filterwarnings('ignore')

In [4]:
# Load the dataset from life.csv (path is relative to the working directory).
df = pd.read_csv('life.csv')

In [5]:
# Show the column names; 'life' is the column used as the target further down.
df.columns

Index(['population', 'fertility', 'HIV', 'CO2', 'BMI_male', 'GDP',
       'BMI_female', 'life', 'child_mortality', 'Region'],
      dtype='object')

In [6]:
# Per-column dtype and non-null counts, to spot text columns and missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139 entries, 0 to 138
Data columns (total 10 columns):
population         139 non-null float64
fertility          139 non-null float64
HIV                139 non-null float64
CO2                139 non-null float64
BMI_male           139 non-null float64
GDP                139 non-null float64
BMI_female         139 non-null float64
life               139 non-null float64
child_mortality    139 non-null float64
Region             139 non-null object
dtypes: float64(9), object(1)
memory usage: 10.9+ KB


In [7]:
# Replace every text-valued column with integer category codes so the frame is
# purely numeric for the regressors used below.
#
# Both the legacy ``object`` dtype and pandas' dedicated string dtype are
# detected: a bare ``dtype == object`` test misses string-dtype columns, which
# would leave 'Region' as text and make the later ``fit`` calls fail.
#
# NOTE(review): the codes impose an arbitrary ordering on a nominal column;
# a linear model will read it as an ordered quantity. One-hot encoding may be
# more appropriate -- confirm with the analysis owner.
# NOTE(review): ``cat.codes`` maps missing values to -1; the info() output
# shows no nulls in this dataset, so that case does not arise here.
for col in df.columns:
    column = df[col]
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        df[col] = column.astype('category').cat.codes

In [8]:
# Preview the first rows; 'Region' now holds integer category codes.
df.head()

Unnamed: 0,population,fertility,HIV,CO2,BMI_male,GDP,BMI_female,life,child_mortality,Region
0,34811059.0,2.73,0.1,3.328945,24.5962,12314.0,129.9049,75.3,29.5,3
1,19842251.0,6.43,2.0,1.474353,22.25083,7103.0,130.1247,58.3,192.0,5
2,40381860.0,2.24,0.5,4.78517,27.5017,14646.0,118.8915,75.5,15.4,0
3,2975029.0,1.4,0.1,1.804106,25.35542,7383.0,132.8108,72.5,20.0,2
4,21370348.0,1.96,0.1,18.016313,27.56373,41312.0,117.3755,81.5,5.2,1


In [9]:
# Count missing values per column, largest count first.
df.isnull().sum().sort_values(ascending = False)

Region             0
child_mortality    0
life               0
BMI_female         0
GDP                0
BMI_male           0
CO2                0
HIV                0
fertility          0
population         0
dtype: int64

In [10]:
# (rows, columns) of the encoded frame.
df.shape

(139, 10)

In [11]:
# Work on a copy so the modelling steps below do not modify df.
df_dm = df.copy()

In [12]:
# Empty results table; a row of metrics is appended to it further down via
# ``evaluation.loc[...]``.
evaluation = pd.DataFrame({
    column: []
    for column in ('Model',
                   'Root Mean Squared Error (RMSE)',
                   'R-squared (training)',
                   'R-squared (test)')
})

In [13]:
from sklearn.model_selection import train_test_split
# 80% of the rows go to `train`, the remainder to `test`; random_state pins
# the shuffle so the split is the same on every run.
train,test = train_test_split(df_dm,train_size = 0.8,random_state=3)

In [14]:
# Predictor columns: every column of the dataset except the target, 'life'.
# The order matters -- the OLS design matrix built later indexes columns by
# position in this list.
features = [
    'population',
    'fertility',
    'HIV',
    'CO2',
    'BMI_male',
    'GDP',
    'BMI_female',
    'child_mortality',
    'Region',
]

In [15]:
from sklearn import linear_model
from sklearn import metrics

In [16]:
from sklearn.linear_model import LinearRegression

In [17]:
# Baseline: ordinary least squares on all nine predictors, fitted on the
# training rows only, with 'life' as the target.
model = LinearRegression()
model.fit(train[features],train[ 'life'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [18]:
# Predictions of the baseline model for the held-out test rows.
pred = model.predict(test[features])

In [19]:
# Mean of the target over the full dataset, as a scale reference for the RMSE.
df_dm['life'].mean()

69.60287769784175

In [20]:
# Evaluate the baseline model: RMSE on the test rows plus R-squared on the
# training and test rows, each rounded to three decimals.
test_mse = metrics.mean_squared_error(test['life'], pred)
rmsesm = round(float(np.sqrt(test_mse)), 3)
rtrsm = round(float(model.score(train[features], train['life'])), 3)
rtesm = round(float(model.score(test[features], test['life'])), 3)
# The value printed is the mean of the 'life' column, so label it as such
# (the previous "Average Price" label did not match the data).
print ("Average life expectancy for Test Data: {:.3f}".format(test['life'].mean()))


## Adding the evaluation results in evaluation dataframe
# The new row is written at index == current row count, so re-running this
# cell appends a duplicate row rather than overwriting the previous one.
# NOTE(review): the label says "Simple" although the model uses nine predictors.
r = evaluation.shape[0]
evaluation.loc[r] = ['Simple Linear Regression',rmsesm,rtrsm,rtesm]
evaluation

Average Price for Test Data: 68.793


Unnamed: 0,Model,Root Mean Squared Error (RMSE),R-squared (training),R-squared (test)
0,Simple Linear Regression,3.05,0.9,0.886


In [21]:
# Reload the dataset and rebuild the numeric frame for the statsmodels section,
# then separate the predictors (X) from the target (Y).
#
# Text columns are detected under both the legacy ``object`` dtype and pandas'
# dedicated string dtype; a bare ``dtype == object`` test misses the latter and
# would leave 'Region' as text.
# NOTE(review): the integer codes impose an arbitrary order on 'Region'.
df = pd.read_csv('life.csv')
for col in df.columns:
    column = df[col]
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        df[col] = column.astype('category').cat.codes
X = df[features]
Y = df['life']

In [22]:
# Sanity check: one target value per row.
Y.shape

(139,)

In [23]:
# Hold-out split of the predictor frame and target (20% test, fixed seed).
# The split is taken from df[features] rather than from X because X is
# replaced by the design matrix below; this keeps the cell re-runnable.
from sklearn.model_selection import train_test_split
X_Train, X_Test, Y_Train, Y_Test = train_test_split(df[features], Y, test_size = 0.2, random_state = 0)

# Building the optimal model using Backward Elimination

# The array-based OLS class is part of statsmodels.api; current statsmodels
# releases do not expose it from statsmodels.formula.api.
import statsmodels.api as sm

# Design matrix: a leading column of ones (the intercept term, which OLS does
# not add on its own) followed by the nine predictors in `features` order.
# The row count comes from the data instead of a hard-coded 139, and the matrix
# is built from df[features] so re-running the cell cannot stack a second
# column of ones onto X.
feature_matrix = df[features].to_numpy(dtype=float)
X = np.append(arr = np.ones((feature_matrix.shape[0], 1)), values = feature_matrix, axis = 1)

# Step 0: fit with every column (0 = intercept, 1..9 = predictors).
# NOTE(review): the elimination is run on all rows, including those later used
# as the test set, so the final test score is not fully out-of-sample.
X_Optimal = X[:, [0,1,2,3,4,5,6,7,8,9]]
regressor_OLS = sm.OLS(endog = Y, exog = X_Optimal).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,life,R-squared:,0.899
Model:,OLS,Adj. R-squared:,0.892
Method:,Least Squares,F-statistic:,127.4
Date:,"Mon, 06 Apr 2020",Prob (F-statistic):,9.49e-60
Time:,19:55:20,Log-Likelihood:,-344.78
No. Observations:,139,AIC:,709.6
Df Residuals:,129,BIC:,738.9
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,103.4746,11.674,8.864,0.000,80.378,126.571
x1,-1.917e-09,2.5e-09,-0.767,0.445,-6.87e-09,3.03e-09
x2,-0.5452,0.391,-1.395,0.165,-1.319,0.228
x3,-0.6113,0.067,-9.153,0.000,-0.743,-0.479
x4,-0.1949,0.074,-2.625,0.010,-0.342,-0.048
x5,0.4094,0.211,1.940,0.055,-0.008,0.827
x6,0.0001,2.73e-05,4.693,0.000,7.41e-05,0.000
x7,-0.3071,0.090,-3.411,0.001,-0.485,-0.129
x8,-0.0932,0.015,-6.250,0.000,-0.123,-0.064

0,1,2,3
Omnibus:,6.885,Durbin-Watson:,1.927
Prob(Omnibus):,0.032,Jarque-Bera (JB):,7.836
Skew:,-0.339,Prob(JB):,0.0199
Kurtosis:,3.944,Cond. No.,5260000000.0


In [24]:
# Backward elimination, step 1: refit without column 1 (population), which had
# the largest p-value (0.445) in the full model. Column 0 is the intercept.
kept_columns = [0, 2, 3, 4, 5, 6, 7, 8, 9]
X_Optimal = X[:, kept_columns]
regressor_OLS = sm.OLS(Y, X_Optimal).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,life,R-squared:,0.898
Model:,OLS,Adj. R-squared:,0.892
Method:,Least Squares,F-statistic:,143.7
Date:,"Mon, 06 Apr 2020",Prob (F-statistic):,1.02e-60
Time:,19:56:10,Log-Likelihood:,-345.09
No. Observations:,139,AIC:,708.2
Df Residuals:,130,BIC:,734.6
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,100.8029,11.124,9.062,0.000,78.796,122.810
x1,-0.4799,0.381,-1.260,0.210,-1.233,0.274
x2,-0.6064,0.066,-9.136,0.000,-0.738,-0.475
x3,-0.2012,0.074,-2.731,0.007,-0.347,-0.055
x4,0.4390,0.207,2.120,0.036,0.029,0.849
x5,0.0001,2.71e-05,4.800,0.000,7.65e-05,0.000
x6,-0.2930,0.088,-3.330,0.001,-0.467,-0.119
x7,-0.0944,0.015,-6.386,0.000,-0.124,-0.065
x8,0.2797,0.260,1.074,0.285,-0.235,0.795

0,1,2,3
Omnibus:,6.199,Durbin-Watson:,1.941
Prob(Omnibus):,0.045,Jarque-Bera (JB):,6.826
Skew:,-0.317,Prob(JB):,0.0329
Kurtosis:,3.881,Cond. No.,1110000.0


In [25]:
# Backward elimination, step 2: also drop column 9 (Region), the term with the
# largest p-value (0.285) in the previous fit.
kept_columns = [0, 2, 3, 4, 5, 6, 7, 8]
X_Optimal = X[:, kept_columns]
regressor_OLS = sm.OLS(Y, X_Optimal).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,life,R-squared:,0.897
Model:,OLS,Adj. R-squared:,0.892
Method:,Least Squares,F-statistic:,163.9
Date:,"Mon, 06 Apr 2020",Prob (F-statistic):,1.36e-61
Time:,19:56:37,Log-Likelihood:,-345.71
No. Observations:,139,AIC:,707.4
Df Residuals:,131,BIC:,730.9
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,96.3158,10.316,9.337,0.000,75.909,116.723
x1,-0.4067,0.375,-1.085,0.280,-1.148,0.335
x2,-0.5958,0.066,-9.072,0.000,-0.726,-0.466
x3,-0.2088,0.073,-2.845,0.005,-0.354,-0.064
x4,0.3621,0.194,1.862,0.065,-0.023,0.747
x5,0.0001,2.54e-05,5.510,0.000,8.99e-05,0.000
x6,-0.2405,0.073,-3.285,0.001,-0.385,-0.096
x7,-0.0930,0.015,-6.311,0.000,-0.122,-0.064

0,1,2,3
Omnibus:,6.734,Durbin-Watson:,1.933
Prob(Omnibus):,0.034,Jarque-Bera (JB):,7.244
Skew:,-0.36,Prob(JB):,0.0267
Kurtosis:,3.856,Cond. No.,1030000.0


In [26]:
# Backward elimination, step 3: also drop column 2 (fertility), the term with
# the largest p-value (0.280) in the previous fit.
kept_columns = [0, 3, 4, 5, 6, 7, 8]
X_Optimal = X[:, kept_columns]
regressor_OLS = sm.OLS(Y, X_Optimal).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,life,R-squared:,0.897
Model:,OLS,Adj. R-squared:,0.892
Method:,Least Squares,F-statistic:,190.7
Date:,"Mon, 06 Apr 2020",Prob (F-statistic):,1.69e-62
Time:,19:57:11,Log-Likelihood:,-346.33
No. Observations:,139,AIC:,706.7
Df Residuals:,132,BIC:,727.2
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,95.4596,10.292,9.275,0.000,75.100,115.819
x1,-0.5836,0.065,-9.013,0.000,-0.712,-0.456
x2,-0.2050,0.073,-2.794,0.006,-0.350,-0.060
x3,0.3902,0.193,2.023,0.045,0.009,0.772
x4,0.0001,2.54e-05,5.454,0.000,8.84e-05,0.000
x5,-0.2446,0.073,-3.343,0.001,-0.389,-0.100
x6,-0.1053,0.009,-11.155,0.000,-0.124,-0.087

0,1,2,3
Omnibus:,6.88,Durbin-Watson:,1.92
Prob(Omnibus):,0.032,Jarque-Bera (JB):,7.887
Skew:,-0.336,Prob(JB):,0.0194
Kurtosis:,3.955,Cond. No.,1030000.0


In [27]:
# Backward elimination, step 4: also drop column 5 (BMI_male), the term with
# the largest p-value (0.045) in the previous fit.
kept_columns = [0, 3, 4, 6, 7, 8]
X_Optimal = X[:, kept_columns]
regressor_OLS = sm.OLS(Y, X_Optimal).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,life,R-squared:,0.893
Model:,OLS,Adj. R-squared:,0.889
Method:,Least Squares,F-statistic:,222.9
Date:,"Mon, 06 Apr 2020",Prob (F-statistic):,8.02e-63
Time:,19:57:55,Log-Likelihood:,-348.45
No. Observations:,139,AIC:,708.9
Df Residuals:,133,BIC:,726.5
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,105.0009,9.255,11.346,0.000,86.696,123.306
x1,-0.5880,0.065,-8.982,0.000,-0.717,-0.458
x2,-0.1794,0.073,-2.455,0.015,-0.324,-0.035
x3,0.0001,2.55e-05,5.725,0.000,9.54e-05,0.000
x4,-0.2420,0.074,-3.270,0.001,-0.388,-0.096
x5,-0.1162,0.008,-14.806,0.000,-0.132,-0.101

0,1,2,3
Omnibus:,4.377,Durbin-Watson:,1.871
Prob(Omnibus):,0.112,Jarque-Bera (JB):,3.936
Skew:,-0.319,Prob(JB):,0.14
Kurtosis:,3.522,Cond. No.,912000.0


In [28]:
# Backward elimination, step 5: also drop column 4 (CO2), the term with the
# largest p-value (0.015) in the previous fit.
kept_columns = [0, 3, 6, 7, 8]
X_Optimal = X[:, kept_columns]
regressor_OLS = sm.OLS(Y, X_Optimal).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,life,R-squared:,0.889
Model:,OLS,Adj. R-squared:,0.885
Method:,Least Squares,F-statistic:,267.1
Date:,"Mon, 06 Apr 2020",Prob (F-statistic):,8.69e-63
Time:,19:58:21,Log-Likelihood:,-351.53
No. Observations:,139,AIC:,713.1
Df Residuals:,134,BIC:,727.7
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,107.6480,9.362,11.498,0.000,89.131,126.165
x1,-0.5867,0.067,-8.798,0.000,-0.719,-0.455
x2,9.748e-05,1.65e-05,5.922,0.000,6.49e-05,0.000
x3,-0.2633,0.075,-3.519,0.001,-0.411,-0.115
x4,-0.1148,0.008,-14.399,0.000,-0.131,-0.099

0,1,2,3
Omnibus:,4.905,Durbin-Watson:,1.918
Prob(Omnibus):,0.086,Jarque-Bera (JB):,4.456
Skew:,-0.354,Prob(JB):,0.108
Kurtosis:,3.517,Cond. No.,906000.0


In [29]:
# Backward elimination, step 6: also drop column 7 (BMI_female). The remaining
# columns are the intercept plus HIV (3), GDP (6) and child_mortality (8).
kept_columns = [0, 3, 6, 8]
X_Optimal = X[:, kept_columns]
regressor_OLS = sm.OLS(Y, X_Optimal).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,life,R-squared:,0.878
Model:,OLS,Adj. R-squared:,0.876
Method:,Least Squares,F-statistic:,324.6
Date:,"Mon, 06 Apr 2020",Prob (F-statistic):,1.63e-61
Time:,19:58:40,Log-Likelihood:,-357.67
No. Observations:,139,AIC:,723.3
Df Residuals:,135,BIC:,735.1
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,74.7674,0.598,125.081,0.000,73.585,75.950
x1,-0.6397,0.068,-9.458,0.000,-0.773,-0.506
x2,0.0001,1.7e-05,6.173,0.000,7.13e-05,0.000
x3,-0.1261,0.008,-16.584,0.000,-0.141,-0.111

0,1,2,3
Omnibus:,2.979,Durbin-Watson:,1.842
Prob(Omnibus):,0.225,Jarque-Bera (JB):,2.565
Skew:,-0.323,Prob(JB):,0.277
Kurtosis:,3.156,Cond. No.,55500.0


In [33]:

# Split the reduced design matrix and the target in one call so the feature
# rows and target rows are guaranteed to stay aligned. Previously only the
# features were split here and paired with Y_Train/Y_Test from an earlier
# cell, which was correct only while both calls used the same seed and size.
# The seed and test fraction are unchanged, so the partition is the same.
X_Optimal_Train, X_Optimal_Test, Y_Train, Y_Test = train_test_split(
    X_Optimal, Y, test_size = 0.2, random_state = 0)
# Refits the LinearRegression instance created earlier, replacing the baseline
# fit. X_Optimal carries its own column of ones, which is redundant here
# because this estimator fits an intercept itself (fit_intercept=True).
model.fit(X_Optimal_Train, Y_Train)

# Predicting the Optimal Test set results

Y_Optimal_Pred = model.predict(X_Optimal_Test)

In [34]:
# RMSE of the reduced model on the held-out rows, kept to three decimals.
# Note that this overwrites the `rmsesm` computed for the baseline model.
optimal_mse = metrics.mean_squared_error(Y_Test, Y_Optimal_Pred)
optimal_rmse_text = format(np.sqrt(optimal_mse), '.3f')
rmsesm = float(optimal_rmse_text)
print ("Root mean square error is ")
rmsesm

Root mean square error is 


2.853