## Models two feature sets via linear regression
We have two feature sets that we need to decide on which ones to use for our modelling.

Use linear regression to determine which set performs better in prediction accuracy so that we can make suggest action to be taken based on inference

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_curve, auc

from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv('LassoForwardSelectionDataSetForModelling.csv')
lassoFeatures = pd.read_csv('top10predictorsLasso_cv.csv')['0'].tolist()
forwardSelectionFeatures = pd.read_csv('ForwardSelectionVia5FoldCV.csv')["0"].tolist()

In [3]:
y_qualitative = df['AboveAverageLifeExpectancyByYear']
y_quantitative = df["Life expectancy at birth, total (years)"]

In [4]:
xLasso = df[np.intersect1d(df.columns, lassoFeatures)]
xFowardSelection = df[np.intersect1d(df.columns, forwardSelectionFeatures)]

In [5]:
# Perform cross validation for Lasso features 
from sklearn.model_selection import cross_val_score
# Fit the model
model = LinearRegression()
cv_scores = cross_val_score(model, xLasso, y_quantitative, cv=5)
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))

[0.73038043 0.76958794 0.54344284 0.77335337 0.77913073]
cv_scores mean:0.7191790631027105


In [6]:
# Perform cross validation for Forward Selection features 
from sklearn.model_selection import cross_val_score
# Fit the model
model = LinearRegression()
cv_scores = cross_val_score(model, xFowardSelection, y_quantitative, cv=5)
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))

[0.73460588 0.76198371 0.73277132 0.76598687 0.78115248]
cv_scores mean:0.7553000492428348


In [7]:
lassoFeatures

['Adolescent fertility rate (births per 1,000 women ages 15-19)',
 'Cereal yield (kg per hectare)',
 'Urban population (% of total)',
 'GDP per capita (current US$)',
 'Merchandise exports by the reporting economy, residual (% of total merchandise exports)',
 'Permanent cropland (% of land area)',
 'Merchandise imports by the reporting economy, residual (% of total merchandise imports)',
 'Population density (people per sq. km of land area)',
 'Agricultural land (% of land area)',
 'Arable land (hectares per person)']

In [8]:
forwardSelectionFeatures

['Adolescent fertility rate (births per 1,000 women ages 15-19)',
 'Arable land (% of land area)',
 'Arable land (hectares per person)',
 'CO2 emissions from solid fuel consumption (kt)',
 'GDP per capita (current US$)',
 'Merchandise exports by the reporting economy, residual (% of total merchandise exports)',
 'Merchandise trade (% of GDP)',
 'Permanent cropland (% of land area)',
 'Population density (people per sq. km of land area)',
 'Urban population (% of total)']

In [9]:
from sklearn import metrics
modelAll = LinearRegression()
modelAll.fit(xLasso, y_quantitative)
y_pred = modelAll.predict(xLasso)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_quantitative, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_quantitative, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_quantitative, y_pred)))
print('R-Squared:', modelAll.score(xLasso, y_quantitative))
print('VIF must be less than:', 1/(1-modelAll.score(xLasso, y_quantitative)))

Mean Absolute Error: 3.643080463257614
Mean Squared Error: 24.488519452411985
Root Mean Squared Error: 4.948587621979829
R-Squared: 0.7889030119630167
VIF must be less than: 4.73715901538493


In [10]:
# Multicollinearity checks via VIF (An acceptable VIF is if it’s less than the max of 10 and 1/1-R² model)
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [11]:
X_variables = xLasso
vif_data = pd.DataFrame()
vif_data["feature"] = X_variables.columns
vif_data["VIF"] = [variance_inflation_factor(X_variables.values, i) for i in range(len(X_variables.columns))]
vif_data.to_csv("vif_data_lasso.csv")
vif_data


Unnamed: 0,feature,VIF
0,"Adolescent fertility rate (births per 1,000 wo...",2.800811
1,Agricultural land (% of land area),4.695714
2,Arable land (hectares per person),2.097837
3,Cereal yield (kg per hectare),2.579817
4,GDP per capita (current US$),2.077011
5,"Merchandise exports by the reporting economy, ...",2.08133
6,"Merchandise imports by the reporting economy, ...",2.433274
7,Permanent cropland (% of land area),1.699907
8,Population density (people per sq. km of land ...,1.803618
9,Urban population (% of total),5.237028


In [12]:
from sklearn import metrics
modelAll = LinearRegression()
modelfit = modelAll.fit(xFowardSelection, y_quantitative)
y_pred = modelAll.predict(xFowardSelection)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_quantitative, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_quantitative, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_quantitative, y_pred)))
print('R-Squared:', modelAll.score(xFowardSelection, y_quantitative))
print('VIF must be less than:', 1/(1-modelAll.score(xFowardSelection, y_quantitative)))

Mean Absolute Error: 3.669503589781588
Mean Squared Error: 24.671628478279228
Root Mean Squared Error: 4.967054305952295
R-Squared: 0.7873245676671876
VIF must be less than: 4.702000550938653


In [13]:
X_variables = xFowardSelection
vif_data = pd.DataFrame()
vif_data["feature"] = X_variables.columns
vif_data["VIF"] = [variance_inflation_factor(X_variables.values, i) for i in range(len(X_variables.columns))]
vif_data.to_csv("vif_data_forward_selection.csv")
vif_data

Unnamed: 0,feature,VIF
0,"Adolescent fertility rate (births per 1,000 wo...",2.554039
1,Arable land (% of land area),2.83914
2,Arable land (hectares per person),2.047759
3,CO2 emissions from solid fuel consumption (kt),1.057034
4,GDP per capita (current US$),1.90834
5,"Merchandise exports by the reporting economy, ...",1.584972
6,Merchandise trade (% of GDP),2.56076
7,Permanent cropland (% of land area),1.598119
8,Population density (people per sq. km of land ...,2.272673
9,Urban population (% of total),4.830342


In [14]:
import statsmodels.api as sm
est = sm.OLS(y_quantitative,xFowardSelection)
est2 = est.fit()
print(est2.summary())

                                           OLS Regression Results                                           
Dep. Variable:     Life expectancy at birth, total (years)   R-squared (uncentered):                   0.962
Model:                                                 OLS   Adj. R-squared (uncentered):              0.962
Method:                                      Least Squares   F-statistic:                          1.572e+04
Date:                                     Tue, 16 Nov 2021   Prob (F-statistic):                        0.00
Time:                                             18:19:17   Log-Likelihood:                         -24801.
No. Observations:                                     6259   AIC:                                  4.962e+04
Df Residuals:                                         6249   BIC:                                  4.969e+04
Df Model:                                               10                                                  
Covariance Type:   