### Indetify Candidates
This code filters all the indicators to those having at least 10000 and present the results in a stacked format

2021-09-14: DXG
First pass filtering data down to 60 potential components that have over 10000 observations per predicted variable. Stacked version of data is in dataSetForModelling.csv

2021-09-21: DXG
Modified to pivot stacked data into wide format with no na's
finalDataSetForModelling has the following columns:
CountryName,Year,Life expectancy at birth, total (years), +60 predictors


In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('indicators.csv')

In [3]:
df.shape


(5656458, 6)

In [4]:
df.columns

Index(['CountryName', 'CountryCode', 'IndicatorName', 'IndicatorCode', 'Year',
       'Value'],
      dtype='object')

In [5]:
df_groupBy = df.groupby('IndicatorName').size().reset_index(name='counts').sort_values('counts')

In [6]:
df_groupBy.to_csv('IndicatorNameGroupedByCount.csv')

In [7]:
#use fairly arbitrary cutoff at 10000 observations per indicator
cutOff = 10000
df_groupByOver10000 = df_groupBy[df_groupBy['counts'] > cutOff]

In [8]:
df_groupByOver10000.to_csv('IndicatorNameGroupedByCountOver' + str(cutOff) + '.csv')

In [9]:
#filter data to the cutoff limit
df_groupByOver10000.shape

(63, 2)

In [10]:
filteredDataset = pd.merge(left=df,right=df_groupByOver10000)

In [11]:
filteredDataset.to_csv('filteredDataset.csv')

In [12]:
lifeExpectancyTotal = 'Life expectancy at birth, total (years)'
lifeExpectancyMale = 'Life expectancy at birth, male (years)'
lifeExpectancyFemale = 'Life expectancy at birth, female (years)'
dependentVariables = [lifeExpectancyTotal,lifeExpectancyMale,lifeExpectancyFemale]

In [13]:
dependentVariables

['Life expectancy at birth, total (years)',
 'Life expectancy at birth, male (years)',
 'Life expectancy at birth, female (years)']

In [14]:
#now need to split dataset into the indepedent and dependent sets
independentDataRaw = filteredDataset[filteredDataset.IndicatorName.isin(dependentVariables)]

In [15]:
dependentData = filteredDataset[~filteredDataset.IndicatorName.isin(dependentVariables)]

In [16]:
independentDataRawPivoted = pd.pivot(independentDataRaw,values='Value',index=['CountryName','CountryCode','Year'], columns='IndicatorName')

In [17]:
independentDataRawPivotedFlattened = independentDataRawPivoted.reset_index(level=[0,1])

In [18]:
independentDataRawPivotedFlattened.columns.to_flat_index()
independentData = pd.DataFrame(independentDataRawPivotedFlattened.to_records())

In [19]:
independentData
dataSetForModelling =  pd.merge(left=dependentData,right=independentData)

In [20]:
dataSetForModelling.to_csv('datasetForFeatureSelection.csv')

In [21]:
cleanedDataSetForModelling = dataSetForModelling.drop(['IndicatorCode', 'counts'], axis=1)

In [22]:
#cleanedDataSetForModelling

In [49]:
finalDataSetForModelling = cleanedDataSetForModelling.pivot(index=['CountryName','Year','Life expectancy at birth, total (years)'], columns='IndicatorName', values='Value')

In [50]:
finalDataSetForModelling = finalDataSetForModelling.reset_index().dropna()

In [51]:
finalDataSetForModelling = finalDataSetForModelling.reset_index().drop(['index'], axis=1)

In [53]:
finalDataSetForModelling.shape
# CountryName,Year,Life expectancy at birth, total (years), +60 predictors

(6259, 63)

### Add column for overall average and average per year

In [62]:
meanByYear = finalDataSetForModelling[['Year','Life expectancy at birth, total (years)']].groupby('Year').mean().reset_index()
meanOverall = finalDataSetForModelling[['Life expectancy at birth, total (years)']].mean()

finalDataSetForModelling['MeanLifeExpetancyOverall'] = meanOverall[0]

meanByYear= meanByYear.rename(columns={'Life expectancy at birth, total (years)':'MeanLifeExpetancyForYear'})

finalDataSetForModelling = pd.merge(left=finalDataSetForModelling,right=meanByYear)

finalDataSetForModelling['AboveAverageLifeExpectancyOverall'] = finalDataSetForModelling['Life expectancy at birth, total (years)']>finalDataSetForModelling['MeanLifeExpetancyOverall']

finalDataSetForModelling['AboveAverageLifeExpectancyByYear'] = finalDataSetForModelling['Life expectancy at birth, total (years)']>finalDataSetForModelling['MeanLifeExpetancyForYear']

finalDataSetForModelling.to_csv("finalDataSetForModelling.csv", index=False)

# Everything below this is just exploratory code
Trying out different modelling scenarios

In [65]:
components = dataSetForModelling['IndicatorName'].unique()

In [22]:
import numpy as np
from sklearn.linear_model import LinearRegression

In [66]:
components[0]

'Adolescent fertility rate (births per 1,000 women ages 15-19)'

In [23]:
model = LinearRegression()

In [82]:
filteredDataset['Value'].values.reshape((-1, 1))

array([[133.56090741],
       [162.87121157],
       [ 46.71675161],
       ...,
       [ 63.7318    ],
       [ 98.1738    ],
       [111.8664    ]])

In [97]:
filteredDataset[4:5] #.values.reshape((-1, 1))

Unnamed: 0,CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value,counts,"Life expectancy at birth, female (years)","Life expectancy at birth, male (years)","Life expectancy at birth, total (years)"
146,East Asia & Pacific (developing only),EAP,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1960,75.043631,12485,47.359607,43.834952,45.549658


In [83]:
filteredDataset=dataSetForModelling[dataSetForModelling['IndicatorName']==components[0]]
x=filteredDataset['Value'].values.reshape((-1, 1))
y=filteredDataset[lifeExpectancyTotal]
model.fit(x,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [84]:
model.score(x,y)

0.5912484149339186

In [77]:
y

0         46.847059
37        62.271795
76        67.823762
108       48.298317
146       45.549658
            ...    
658778    79.624390
658811    73.203341
658854    63.583512
658906    59.237366
658958    55.633000
Name: Life expectancy at birth, total (years), Length: 12099, dtype: float64

# Great stuff
https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.RegressionResults.html

In [85]:
import numpy as np
import statsmodels.api as sm

In [86]:
x = sm.add_constant(x)

In [87]:
x


array([[  1.        , 133.56090741],
       [  1.        , 162.87121157],
       [  1.        ,  46.71675161],
       ...,
       [  1.        ,  63.7318    ],
       [  1.        ,  98.1738    ],
       [  1.        , 111.8664    ]])

In [88]:
model = sm.OLS(y, x)

In [89]:
results = model.fit()

In [90]:
results.summary()

0,1,2,3
Dep. Variable:,"Life expectancy at birth, total (years)",R-squared:,0.591
Model:,OLS,Adj. R-squared:,0.591
Method:,Least Squares,F-statistic:,17500.0
Date:,"Tue, 14 Sep 2021",Prob (F-statistic):,0.0
Time:,20:43:28,Log-Likelihood:,-41050.0
No. Observations:,12099,AIC:,82100.0
Df Residuals:,12097,BIC:,82120.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,75.7921,0.116,652.506,0.000,75.564,76.020
x1,-0.1635,0.001,-132.280,0.000,-0.166,-0.161

0,1,2,3
Omnibus:,1596.854,Durbin-Watson:,1.666
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3537.274
Skew:,-0.792,Prob(JB):,0.0
Kurtosis:,5.123,Cond. No.,167.0


In [93]:
results.pvalues[1]

0.0

In [94]:
results.tvalues

const    652.505656
x1      -132.279977
dtype: float64

In [95]:
results.rsquared_adj

0.5912146254336237