In [80]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.tsa.holtwinters import Holt, ExponentialSmoothing, SimpleExpSmoothing
from sklearn import linear_model

In [3]:
#Load Base Dataser
demo = pd.read_csv('demographics.csv')
pop = pd.read_csv('pop.csv')
demo.head()

Unnamed: 0,Year,Country of origin,Country of origin (ISO),Female 0 - 4,Female 5 - 11,Female 12 - 17,Female 18 - 59,Female 60,Female total,Male 0 - 4,Male 5 - 11,Male 12 - 17,Male 18 - 59,Male 60,Male total,Total
0,2011,Afghanistan,AFG,203989,20,141,661973,39839,1520434,214818,10,188,690934,68179,1634982,4064481
1,2011,Antigua and Barbuda,ATG,0,0,0,6,0,6,0,0,0,6,0,6,32
2,2011,Azerbaijan,AZE,27558,5,5,172575,35036,310900,27976,5,0,164055,34770,299230,615364
3,2011,Burundi,BDI,17317,0,40,54232,4126,169244,17495,0,44,53300,4114,162975,342577
4,2011,Burkina Faso,BFA,5,0,0,55,0,115,5,0,0,140,0,216,1262


In [4]:
demo_scaled = demo.join(pop.set_index('Country Code'), on = 'Country of origin (ISO)')

In [5]:
df = demo_scaled[['Year', 'Country of origin','Country of origin (ISO)', 'Female total','Male total', 'Total', '2019']]
df.head()

Unnamed: 0,Year,Country of origin,Country of origin (ISO),Female total,Male total,Total,2019
0,2011,Afghanistan,AFG,1520434,1634982,4064481,38041754.0
1,2011,Antigua and Barbuda,ATG,6,6,32,97118.0
2,2011,Azerbaijan,AZE,310900,299230,615364,10023318.0
3,2011,Burundi,BDI,169244,162975,342577,11530580.0
4,2011,Burkina Faso,BFA,115,216,1262,20321378.0


In [6]:
demo_scaled['female'] = demo_scaled['Female total'] / (demo_scaled['2019']/1000000)
demo_scaled['male'] = demo_scaled['Male total'] / (demo_scaled['2019']/1000000)
demo_scaled['total'] = demo_scaled['Total'] / (demo_scaled['2019']/1000000)


demo_scaled = demo_scaled.rename(columns={'2019':'pop'})
demo_scaled.head()

Unnamed: 0,Year,Country of origin,Country of origin (ISO),Female 0 - 4,Female 5 - 11,Female 12 - 17,Female 18 - 59,Female 60,Female total,Male 0 - 4,...,Male 12 - 17,Male 18 - 59,Male 60,Male total,Total,Country Name,pop,female,male,total
0,2011,Afghanistan,AFG,203989,20,141,661973,39839,1520434,214818,...,188,690934,68179,1634982,4064481,Afghanistan,38041754.0,39967.505179,42978.617653,106842.62876
1,2011,Antigua and Barbuda,ATG,0,0,0,6,0,6,0,...,0,6,0,6,32,Antigua and Barbuda,97118.0,61.780514,61.780514,329.496077
2,2011,Azerbaijan,AZE,27558,5,5,172575,35036,310900,27976,...,0,164055,34770,299230,615364,Azerbaijan,10023318.0,31017.67299,29853.38787,61393.243235
3,2011,Burundi,BDI,17317,0,40,54232,4126,169244,17495,...,44,53300,4114,162975,342577,Burundi,11530580.0,14677.839276,14134.15457,29710.300783
4,2011,Burkina Faso,BFA,5,0,0,55,0,115,5,...,0,140,0,216,1262,Burkina Faso,20321378.0,5.659065,10.6292,62.102088


In [7]:
PPP = pd.read_csv('PPP.csv')
PPP.head()

Unnamed: 0,ISO,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,ABW,24985.99328,24713.69805,26189.43551,26647.9381,27980.8807,28281.35048,29007.693,,
1,AFG,591.162759,641.871479,637.165523,613.856689,578.466353,509.218661,519.884773,493.750418,507.103432
2,AGO,4615.468028,5100.095808,5254.882338,5408.410496,4166.979684,3506.072885,4095.812942,3289.646664,2790.726615
3,ALB,4437.142885,4247.629984,4413.060861,4578.631994,3952.801215,4124.055726,4531.020806,5284.380184,5353.244856
4,AND,43335.32886,38686.46126,39538.76672,41303.92937,35762.52307,37474.66541,38962.88035,41793.05526,40886.39116


In [68]:
afg_ppp = PPP[PPP['ISO'] == 'AFG'].drop(columns=PPP.columns[0]).T
afg_ppp=np.array(afg_ppp)
afg_ppp

array([[591.162759 ],
       [641.8714792],
       [637.1655232],
       [613.8566892],
       [578.4663529],
       [509.2186613],
       [519.8847731],
       [493.7504181],
       [507.1034319]])

In [69]:
GDP = pd.read_csv('GDP.csv')
GDP.head()

Unnamed: 0,Country Name,ISO,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Aruba,ABW,2549721000.0,2534637000.0,2701676000.0,2765363000.0,2919553000.0,2965922000.0,3056425000.0,,
1,Afghanistan,AFG,17804290000.0,20001600000.0,20561070000.0,20484890000.0,19907110000.0,18017750000.0,18869950000.0,18353880000.0,19291100000.0
2,Angola,AGO,111790000000.0,128053000000.0,136710000000.0,145712000000.0,116194000000.0,101124000000.0,122124000000.0,101353000000.0,88815700000.0
3,Albania,ALB,12890770000.0,12319830000.0,12776220000.0,13228140000.0,11386850000.0,11861200000.0,13019690000.0,15147020000.0,15279180000.0
4,Andorra,AND,3629204000.0,3188809000.0,3193704000.0,3271808000.0,2789870000.0,2896679000.0,3000181000.0,3218316000.0,3154058000.0


In [70]:
afg_gdp = GDP[GDP['ISO'] == 'AFG'].drop(columns=GDP.columns[[0,1]]).T
afg_gdp=np.array(afg_gdp)
afg_gdp

array([[1.78042930e+10],
       [2.00015985e+10],
       [2.05610696e+10],
       [2.04848851e+10],
       [1.99071114e+10],
       [1.80177491e+10],
       [1.88699457e+10],
       [1.83538811e+10],
       [1.92911040e+10]])

In [71]:
life = pd.read_csv('life expectancy.csv')
life.head()

Unnamed: 0,Country Name,ISO,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Aruba,ABW,75.017,75.158,75.299,75.441,75.583,75.725,75.868,76.01,76.152
1,Afghanistan,AFG,61.028,61.553,62.054,62.525,62.966,63.377,63.763,64.13,64.486
2,Angola,AGO,55.35,56.33,57.236,58.054,58.776,59.398,59.925,60.379,60.782
3,Albania,ALB,76.562,76.914,77.252,77.554,77.813,78.025,78.194,78.333,78.458
4,Andorra,AND,,,,,,,,,


In [72]:
afg_life = life[life['ISO'] == 'AFG'].drop(columns=life.columns[[0,1]]).T
afg_life = np.array(afg_life)
afg_life

array([[61.028],
       [61.553],
       [62.054],
       [62.525],
       [62.966],
       [63.377],
       [63.763],
       [64.13 ],
       [64.486]])

In [73]:
df2 = demo_scaled[['Country of origin (ISO)','Year','total']]
df2.head()

Unnamed: 0,Country of origin (ISO),Year,total
0,AFG,2011,106842.62876
1,ATG,2011,329.496077
2,AZE,2011,61393.243235
3,BDI,2011,29710.300783
4,BFA,2011,62.102088


In [89]:
afg = df2[df2['Country of origin (ISO)'] == 'AFG']
afg.head()


Unnamed: 0,Country of origin (ISO),Year,total
0,AFG,2011,106842.62876
25,AFG,2012,103909.088945
50,AFG,2013,91046.590544
75,AFG,2014,94712.194396
100,AFG,2015,104910.04174


In [90]:
afg['ppp'] = afg_ppp
afg['gdp'] = afg_gdp
afg['life'] = afg_life
afg

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Country of origin (ISO),Year,total,ppp,gdp,life
0,AFG,2011,106842.62876,591.162759,17804290000.0,61.028
25,AFG,2012,103909.088945,641.871479,20001600000.0,61.553
50,AFG,2013,91046.590544,637.165523,20561070000.0,62.054
75,AFG,2014,94712.194396,613.856689,20484890000.0,62.525
100,AFG,2015,104910.04174,578.466353,19907110000.0,62.966
125,AFG,2016,116009.60881,509.218661,18017750000.0,63.377
150,AFG,2017,129052.487958,519.884773,18869950000.0,63.763
175,AFG,2018,138742.524858,493.750418,18353880000.0,64.13
200,AFG,2019,150666.475578,507.103432,19291100000.0,64.486


In [91]:
ab_afg = afg.drop(columns=afg.columns[0])
ab_afg

Unnamed: 0,Year,total,ppp,gdp,life
0,2011,106842.62876,591.162759,17804290000.0,61.028
25,2012,103909.088945,641.871479,20001600000.0,61.553
50,2013,91046.590544,637.165523,20561070000.0,62.054
75,2014,94712.194396,613.856689,20484890000.0,62.525
100,2015,104910.04174,578.466353,19907110000.0,62.966
125,2016,116009.60881,509.218661,18017750000.0,63.377
150,2017,129052.487958,519.884773,18869950000.0,63.763
175,2018,138742.524858,493.750418,18353880000.0,64.13
200,2019,150666.475578,507.103432,19291100000.0,64.486


In [92]:
X = ab_afg.drop(columns='total')
model1 = sm.OLS(ab_afg['total'], sm.add_constant(X))

model1 = model1.fit()
model1.summary()

  "anyway, n=%i" % int(n))


0,1,2,3
Dep. Variable:,total,R-squared:,0.979
Model:,OLS,Adj. R-squared:,0.958
Method:,Least Squares,F-statistic:,46.61
Date:,"Thu, 20 May 2021",Prob (F-statistic):,0.00131
Time:,23:04:38,Log-Likelihood:,-84.142
No. Observations:,9,AIC:,178.3
Df Residuals:,4,BIC:,179.3
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-9.567e+07,1.87e+07,-5.106,0.007,-1.48e+08,-4.36e+07
Year,5.031e+04,1.05e+04,4.790,0.009,2.11e+04,7.95e+04
ppp,394.8072,780.751,0.506,0.640,-1772.906,2562.521
gdp,-1.478e-05,2.39e-05,-0.618,0.570,-8.12e-05,5.16e-05
life,-8.791e+04,4.71e+04,-1.866,0.135,-2.19e+05,4.29e+04

0,1,2,3
Omnibus:,0.388,Durbin-Watson:,2.989
Prob(Omnibus):,0.824,Jarque-Bera (JB):,0.465
Skew:,0.26,Prob(JB):,0.793
Kurtosis:,2.015,Cond. No.,260000000000000.0


ab_afg['is_treated'] = (np.random.rand(9)<0.5).astype(int)
ab_afg['total'] = (ab_afg['is_treated'] * 1.2)*ab_afg['ppp']

model1 = sm.OLS(ab_afg['total'], sm.add_constant(ab_afg['treated']))
model1 = model1.fit()
model1.summary()

In [95]:
X_train,X_test,y_train,y_test = train_test_split(X,ab_afg['total'],random_state=10)

In [93]:
clf = linear_model.Lasso(alpha=0.05)

In [96]:
clf.fit(X_train, y_train)
clf.score(X_train, y_train), clf.score(X_test, y_test)



(0.9830050264213505, 0.8311751049122802)

In [107]:
def myLasso(country):
    df_ppp = PPP[PPP['ISO'] == country].drop(columns=PPP.columns[0]).T
    df_ppp=np.array(df_ppp)
    
    df_life = life[life['ISO'] == country].drop(columns=life.columns[[0,1]]).T
    df_life=np.array(df_life)
    
    df_gdp = GDP[GDP['ISO'] == country].drop(columns=GDP.columns[[0,1]]).T
    df_gdp=np.array(df_gdp)
    
    df = df2[df2['Country of origin (ISO)'] == country]
    df['ppp'] = df_ppp
    df['gdp'] = df_gdp
    df['life'] = df_life
    X_train,X_test,y_train,y_test = train_test_split(X,df['total'],random_state=10)
    clf = linear_model.Lasso(alpha=0.05)
    clf.fit(X_train, y_train)
    return clf.score(X_train, y_train), clf.score(X_test, y_test)

In [108]:
myLasso('AFG')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


(0.9830050264213505, 0.8311751049122802)

In [100]:
myLasso('COL')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


(0.9995848068635492, 0.9401093803739301)