# World Happiness Linear Regression Project

In [78]:
import csv
import pandas as pd
import numpy as np


In [79]:
df = pd.read_csv('whr_2019_data.csv')

## Data cleaning

In [80]:
df.head()

Unnamed: 0,Country name,Year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,...,GINI index (World Bank estimate),"GINI index (World Bank estimate), average 2000-16","gini of household income reported in Gallup, by wp5-year","Most people can be trusted, Gallup","Most people can be trusted, WVS round 1981-1984","Most people can be trusted, WVS round 1989-1993","Most people can be trusted, WVS round 1994-1998","Most people can be trusted, WVS round 1999-2004","Most people can be trusted, WVS round 2005-2009","Most people can be trusted, WVS round 2010-2014"
0,Afghanistan,2008,3.72359,7.16869,0.450662,50.799999,0.718114,0.177889,0.881686,0.517637,...,,,,,,,,,,
1,Afghanistan,2009,4.401778,7.33379,0.552308,51.200001,0.678896,0.200178,0.850035,0.583926,...,,,0.441906,0.286315,,,,,,
2,Afghanistan,2010,4.758381,7.386629,0.539075,51.599998,0.600127,0.134353,0.706766,0.618265,...,,,0.327318,0.275833,,,,,,
3,Afghanistan,2011,3.831719,7.415019,0.521104,51.919998,0.495901,0.172137,0.731109,0.611387,...,,,0.336764,,,,,,,
4,Afghanistan,2012,3.782938,7.517126,0.520637,52.240002,0.530935,0.244273,0.77562,0.710385,...,,,0.34454,,,,,,,


In [81]:
df.columns

Index(['Country name', 'Year', 'Life Ladder', 'Log GDP per capita',
       'Social support', 'Healthy life expectancy at birth',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption', 'Positive affect', 'Negative affect',
       'Confidence in national government', 'Democratic Quality',
       'Delivery Quality', 'Standard deviation of ladder by country-year',
       'Standard deviation/Mean of ladder by country-year',
       'GINI index (World Bank estimate)',
       'GINI index (World Bank estimate), average 2000-16',
       'gini of household income reported in Gallup, by wp5-year',
       'Most people can be trusted, Gallup',
       'Most people can be trusted, WVS round 1981-1984',
       'Most people can be trusted, WVS round 1989-1993',
       'Most people can be trusted, WVS round 1994-1998',
       'Most people can be trusted, WVS round 1999-2004',
       'Most people can be trusted, WVS round 2005-2009',
       'Most people can be trusted, WV

In [82]:
drop_df = df.drop(columns=['Standard deviation of ladder by country-year', 
           'Standard deviation/Mean of ladder by country-year', 
           'GINI index (World Bank estimate)', 
           'GINI index (World Bank estimate), average 2000-16',
           'gini of household income reported in Gallup, by wp5-year',
           'Most people can be trusted, Gallup',
           'Most people can be trusted, WVS round 1981-1984',
           'Most people can be trusted, WVS round 1989-1993',
           'Most people can be trusted, WVS round 1994-1998',
           'Most people can be trusted, WVS round 1999-2004',
           'Most people can be trusted, WVS round 2005-2009',
           'Most people can be trusted, WVS round 2010-2014',
           'Confidence in national government',
           'Democratic Quality',
           'Delivery Quality'])
drop_df.head()

Unnamed: 0,Country name,Year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.72359,7.16869,0.450662,50.799999,0.718114,0.177889,0.881686,0.517637,0.258195
1,Afghanistan,2009,4.401778,7.33379,0.552308,51.200001,0.678896,0.200178,0.850035,0.583926,0.237092
2,Afghanistan,2010,4.758381,7.386629,0.539075,51.599998,0.600127,0.134353,0.706766,0.618265,0.275324
3,Afghanistan,2011,3.831719,7.415019,0.521104,51.919998,0.495901,0.172137,0.731109,0.611387,0.267175
4,Afghanistan,2012,3.782938,7.517126,0.520637,52.240002,0.530935,0.244273,0.77562,0.710385,0.267919


In [83]:
drop_df.columns = drop_df.columns.str.lower()

In [84]:
drop_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 11 columns):
country name                        1704 non-null object
year                                1704 non-null int64
life ladder                         1704 non-null float64
log gdp per capita                  1676 non-null float64
social support                      1691 non-null float64
healthy life expectancy at birth    1676 non-null float64
freedom to make life choices        1675 non-null float64
generosity                          1622 non-null float64
perceptions of corruption           1608 non-null float64
positive affect                     1685 non-null float64
negative affect                     1691 non-null float64
dtypes: float64(9), int64(1), object(1)
memory usage: 146.5+ KB


In [85]:
renamed_df = drop_df.rename(columns = {'country name': 'country', 'life ladder': 'life_ladder', 
                                       'log gdp per capita': 'gdp_per_capita', 
                                       'social support': 'social_support',
                                       'healthy life expectancy at birth': 'life_expectancy', 
                                       'freedom to make life choices': 'freedom', 
                                       'perceptions of corruption': 'corruption',
                                       'positive affect': 'positive_affect', 
                                       'negative affect': 'negative_affect'})

# 'confidence in national government': 'conf_in_govt', 
# 'democratic quality': 'democratic_quality', 
# 'delivery quality': 'delivery_quality'

In [86]:
renamed_df.head()

Unnamed: 0,country,year,life_ladder,gdp_per_capita,social_support,life_expectancy,freedom,generosity,corruption,positive_affect,negative_affect
0,Afghanistan,2008,3.72359,7.16869,0.450662,50.799999,0.718114,0.177889,0.881686,0.517637,0.258195
1,Afghanistan,2009,4.401778,7.33379,0.552308,51.200001,0.678896,0.200178,0.850035,0.583926,0.237092
2,Afghanistan,2010,4.758381,7.386629,0.539075,51.599998,0.600127,0.134353,0.706766,0.618265,0.275324
3,Afghanistan,2011,3.831719,7.415019,0.521104,51.919998,0.495901,0.172137,0.731109,0.611387,0.267175
4,Afghanistan,2012,3.782938,7.517126,0.520637,52.240002,0.530935,0.244273,0.77562,0.710385,0.267919


In [87]:
renamed_df[renamed_df.gdp_per_capita.isnull()]

Unnamed: 0,country,year,life_ladder,gdp_per_capita,social_support,life_expectancy,freedom,generosity,corruption,positive_affect,negative_affect
385,Cyprus,2018,6.276443,,0.825573,73.699997,0.794215,,0.848337,0.750122,0.298021
801,Kosovo,2018,6.391826,,0.822407,65.149826,0.889737,,0.922078,0.778271,0.170248
872,Libya,2018,5.493978,,0.824165,62.299999,0.780559,,0.645839,0.705535,0.398903
960,Malta,2018,6.909711,,0.931542,72.199997,0.927341,,0.5952,0.721224,0.295699
1126,North Cyprus,2012,5.463305,,0.87115,,0.692568,,0.85473,0.709236,0.405435
1127,North Cyprus,2013,5.566803,,0.869274,,0.775383,,0.715356,0.621554,0.442972
1128,North Cyprus,2014,5.785979,,0.801802,,0.829677,,0.692221,0.723842,0.311336
1129,North Cyprus,2015,5.84255,,0.791383,,0.785353,,0.65918,0.701609,0.31893
1130,North Cyprus,2016,5.827128,,0.80769,,0.796234,,0.670191,0.643664,0.346465
1131,North Cyprus,2018,5.608056,,0.837392,,0.797066,,0.613837,0.480453,0.261868


### dropping all NaNs

In [88]:
clean_df = renamed_df.dropna(how = 'any')

In [89]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1512 entries, 0 to 1703
Data columns (total 11 columns):
country            1512 non-null object
year               1512 non-null int64
life_ladder        1512 non-null float64
gdp_per_capita     1512 non-null float64
social_support     1512 non-null float64
life_expectancy    1512 non-null float64
freedom            1512 non-null float64
generosity         1512 non-null float64
corruption         1512 non-null float64
positive_affect    1512 non-null float64
negative_affect    1512 non-null float64
dtypes: float64(9), int64(1), object(1)
memory usage: 141.8+ KB


## web scraping Wikipedia

In [90]:
from __future__ import print_function, division
import requests

requests.__path__

['/anaconda3/envs/metis/lib/python3.6/site-packages/requests']

In [91]:
url = 'https://en.wikipedia.org/wiki/List_of_cities_by_sunshine_duration'

response = requests.get(url)

In [92]:
response.status_code

200

In [93]:
print(response.text)

<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>List of cities by sunshine duration - Wikipedia</title>
<script>document.documentElement.className=document.documentElement.className.replace(/(^|\s)client-nojs(\s|$)/,"$1client-js$2");RLCONF={"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_cities_by_sunshine_duration","wgTitle":"List of cities by sunshine duration","wgCurRevisionId":899503868,"wgRevisionId":899503868,"wgArticleId":49711392,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 uses Chinese-language script (zh)","CS1 Chinese-language sources (zh)","Webarchive template wayback links","CS1 German-language sources (de)","CS1 Korean-language sources (ko)","Articles with Korean-language external links","CS1 Thai-language sources (th)","CS1 Vietnamese-language sources (vi)","CS1 French-language sources (fr)","CS1 Du

In [94]:
page = response.text

In [95]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(page, "html5")

In [96]:
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of cities by sunshine duration - Wikipedia
  </title>
  <script>
   document.documentElement.className=document.documentElement.className.replace(/(^|\s)client-nojs(\s|$)/,"$1client-js$2");RLCONF={"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_cities_by_sunshine_duration","wgTitle":"List of cities by sunshine duration","wgCurRevisionId":899503868,"wgRevisionId":899503868,"wgArticleId":49711392,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 uses Chinese-language script (zh)","CS1 Chinese-language sources (zh)","Webarchive template wayback links","CS1 German-language sources (de)","CS1 Korean-language sources (ko)","Articles with Korean-language external links","CS1 Thai-language sources (th)","CS1 Vietnamese-language sources (vi)","CS1 French-language sou

In [97]:
soup.a

<a id="top"></a>

In [98]:
soup.find('a').findNextSibling()

<div class="mw-body-content" id="siteNotice"><!-- CentralNotice --></div>

In [99]:
len(soup.find_all('a'))

1418

In [100]:
for link in soup.find_all('a'): 
    print(link)

<a id="top"></a>
<a class="mw-jump-link" href="#mw-head">Jump to navigation</a>
<a class="mw-jump-link" href="#p-search">Jump to search</a>
<a href="/wiki/Wikipedia:WikiProject_Lists#Incomplete_lists" title="Wikipedia:WikiProject Lists">dynamic list</a>
<a class="external text" href="//en.wikipedia.org/w/index.php?title=List_of_cities_by_sunshine_duration&amp;action=edit">expanding it</a>
<a href="/wiki/Wikipedia:Reliable_sources" title="Wikipedia:Reliable sources">reliably sourced</a>
<a class="external text" href="//en.wikipedia.org/w/index.php?title=List_of_cities_by_sunshine_duration&amp;action=edit">improve it</a>
<a href="/wiki/Talk:List_of_cities_by_sunshine_duration" title="Talk:List of cities by sunshine duration">talk page</a>
<a href="/wiki/Help:Maintenance_template_removal" title="Help:Maintenance template removal">Learn how and when to remove these template messages</a>
<a href="/wiki/Wikipedia:Article_size" title="Wikipedia:Article size">too long</a>
<a href="/wiki/Wikipe

<a class="external text" href="http://www.birao.climatemps.com/sunlight.php" rel="nofollow">"Sunshine &amp; Daylight Hours in Birao, Central African Republic"</a>
<a href="#cite_ref-69">^</a>
<a class="external text" href="http://www.dwd.de/DWD/klima/beratung/ak/ak_643900_kt.pdf" rel="nofollow">"Klimatafel von Bujumbura (Usambara) / Burundi"</a>
<a href="#cite_ref-70">^</a>
<a class="external text" href="ftp://ftp.atdd.noaa.gov/pub/GCOS/WMO-Normals/TABLES/REG__I/GW/61832.TXT" rel="nofollow">"Conakry Climate Normals 1961–1990"</a>
<a href="#cite_ref-71">^</a>
<a class="external text" href="ftp://ftp.atdd.noaa.gov/pub/GCOS/WMO-Normals/TABLES/REG__I/GW/61829.TXT" rel="nofollow">"Kankan Climate Normals 1961–1990"</a>
<a href="#cite_ref-72">^</a>
<a class="external text" href="http://www.ucm.es/info/cif/station/gu-bissa.htm" rel="nofollow">"Bissau Climate Guide"</a>
<a href="#cite_ref-73">^</a>
<a class="external text" href="http://www.dwd.de/DWD/klima/beratung/ak/ak_648200_kt.pdf" rel="nof

In [101]:
[link for link in soup.find_all('a') if 'Ivory Coast' in str(link)]

[<a href="/wiki/Ivory_Coast" title="Ivory Coast">Ivory Coast</a>,
 <a href="/wiki/Ivory_Coast" title="Ivory Coast">Ivory Coast</a>,
 <a href="/wiki/Ivory_Coast" title="Ivory Coast">Ivory Coast</a>,
 <a href="/wiki/Ivory_Coast" title="Ivory Coast">Ivory Coast</a>,
 <a href="/wiki/Ivory_Coast" title="Ivory Coast">Ivory Coast</a>]

In [102]:
chain = [x.find_all('td') for x in soup.find_all(class_='mw-indicators mw-body-content')]

In [103]:
chain[0]

[]

In [104]:
# soup.find(class_='mp_box_content').find_all('td')[1].text

In [105]:
# soup.find(class_='mp_box_content').find_all('td')[1].text[1:] 

## Linear Regression

In [106]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import RidgeCV
%matplotlib inline

In [107]:
lr_df = clean_df.drop(columns = ['country', 'year'])

### Train, test, split

In [108]:
x, y = lr_df.drop('life_ladder', axis=1), lr_df['life_ladder']

In [109]:
x, x_test, y, y_test = train_test_split(x, y, test_size=.2, random_state=10)

In [110]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=.25, random_state=3)

In [111]:
lm = LinearRegression()

scaler = StandardScaler()

x_train_scaled = scaler.fit_transform(x_train.values)
x_val_scaled = scaler.transform(x_val.values)
x_test_scaled = scaler.transform(x_test.values)

lm_reg = Ridge(alpha=1)


poly = PolynomialFeatures(degree=2) 

x_train_poly = poly.fit_transform(x_train.values)
x_val_poly = poly.transform(x_val.values)
x_test_poly = poly.transform(x_test.values)

lm_poly = LinearRegression()

In [116]:
lm.fit(x_train, y_train)
print(f'Linear Regression val R^2: {lm.score(x_val, y_val):.3f}')

lm_reg.fit(x_train_scaled, y_train)
print(f'Ridge Regression val R^2: {lm_reg.score(x_val_scaled, y_val):.3f}')

lm_poly.fit(x_train_poly, y_train)
print(f'Degree 2 polynomial regression val R^2: {lm_poly.score(x_val_poly, y_val):.3f}')

#We can see that a degree 2 polynomial regression seems to be the best fit due to the highest R^2

Linear Regression val R^2: 0.748
Ridge Regression val R^2: 0.748
Degree 2 polynomial regression val R^2: 0.790


### Cross validation

In [112]:
# lr_df.corr()

In [113]:
# sns.pairplot(lr_df)

In [114]:
# lr_df.head()

In [115]:
# lr = LinearRegression()

# x = lr_df.iloc[:, 1:]
# y = lr_df.iloc[:, 0]

# lr.fit(x, y)
# lr.score(x, y)