In [36]:
import numpy as np 
import pandas as pd 
import plotly.graph_objs as go
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn import linear_model, datasets
from sklearn.metrics import mean_squared_error
from subprocess import check_output
import statsmodels.formula.api as stats
from statsmodels.formula.api import ols

In [2]:
d2015=pd.read_csv("./dataset/2015.csv")
d2016=pd.read_csv("./dataset/2016.csv")
d2017=pd.read_csv("./dataset/2017.csv")


In [10]:
#Happiness in the world 2015
data = dict(type = 'choropleth', locations = d2015['Country'], locationmode = 'country names', z = d2015['Happiness Rank'], text = d2015['Country'], colorscale = 'Viridis', reversescale = False)
layout = dict(title = 'Global Happiness 2015',  geo = dict(showframe = False,  projection = {'type': 'orthographic'}))
choromap = go.Figure(data = [data], layout=layout)
iplot(choromap)

In [22]:
#Happiness in the world 2016
data = dict(type = 'choropleth', locations = d2016['Country'], locationmode = 'country names', z = d2016['Happiness Rank'], text = d2016['Country'], colorscale = 'Viridis', reversescale = False)
layout = dict(title = 'Global Happiness 2016',  geo = dict(showframe = False,  projection = {'type': 'mollweide'}))
choromap = go.Figure(data = [data], layout=layout)
iplot(choromap)

In [52]:
#Happiness in the world 2017
data = dict(type = 'choropleth', locations = d2017['Country'], locationmode = 'country names', z = d2017['Happiness.Rank'], text = d2017['Country'], colorscale = 'Viridis', reversescale = False)
layout = dict(title = 'Global Happiness 2017',  geo = dict(showframe = False,  projection = {'type': 'mollweide'}))
choromap = go.Figure(data = [data], layout=layout)
iplot(choromap)

In [23]:
d2015.columns = ['Country', 'Region', 'Happiness_Rank', 'Happiness_Score',
       'Standard Error', 'Economy', 'Family',
       'Health', 'Freedom', 'Trust',
       'Generosity', 'Dystopia_Residual']

In [24]:
columns_2015 = ['Region', 'Standard Error']
new_dropped_2015 = d2015.drop(columns_2015, axis=1)

In [25]:
columns_2016 = ['Region', 'Lower Confidence Interval','Upper Confidence Interval' ]
dropped_2016 = d2016.drop(columns_2016, axis=1)
dropped_2016.columns = ['Country', 'Happiness_Rank', 'Happiness_Score','Economy', 'Family',
       'Health', 'Freedom', 'Trust',
       'Generosity', 'Dystopia_Residual']

In [26]:
columns_2017 = ['Whisker.high','Whisker.low' ]
dropped_2017 = d2017.drop(columns_2017, axis=1)
dropped_2017.columns = ['Country', 'Happiness_Rank', 'Happiness_Score','Economy', 'Family',
       'Health', 'Freedom', 'Trust',
       'Generosity', 'Dystopia_Residual']

In [27]:
frames = [new_dropped_2015, dropped_2016, dropped_2017]
happiness = pd.concat(frames)

In [28]:
happiness.head()

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,Economy,Family,Health,Freedom,Trust,Generosity,Dystopia_Residual
0,Switzerland,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,4,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [29]:
happiness.describe()

Unnamed: 0,Happiness_Rank,Happiness_Score,Economy,Family,Health,Freedom,Trust,Generosity,Dystopia_Residual
count,470.0,470.0,470.0,470.0,470.0,470.0,470.0,470.0,470.0
mean,78.829787,5.370728,0.92783,0.990347,0.579968,0.402828,0.175605,0.201426,2.092717
std,45.281408,1.136998,0.415584,0.318707,0.240161,0.150356,0.131909,0.133211,0.565772
min,1.0,2.693,0.0,0.0,0.0,0.0,0.0,0.0,0.32858
25%,40.0,4.509,0.605292,0.793,0.402301,0.297615,0.075792,0.098303,1.737975
50%,79.0,5.2825,0.995439,1.025665,0.630053,0.418347,0.139081,0.181624,2.09464
75%,118.0,6.23375,1.252443,1.228745,0.768298,0.51685,0.249839,0.275505,2.455575
max,158.0,7.587,1.870766,1.610574,1.02525,0.66973,0.838075,0.81971,3.83772


In [37]:
dataMap = dict(type = 'choropleth', 
           locations = happiness['Country'],
           locationmode = 'country names',
           z = happiness['Happiness_Rank'], 
           text = happiness['Country'],
          colorscale = 'Viridis', reversescale = False)
layout = dict(title = 'Happiness Rank Across the World', 
             geo = dict(showframe = False, 
                       projection = {'type': 'mercator'}))
choroMap = go.Figure(data = [dataMap], layout=layout)
iplot(choroMap)

In [38]:
scoreMap = dict(type = 'choropleth', 
           locations = happiness['Country'],
           locationmode = 'country names',
           z = happiness['Happiness_Score'], 
           text = happiness['Country'],
           colorbar = {'title':'Happiness'})
layout = dict(title = 'Happiness Score Across the World', 
             geo = dict(showframe = False, 
                       projection = {'type': 'mercator'}))
choroScoreMap = go.Figure(data = [scoreMap], layout=layout)
iplot(choroScoreMap)

In [39]:
traceSR = go.Scatter(
    x = happiness.Happiness_Score,
    y = happiness.Happiness_Rank,
    mode = 'markers'
)
dataSR = [traceSR]
layout = go.Layout(
    title='Happiness Rank Determined by Score',
    xaxis=dict(
        title='Happiness Score',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='Happiness Rank',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)

mapSR = go.Figure(data=dataSR, layout=layout)
iplot(mapSR)

In [40]:
drop_rank = happiness.drop("Happiness_Rank", axis = 1)

In [41]:
corr_matrix_happy = drop_rank.corr()
trace_corr_happy = go.Heatmap(z=np.array(corr_matrix_happy), x=corr_matrix_happy.columns, y=corr_matrix_happy.columns)
data_happy=[trace_corr_happy]
iplot(data_happy)

In [42]:
dropped_happy = happiness.drop(["Country", "Happiness_Rank"], axis=1)
dropped_happy.head()

Unnamed: 0,Happiness_Score,Economy,Family,Health,Freedom,Trust,Generosity,Dystopia_Residual
0,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [43]:
#http://bigdata-madesimple.com/how-to-run-linear-regression-in-python-scikit-learn/
from sklearn.linear_model import LinearRegression
X = dropped_happy.drop("Happiness_Score", axis = 1)
lm = LinearRegression()
lm.fit(X, dropped_happy.Happiness_Score)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [45]:
print("Estimated Intercept is", lm.intercept_)
print("The number of coefficients in this model are", lm.coef_)

Estimated Intercept is 0.0001289623008355889
The number of coefficients in this model are [1.00004071 1.00000537 0.99986919 0.99991212 1.00001962 1.00000574
 0.99997241]


In [46]:
coef = zip(X.columns, lm.coef_)
coef_df = pd.DataFrame(list(zip(X.columns, lm.coef_)), columns=['features', 'coefficients'])
coef_df

Unnamed: 0,features,coefficients
0,Economy,1.000041
1,Family,1.000005
2,Health,0.999869
3,Freedom,0.999912
4,Trust,1.00002
5,Generosity,1.000006
6,Dystopia_Residual,0.999972


In [47]:
lm.predict(X)[0:100]

array([7.58691191, 7.560861  , 7.52704145, 7.522182  , 7.42689742,
       7.40604306, 7.37811   , 7.36365823, 7.28604061, 7.28401816,
       7.27761792, 7.22569218, 7.1998147 , 7.18723216, 7.11947789,
       6.98253559, 6.94621954, 6.94025306, 6.93734953, 6.90135796,
       6.86723098, 6.8529541 , 6.81017073, 6.79776404, 6.78621149,
       6.75014685, 6.6696396 , 6.61131216, 6.57505944, 6.57402014,
       6.50503944, 6.48514157, 6.47675891, 6.45466954, 6.41109998,
       6.32891442, 6.30244615, 6.29810684, 6.29478909, 6.26925937,
       6.16769458, 6.13012072, 6.12278768, 6.00281151, 5.99505193,
       5.98709189, 5.98370068, 5.97522753, 5.95968072, 5.94813443,
       5.88974598, 5.8889447 , 5.87834627, 5.85521144, 5.84770394,
       5.83259087, 5.82832455, 5.82408932, 5.81252356, 5.79062338,
       5.77043115, 5.75851629, 5.75438789, 5.71584   , 5.70887154,
       5.69532019, 5.68871723, 5.60460384, 5.58855787, 5.54819768,
       5.47706824, 5.47388764, 5.42860147, 5.39872564, 5.36031

In [48]:
trace = go.Scatter(
    x = lm.predict(X),
    y = dropped_happy.Happiness_Score,
    mode = 'lines+markers'
)
data = [trace]
layout = go.Layout(
    title='Happiness Score vs. Predicted Happiness Score',
    xaxis=dict(
        title='Happiness Score',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='Predicted Happiness Score',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [49]:
msehappy = np.mean((dropped_happy.Happiness_Score - lm.predict(X)) ** 2 ) 
print(msehappy)

8.18694450918229e-08


In [50]:
lm2=LinearRegression()
lm2.fit(X[['Family']], dropped_happy.Happiness_Score)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [51]:
msefamily = np.mean((dropped_happy.Happiness_Score - lm2.predict(X[['Family']])) **2)
print(msefamily)

0.7673353072501177
