In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

# Load the alcohol-consumption table from a CSV file given by a relative path,
# so it is resolved against the notebook's working directory.
adf = pd.read_csv('alcohol.csv')

This is our dataset of alcohol consumption per capita by country

In [2]:
# Preview the first five rows to check the column names and spot missing values.
adf.head()

Unnamed: 0,Entity,Code,Year,"Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)","GDP per capita, PPP (constant 2017 international $)",Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
1,Afghanistan,AFG,2010,0.21,1957.02907,29185511.0,
2,Afghanistan,AFG,2015,0.21,2068.265904,34413603.0,Asia
3,Afghanistan,AFG,2018,0.21,2033.804389,37171922.0,
4,Afghanistan,AFG,2002,,1189.784668,22600774.0,


We need to rename the column called Entity to Country or region so that it matches the key column of the happiness df we will load later and the two can be merged

In [3]:
# Rename the key column to 'Country or region', the name used by the happiness
# data loaded later in the notebook, so the two frames can be merged on it.
# The renamed frame is rebound to `adf` rather than using inplace=True, which
# avoids mutating the frame in place and keeps the call chainable.
adf = adf.rename(columns={'Entity': 'Country or region'})

We will check that it worked

In [4]:
# Preview again to confirm the column is now called 'Country or region'.
adf.head()

Unnamed: 0,Country or region,Code,Year,"Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)","GDP per capita, PPP (constant 2017 international $)",Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
1,Afghanistan,AFG,2010,0.21,1957.02907,29185511.0,
2,Afghanistan,AFG,2015,0.21,2068.265904,34413603.0,Asia
3,Afghanistan,AFG,2018,0.21,2033.804389,37171922.0,
4,Afghanistan,AFG,2002,,1189.784668,22600774.0,


We only want the columns for total alcohol consumption, the year, and Country or region, so we will make a new dataframe

In [5]:
# Keep only the merge key, the year and the alcohol-consumption measure.
adf_edit = adf.loc[
    :,
    [
        'Country or region',
        'Year',
        'Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)',
    ],
]

In [6]:
# Preview the three retained columns.
adf_edit.head()

Unnamed: 0,Country or region,Year,"Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)"
0,Abkhazia,2015,
1,Afghanistan,2010,0.21
2,Afghanistan,2015,0.21
3,Afghanistan,2018,0.21
4,Afghanistan,2002,


We need to eliminate all rows with NaN as a value so we will make a new df

In [7]:
# Drop every row that has a missing value in any of the retained columns.
adf_edit2 = adf_edit.dropna(axis=0, how='any')

In [8]:
# Preview the result; the original row labels are kept, so gaps in the index
# mark the rows that were dropped.
adf_edit2.head()

Unnamed: 0,Country or region,Year,"Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)"
1,Afghanistan,2010,0.21
2,Afghanistan,2015,0.21
3,Afghanistan,2018,0.21
519,Africa Eastern and Southern,2000,5.014051
520,Africa Eastern and Southern,2005,4.856588


For our sample, we only want data from 2018 so we will again make a new dataframe

In [9]:
# Restrict the sample to observations recorded for the year 2018.
adf_2018 = adf_edit2.loc[adf_edit2["Year"].eq(2018)]

In [10]:
# Preview the first ten 2018 rows.
adf_2018.head(10)

Unnamed: 0,Country or region,Year,"Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)"
3,Afghanistan,2018,0.21
523,Africa Eastern and Southern,2018,5.170911
554,Africa Western and Central,2018,6.835266
586,Albania,2018,7.17
845,Algeria,2018,0.95
1218,Andorra,2018,11.02
1477,Angola,2018,6.94
1858,Antigua and Barbuda,2018,6.38
2117,Arab World,2018,0.618598
2148,Argentina,2018,9.65


Now we want to remove the year column from this dataframe: every remaining row is from 2018, so there is no need to show it in the table

In [41]:
# Keep the country and the alcohol measure only; the year is constant here.
adf2_2018 = adf_2018.loc[
    :,
    [
        'Country or region',
        'Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)',
    ],
]

In [43]:
# Preview the first ten rows of the two-column 2018 frame.
adf2_2018.head(10)

Unnamed: 0,Country or region,"Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)"
3,Afghanistan,0.21
523,Africa Eastern and Southern,5.170911
554,Africa Western and Central,6.835266
586,Albania,7.17
845,Algeria,0.95
1218,Andorra,11.02
1477,Angola,6.94
1858,Antigua and Barbuda,6.38
2117,Arab World,0.618598
2148,Argentina,9.65


We are going to order the df by country name

In [44]:
# Sort the 2018 alcohol rows alphabetically by country/region name.
order_adf_2018 = adf2_2018.sort_values(by='Country or region', ascending=True)

In [45]:
# Preview the first five rows of the sorted frame.
order_adf_2018.head()

Unnamed: 0,Country or region,"Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)"
3,Afghanistan,0.21
523,Africa Eastern and Southern,5.170911
554,Africa Western and Central,6.835266
586,Albania,7.17
845,Algeria,0.95


We want to calculate the average alcohol consumption

In [46]:
# Mean 2018 consumption over every row of the frame. Aggregate entries such as
# 'Africa Eastern and Southern' and 'Arab World' are still present at this
# point, so they contribute to this figure alongside individual countries.
order_adf_2018.loc[
    :,
    'Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)',
].mean()

5.965732931054613

Now we need to load our second df for 2018 happiness scores

In [47]:
# Load the 2018 happiness table from a CSV file given by a relative path,
# so it is resolved against the notebook's working directory.
hdf = pd.read_csv('happy_2018.csv')

In [48]:
# Preview the first five rows to see which columns the happiness data provides.
hdf.head()

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393
1,2,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.34
2,3,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408
3,4,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138
4,5,Switzerland,7.487,1.42,1.549,0.927,0.66,0.256,0.357


We only want the columns for the country and the score so we will make a new df

In [49]:
# Keep only the merge key and the happiness score.
hdf2 = hdf.loc[:, ['Country or region', 'Score']]

In [50]:
# Preview the two retained columns.
hdf2.head()

Unnamed: 0,Country or region,Score
0,Finland,7.632
1,Norway,7.594
2,Denmark,7.555
3,Iceland,7.495
4,Switzerland,7.487


We are going to order the df by country name

In [51]:
# Sort the happiness rows alphabetically by country/region name.
order_hdf = hdf2.sort_values(by='Country or region', ascending=True)

In [52]:
# Preview the first ten rows of the sorted happiness frame.
order_hdf.head(10)

Unnamed: 0,Country or region,Score
144,Afghanistan,3.632
111,Albania,4.586
83,Algeria,5.295
141,Angola,3.795
28,Argentina,6.388
128,Armenia,4.321
9,Australia,7.272
11,Austria,7.139
86,Azerbaijan,5.201
42,Bahrain,6.105


We want to calculate the average happiness score

In [53]:
# Mean happiness score over every row of the happiness frame.
order_hdf.loc[:, 'Score'].mean()

5.375916666666666

Our next step is to merge the two df into one

In [54]:
# Outer-join the two frames on the country name. Rows present in only one of
# the inputs are kept, with NaN in the columns that come from the other input.
comb_df = order_adf_2018.merge(
    order_hdf,
    on='Country or region',
    how='outer',
)

In [55]:
# Preview the merged frame; NaN in Score marks entries with no happiness row.
comb_df.head()

Unnamed: 0,Country or region,"Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)",Score
0,Afghanistan,0.21,3.632
1,Africa Eastern and Southern,5.170911,
2,Africa Western and Central,6.835266,
3,Albania,7.17,4.586
4,Algeria,0.95,5.295


Now that they are merged, we need to eliminate any rows without a score and make a new df

In [56]:
# Keep only the rows that have both an alcohol value and a happiness score.
final_comb_df = comb_df.dropna(axis=0, how='any')

In [57]:
# Preview the complete rows that remain after dropping missing values.
final_comb_df.head()

Unnamed: 0,Country or region,"Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)",Score
0,Afghanistan,0.21,3.632
3,Albania,7.17,4.586
4,Algeria,0.95,5.295
6,Angola,6.94,3.795
9,Argentina,9.65,6.388


We want to answer the following question:
Do countries with a happiness score greater than 7 have a mean total alcohol consumption per capita equal to 4 liters of pure alcohol?
To answer this we will run a two-sided, one-sample t-test.
Our hypotheses are:
Null: $H_0: \mu = 4$
Alternative: $H_1: \mu \neq 4$

So first we need to make a dataframe with countries with happiness scores above 7

In [58]:
# Select the countries whose happiness score is strictly greater than 7.
df_above_7 = final_comb_df.loc[final_comb_df["Score"].gt(7)]

In [59]:
# Preview the first five countries in the high-happiness sample.
df_above_7.head()

Unnamed: 0,Country or region,"Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)",Score
11,Australia,10.51,7.272
12,Austria,11.96,7.139
33,Canada,8.94,7.328
44,Costa Rica,4.87,7.072
51,Denmark,10.26,7.555


Now we have our sample of scores above 7 from 2018. To run the t-test the only data from the sample we need is the total alcohol consumption. So we will make a new dataframe with only that data

In [60]:
# Keep only the alcohol measure, as a one-column DataFrame, for the t-test.
test_df = df_above_7.loc[
    :,
    ['Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)'],
]

In [61]:
# Ask for up to 14 rows so the whole sample is requested: the degrees of
# freedom printed further down are 12, i.e. the sample has 13 rows.
test_df.head(14)

Unnamed: 0,"Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)"
11,10.51
12,11.96
33,8.94
44,4.87
51,10.26
73,10.78
97,9.12
152,9.61
153,10.63
160,7.41


Now we can run the t-test and get the t-stat. We will use a significance level (alpha) of .05, as that is the most commonly used value

In [62]:
# One-sample t-test of the sample mean against the hypothesised mean of 4.
# Because a one-column DataFrame is passed, the statistic and p-value come
# back as length-1 arrays rather than plain scalars.
stats.ttest_1samp(
    a=test_df,
    popmean=4,
)

Ttest_1sampResult(statistic=array([10.64505808]), pvalue=array([1.81533951e-07]))

The t test statistic is 10.64505808 and the corresponding two-sided p-value is 1.81533951e-07

We need to get the t-critical value and compare it to the t-stat in order to determine whether we reject or fail to reject the null. To get this we first need to determine the degrees of freedom

In [63]:
# Degrees of freedom for a one-sample t-test: number of observations minus one.
dof = test_df.shape[0] - 1

In [64]:
# Show the degrees of freedom that the critical-value lookup will use.
print(dof)

12


So our degrees of freedom is 12

Now we use our chosen alpha level and the degrees of freedom to find the t-critical value

In [65]:
# Two-sided critical value: put alpha/2 in each tail and take the upper
# quantile of Student's t distribution. The degrees of freedom come from the
# `dof` variable computed earlier instead of a hard-coded 12, so the critical
# value stays correct if the sample size changes.
alpha = 0.05
stats.t.ppf(1 - alpha / 2, dof)

2.1788128296634177

We can see our t-critical value is 2.179

Our t test statistic of 10.645 is more extreme than our t-critical value of 2.179, which tells us we should reject the null hypothesis. Additionally, our p-value of 1.815e-07 is less than our alpha of .05, which also tells us we should reject the null. So we can conclude that the mean total alcohol consumption per capita of countries with a happiness score greater than 7 is not equal to 4.