In [2]:
# Core libraries used throughout the notebook
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # only the pyplot interface is needed
sb.set() # apply Seaborn's default style to subsequent plots

In [3]:
# Build one raw dataset from the yearly Happiness Report files (2015-2020)

# One CSV per report year, listed in chronological order
report_files = (
    '2015_report.csv',
    '2016_report.csv',
    '2017_report.csv',
    '2018_report.csv',
    '2019_report.csv',
    '2020_report.csv',
)

# Read each file; the per-year frames stay bound to df1..df6
df1, df2, df3, df4, df5, df6 = map(pd.read_csv, report_files)

# Stack the yearly frames on top of each other (row-wise)
yearly_frames = [df1, df2, df3, df4, df5, df6]
combined_df = pd.concat(yearly_frames)

# Write the stacked data to disk without the row index
combined_df.to_csv('combined_happiness_report_raw.csv', index=False)

In [4]:
# Load the combined raw dataset from disk and preview the first rows.
# The file name must match the name used when the file was written
# ('combined_happiness_report_raw.csv', all lowercase): a different
# capitalisation raises FileNotFoundError on case-sensitive filesystems.
Report = pd.read_csv('combined_happiness_report_raw.csv')
Report.head()

Unnamed: 0,country,happiness_score,gdp_per_capita,family,health,freedom,generosity,government_trust,dystopia_residual,continent,social_support
0,Norway,7.537,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027,Europe,
1,Denmark,7.522,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707,Europe,
2,Iceland,7.504,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715,Europe,
3,Switzerland,7.494,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716,Europe,
4,Finland,7.469,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182,Europe,


In [5]:
# Remove any fully duplicated rows from the DataFrame
Report = Report.drop_duplicates()

In [6]:
# Derive a separate frame from the dataset, renaming 'health' so the
# column label reflects what the datapoint describes
Report_clean = Report.rename(columns={'health': 'life_expectancy'})

# Normalise every variable name to UPPERCASE
Report_clean.columns = Report_clean.columns.str.upper()

# Show column names, non-null counts and dtypes as a sanity check
Report_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 828 entries, 0 to 827
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   COUNTRY            828 non-null    object 
 1   HAPPINESS_SCORE    828 non-null    float64
 2   GDP_PER_CAPITA     828 non-null    float64
 3   FAMILY             414 non-null    float64
 4   LIFE_EXPECTANCY    828 non-null    float64
 5   FREEDOM            828 non-null    float64
 6   GENEROSITY         828 non-null    float64
 7   GOVERNMENT_TRUST   828 non-null    float64
 8   DYSTOPIA_RESIDUAL  552 non-null    float64
 9   CONTINENT          828 non-null    object 
 10  SOCIAL_SUPPORT     414 non-null    float64
dtypes: float64(9), object(2)
memory usage: 77.6+ KB


In [8]:
# Drop the COUNTRY and CONTINENT columns, which are irrelevant to our analysis
df = Report_clean.drop(columns=['COUNTRY', 'CONTINENT'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 828 entries, 0 to 827
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   HAPPINESS_SCORE    828 non-null    float64
 1   GDP_PER_CAPITA     828 non-null    float64
 2   FAMILY             414 non-null    float64
 3   LIFE_EXPECTANCY    828 non-null    float64
 4   FREEDOM            828 non-null    float64
 5   GENEROSITY         828 non-null    float64
 6   GOVERNMENT_TRUST   828 non-null    float64
 7   DYSTOPIA_RESIDUAL  552 non-null    float64
 8   SOCIAL_SUPPORT     414 non-null    float64
dtypes: float64(9)
memory usage: 64.7 KB


In [9]:
# Save the cleaned frame to a new CSV file, leaving out the row index
df.to_csv(
    'combined_happiness_report_cleaned.csv',
    index=False,
)

In [10]:
# Load the cleaned CSV file and preview the first rows.
# NOTE: this rebinds `Report`, which from here on refers to the cleaned data.
Report = pd.read_csv(
    'combined_happiness_report_cleaned.csv'
)
Report.head()

Unnamed: 0,HAPPINESS_SCORE,GDP_PER_CAPITA,FAMILY,LIFE_EXPECTANCY,FREEDOM,GENEROSITY,GOVERNMENT_TRUST,DYSTOPIA_RESIDUAL,SOCIAL_SUPPORT
0,7.537,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027,
1,7.522,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707,
2,7.504,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715,
3,7.494,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716,
4,7.469,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182,


In [11]:
# Count the missing values in each column of the DataFrame
Report.isna().sum()

HAPPINESS_SCORE        0
GDP_PER_CAPITA         0
FAMILY               414
LIFE_EXPECTANCY        0
FREEDOM                0
GENEROSITY             0
GOVERNMENT_TRUST       0
DYSTOPIA_RESIDUAL    276
SOCIAL_SUPPORT       414
dtype: int64

In [12]:
# Numeric variables, in the order they should appear in the summary
numeric_columns = [
    "HAPPINESS_SCORE",
    "GDP_PER_CAPITA",
    "FAMILY",
    "SOCIAL_SUPPORT",
    "LIFE_EXPECTANCY",
    "FREEDOM",
    "GENEROSITY",
    "GOVERNMENT_TRUST",
    "DYSTOPIA_RESIDUAL",
]

# Build a separate DataFrame holding only those variables
newDF = Report[numeric_columns].copy()

# Summary statistics for all selected variables
newDF.describe()

Unnamed: 0,HAPPINESS_SCORE,GDP_PER_CAPITA,FAMILY,SOCIAL_SUPPORT,LIFE_EXPECTANCY,FREEDOM,GENEROSITY,GOVERNMENT_TRUST,DYSTOPIA_RESIDUAL
count,828.0,828.0,414.0,414.0,828.0,828.0,828.0,828.0,552.0
mean,5.456889,0.921591,1.005974,1.212815,0.64307,0.423486,0.20965,0.123755,2.077039
std,1.119641,0.389438,0.314665,0.278733,0.238114,0.147411,0.12032,0.107262,0.550422
min,2.5669,0.0,0.0,0.352428,0.0,0.0,0.0,0.0,0.257241
25%,4.571,0.628311,0.807508,1.0565,0.497565,0.319353,0.120956,0.05298,1.737234
50%,5.4441,0.988765,1.03521,1.258543,0.68052,0.435641,0.193935,0.088507,2.092675
75%,6.2837,1.229108,1.245378,1.4405,0.816625,0.541275,0.270015,0.15121,2.432253
max,7.8087,2.096,1.610574,1.644,1.141,0.724,0.838075,0.55191,3.60214
