In [50]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

In [51]:
# Importing the new CSV file
Report = pd.read_csv('combined_happiness_report_cleaned.csv')
Report.head()

Unnamed: 0,HAPPINESS_SCORE,GDP_PER_CAPITA,FAMILY,LIFE_EXPECTANCY,FREEDOM,GENEROSITY,GOVERNMENT_TRUST,DYSTOPIA_RESIDUAL,SOCIAL_SUPPORT
0,7.537,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027,
1,7.522,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707,
2,7.504,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715,
3,7.494,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716,
4,7.469,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182,


In [52]:
# Checking for missing values in the DataFrame
Report.isnull().sum()

HAPPINESS_SCORE        0
GDP_PER_CAPITA         0
FAMILY               414
LIFE_EXPECTANCY        0
FREEDOM                0
GENEROSITY             0
GOVERNMENT_TRUST       0
DYSTOPIA_RESIDUAL    276
SOCIAL_SUPPORT       414
dtype: int64

In [53]:
# Print out header values of the dataset
num_df = Report
num_df.head()

Unnamed: 0,HAPPINESS_SCORE,GDP_PER_CAPITA,FAMILY,LIFE_EXPECTANCY,FREEDOM,GENEROSITY,GOVERNMENT_TRUST,DYSTOPIA_RESIDUAL,SOCIAL_SUPPORT
0,7.537,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027,
1,7.522,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707,
2,7.504,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715,
3,7.494,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716,
4,7.469,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182,


In [54]:
# Save the Dataframe to a new CSV file to update data
num_df.to_csv('new_combined_happiness_report_cleaned.csv', index = False)

## Linear Regression to fill in NULL values (FAMILY)

In [56]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Load the data
data = pd.read_csv('new_combined_happiness_report_cleaned.csv')

# Separate the data into two dataframes
data_missing = data[data['FAMILY'].isna()]
data_complete = data

# Impute missing values in the complete data using the mean of the column
imputer = SimpleImputer()
data_complete = pd.DataFrame(imputer.fit_transform(data_complete), columns=data_complete.columns)

# Split the complete data into training and testing sets, using life expectancy, freedom, generosity, government trust and GDP per capita as predictors
X = data_complete[['LIFE_EXPECTANCY', 'FREEDOM', 'GENEROSITY', 'GOVERNMENT_TRUST', 'GDP_PER_CAPITA']]
y = data_complete['FAMILY']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train the model
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Impute missing values in the data_missing dataframe
data_missing = pd.DataFrame(imputer.transform(data_missing), columns=data_missing.columns)

# Predict the missing values of the 'family' variable
X_missing = data_missing[['LIFE_EXPECTANCY','FREEDOM', 'GENEROSITY', 'GOVERNMENT_TRUST', 'GDP_PER_CAPITA']]
y_missing_pred = regressor.predict(X_missing)

# Replace the missing values in the original dataframe with the predicted values
data.loc[data['FAMILY'].isna(), 'FAMILY'] = y_missing_pred

# Print the new dataset with imputed missing values and predicted values
print(data)

     HAPPINESS_SCORE  GDP_PER_CAPITA    FAMILY  LIFE_EXPECTANCY   FREEDOM  \
0             7.5370        1.616463  1.533524         0.796667  0.635423   
1             7.5220        1.482383  1.551122         0.792566  0.626007   
2             7.5040        1.480633  1.610574         0.833552  0.627163   
3             7.4940        1.564980  1.516912         0.858131  0.620071   
4             7.4690        1.443572  1.540247         0.809158  0.617951   
..               ...             ...       ...              ...       ...   
823           3.4789        0.997549  1.041802         0.494102  0.509089   
824           3.4762        0.457163  0.908063         0.442678  0.509343   
825           3.3123        0.343243  0.845626         0.572383  0.604088   
826           3.2992        0.425564  0.882111         0.375038  0.377405   
827           2.5669        0.300706  0.768374         0.266052  0.000000   

     GENEROSITY  GOVERNMENT_TRUST  DYSTOPIA_RESIDUAL  SOCIAL_SUPPORT  
0   

In [57]:
# Check to ensure no more NULL values for Family
data.isnull().sum()

HAPPINESS_SCORE        0
GDP_PER_CAPITA         0
FAMILY                 0
LIFE_EXPECTANCY        0
FREEDOM                0
GENEROSITY             0
GOVERNMENT_TRUST       0
DYSTOPIA_RESIDUAL    276
SOCIAL_SUPPORT       414
dtype: int64

In [58]:
# Save the dataset to a new CSV file to update data
data.to_csv('new_data.csv', index = False)

## Linear Regression to fill in NULL values (SOCIAL_SUPPORT)

In [59]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Load the data
data = pd.read_csv('new_data.csv')

# Separate the data into two dataframes
data_missing = data[data['SOCIAL_SUPPORT'].isna()]
data_complete = data

# Impute missing values in the complete data using the mean of the column
imputer = SimpleImputer()
data_complete = pd.DataFrame(imputer.fit_transform(data_complete), columns=data_complete.columns)

# Split the complete data into training and testing sets, using life expectancy, freedom, generosity, government trust and GDP per capita as predictors
X = data_complete[['LIFE_EXPECTANCY', 'FREEDOM', 'GENEROSITY', 'GOVERNMENT_TRUST', 'GDP_PER_CAPITA']]
y = data_complete['SOCIAL_SUPPORT']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train the model
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Impute missing values in the data_missing dataframe
data_missing = pd.DataFrame(imputer.transform(data_missing), columns=data_missing.columns)

# Predict the missing values of the 'social support'
X_missing = data_missing[['LIFE_EXPECTANCY','FREEDOM', 'GENEROSITY', 'GOVERNMENT_TRUST', 'GDP_PER_CAPITA']]
y_missing_pred = regressor.predict(X_missing)

# Replace the missing values in the original dataframe with the predicted values
data.loc[data['SOCIAL_SUPPORT'].isna(), 'SOCIAL_SUPPORT'] = y_missing_pred

# Print the new dataset with imputed missing values and predicted values
print(data)

     HAPPINESS_SCORE  GDP_PER_CAPITA    FAMILY  LIFE_EXPECTANCY   FREEDOM  \
0             7.5370        1.616463  1.533524         0.796667  0.635423   
1             7.5220        1.482383  1.551122         0.792566  0.626007   
2             7.5040        1.480633  1.610574         0.833552  0.627163   
3             7.4940        1.564980  1.516912         0.858131  0.620071   
4             7.4690        1.443572  1.540247         0.809158  0.617951   
..               ...             ...       ...              ...       ...   
823           3.4789        0.997549  1.041802         0.494102  0.509089   
824           3.4762        0.457163  0.908063         0.442678  0.509343   
825           3.3123        0.343243  0.845626         0.572383  0.604088   
826           3.2992        0.425564  0.882111         0.375038  0.377405   
827           2.5669        0.300706  0.768374         0.266052  0.000000   

     GENEROSITY  GOVERNMENT_TRUST  DYSTOPIA_RESIDUAL  SOCIAL_SUPPORT  
0   

In [60]:
# Check to ensure no more NULL values for Social Support
data.isnull().sum()

HAPPINESS_SCORE        0
GDP_PER_CAPITA         0
FAMILY                 0
LIFE_EXPECTANCY        0
FREEDOM                0
GENEROSITY             0
GOVERNMENT_TRUST       0
DYSTOPIA_RESIDUAL    276
SOCIAL_SUPPORT         0
dtype: int64

In [61]:
# Save the dataset to a new CSV file to update data
data.to_csv('v2_dataset.csv', index = False)

## Linear Regression to fill in NULL values (DYSTOPIA_RESIDUAL)

In [62]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Load the data
data = pd.read_csv('v2_dataset.csv')

# Separate the data into two dataframes
data_missing = data[data['DYSTOPIA_RESIDUAL'].isna()]
data_complete = data

# Impute missing values in the complete data using the mean of the column
imputer = SimpleImputer()
data_complete = pd.DataFrame(imputer.fit_transform(data_complete), columns=data_complete.columns)

# Split the complete data into training and testing sets, using life expectancy, freedom, generosity, government trust and GDP per capita as predictors
X = data_complete[['LIFE_EXPECTANCY', 'FREEDOM', 'GENEROSITY', 'GOVERNMENT_TRUST', 'GDP_PER_CAPITA']]
y = data_complete['DYSTOPIA_RESIDUAL']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train the model
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Impute missing values in the data_missing dataframe
data_missing = pd.DataFrame(imputer.transform(data_missing), columns=data_missing.columns)

# Predict the missing values of the 'dystopia residual' variable
X_missing = data_missing[['LIFE_EXPECTANCY','FREEDOM', 'GENEROSITY', 'GOVERNMENT_TRUST', 'GDP_PER_CAPITA']]
y_missing_pred = regressor.predict(X_missing)

# Replace the missing values in the original dataframe with the predicted values
data.loc[data['DYSTOPIA_RESIDUAL'].isna(), 'DYSTOPIA_RESIDUAL'] = y_missing_pred

# Print the new dataset with imputed missing values and predicted values
print(data)

     HAPPINESS_SCORE  GDP_PER_CAPITA    FAMILY  LIFE_EXPECTANCY   FREEDOM  \
0             7.5370        1.616463  1.533524         0.796667  0.635423   
1             7.5220        1.482383  1.551122         0.792566  0.626007   
2             7.5040        1.480633  1.610574         0.833552  0.627163   
3             7.4940        1.564980  1.516912         0.858131  0.620071   
4             7.4690        1.443572  1.540247         0.809158  0.617951   
..               ...             ...       ...              ...       ...   
823           3.4789        0.997549  1.041802         0.494102  0.509089   
824           3.4762        0.457163  0.908063         0.442678  0.509343   
825           3.3123        0.343243  0.845626         0.572383  0.604088   
826           3.2992        0.425564  0.882111         0.375038  0.377405   
827           2.5669        0.300706  0.768374         0.266052  0.000000   

     GENEROSITY  GOVERNMENT_TRUST  DYSTOPIA_RESIDUAL  SOCIAL_SUPPORT  
0   

In [63]:
# Check to ensure no more NULL values for Dystopia Residual
data.isnull().sum()

HAPPINESS_SCORE      0
GDP_PER_CAPITA       0
FAMILY               0
LIFE_EXPECTANCY      0
FREEDOM              0
GENEROSITY           0
GOVERNMENT_TRUST     0
DYSTOPIA_RESIDUAL    0
SOCIAL_SUPPORT       0
dtype: int64

In [64]:
# Save the dataset to a new CSV file to update data
data.to_csv('final_version_dataset.csv', index = False)