In [1]:
import re
import time
from json import dumps
import pandas as pd
from sklearn.model_selection import train_test_split 
from kafka import KafkaProducer

In [2]:
# Read all the datasets

df_2015=pd.read_csv("../data/2015.csv")
df_2016=pd.read_csv("../data/2016.csv")
df_2017=pd.read_csv("../data/2017.csv")
df_2018=pd.read_csv("../data/2018.csv")
df_2019=pd.read_csv("../data/2019.csv")

In [3]:
# Dropping unnecesary columns

dfs_2015_and_2016 = [df_2015, df_2016]
for df in dfs_2015_and_2016:
    df.drop(columns={'Region', 'Country', 'Dystopia Residual', 'Generosity', 'Happiness Rank'}, inplace=True)

df_2015 = df_2015.drop('Standard Error', axis=1)

df_2016 = df_2016.drop(['Lower Confidence Interval', 'Upper Confidence Interval'], axis=1)

df_2017 = df_2017.drop(['Country', 'Whisker.low', 'Whisker.high', 'Dystopia.Residual', 'Generosity', 'Happiness.Rank'], axis=1)

dfs_2018_and_2019 = [df_2018, df_2019]
for df in dfs_2018_and_2019:
    df.drop(columns={'Country or region', 'Generosity', 'Overall rank'}, inplace=True)

In [4]:
# Normalizing the columns headers

dfs = [df_2015, df_2016, df_2017, df_2018, df_2019]

def normalize_column_names(df):
    df.columns = [re.sub(r'\(|\)|\.| ', '_', col.lower()).rstrip('_') for col in df.columns]
    df.columns = [re.sub(r'[^a-zA-Z0-9_]', '', col.replace('__', '_')) for col in df.columns]

for df in dfs:
    normalize_column_names(df)

In [5]:
# Rename columns

df_2018 = df_2018.rename(columns={'score': 'happiness_score'})
df_2019 = df_2019.rename(columns={'score': 'happiness_score'})

dfs_2015_to_2017 = [df_2015, df_2016, df_2017]
for df in dfs_2015_to_2017:
    df.rename(columns={'economy_gdp_per_capita': 'gdp_per_capita', 'family': 'social_support', 
                       'freedom': 'freedom_to_make_life_choices', 'trust_government_corruption': 'perceptions_of_corruption',
                       'health_life_expectancy':'healthy_life_expectancy'}, inplace=True)

In [6]:
# Concatenated the dataset

dfs = [df_2015, df_2016, df_2017, df_2018, df_2019]
concatenated_df = pd.concat(dfs, axis=0)

In [12]:
# Dropping the null row

concatenated_df = concatenated_df.dropna(subset=['perceptions_of_corruption'])

In [15]:
# Checking the final dataset

concatenated_df.head(5)

Unnamed: 0,happiness_score,gdp_per_capita,social_support,healthy_life_expectancy,freedom_to_make_life_choices,perceptions_of_corruption
0,7.587,1.39651,1.34951,0.94143,0.66557,0.41978
1,7.561,1.30232,1.40223,0.94784,0.62877,0.14145
2,7.527,1.32548,1.36058,0.87464,0.64938,0.48357
3,7.522,1.459,1.33095,0.88521,0.66973,0.36503
4,7.427,1.32629,1.32261,0.90563,0.63297,0.32957


In [16]:
# Split the  dependent variable to the independet variables

X = concatenated_df.drop('happiness_score',axis= 1) 
y = concatenated_df['happiness_score'] 

print(f"Independent variables for the model: \n {X.head(5)}") 
print(f"Dependent variable: \n  {y.head(5)}")

Independent variables for the model: 
    gdp_per_capita  social_support  healthy_life_expectancy  \
0         1.39651         1.34951                  0.94143   
1         1.30232         1.40223                  0.94784   
2         1.32548         1.36058                  0.87464   
3         1.45900         1.33095                  0.88521   
4         1.32629         1.32261                  0.90563   

   freedom_to_make_life_choices  perceptions_of_corruption  
0                       0.66557                    0.41978  
1                       0.62877                    0.14145  
2                       0.64938                    0.48357  
3                       0.66973                    0.36503  
4                       0.63297                    0.32957  
Dependent variable: 
  0    7.587
1    7.561
2    7.527
3    7.522
4    7.427
Name: happiness_score, dtype: float64


In [17]:
# Split data between test data and training data

X_train, X_test, y_train, y_test = train_test_split( 
    X, y, test_size=0.3, random_state=101)

In [21]:
# Create only one dataframe to send all the test data (X_test + y_test)

print(f"Shape of y_test:{y_test.shape}")
print(f"Shape of X_test: {X_test.shape}")

X_test["happiness_score"] = y_test
test_data = X_test
print(f"Shape of test_data: {test_data.shape}")

Shape of y_test:(235,)
Shape of X_test: (235, 6)
Shape of test_data: (235, 6)


In [22]:
# Checking the df with the test data to be sent

test_data.head(5)

Unnamed: 0,gdp_per_capita,social_support,healthy_life_expectancy,freedom_to_make_life_choices,perceptions_of_corruption,happiness_score
116,1.1,0.842,0.785,0.305,0.125,4.548
134,0.8818,0.747,0.61712,0.17288,0.06324,4.194
42,0.74553,1.04356,0.64425,0.57733,0.09472,6.123
100,0.71206,1.07284,0.07566,0.30658,0.0306,4.867
65,0.855,1.23,0.578,0.448,0.023,5.662


In [13]:
# Data producer - Sending the test data

producer = KafkaProducer(
    value_serializer = lambda m: dumps(m).encode('utf-8'),
    bootstrap_servers = ['localhost:9092'],
)

for _, row in test_data.iterrows():
    message = row.to_dict()
    producer.send('test-data', value=message)
    time.sleep(2)