# Using Unsupervised Learning to find clusters in the 2019 World Happiness Report

## Load Data from AWS


In [200]:
# Import libraries
# Initial imports
import psycopg2
import sys
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas
import re
from sqlalchemy import create_engine
import sqlalchemy

In [201]:
# Connection parameters (values are read from the importable ic_config module
# rather than being written into the notebook)
import ic_config

param_dic = dict(
    host=ic_config.host,
    database=ic_config.database,
    user=ic_config.username,
    password=ic_config.password,
)

# Create connect function to connect to the PostgreSQL server
def connect(param_dic):
    """Open a psycopg2 connection using the given keyword parameters.

    Parameters
    ----------
    param_dic : dict
        Keyword arguments forwarded unchanged to ``psycopg2.connect``.

    Returns
    -------
    The connection object returned by ``psycopg2.connect``.

    Notes
    -----
    If connecting fails, the error is printed and ``sys.exit(1)`` is called,
    so no value is returned in that case.
    """
    try:
        print("Connecting to the PostgreSQL database...")
        connection = psycopg2.connect(**param_dic)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        sys.exit(1)
    else:
        # Only reached when the connection attempt did not raise
        print ("Connection successful.")
        return connection
    
    

In [202]:
# Function to get the dataframe
def postgresql_to_dataframe(conn, select_query, column_names):
    """Run a SELECT statement and return its rows as a pandas DataFrame.

    Parameters
    ----------
    conn : connection object
        Open database connection exposing ``cursor()``.
    select_query : str
        SQL statement passed to ``cursor.execute``.
    column_names : list of str
        Labels applied, in order, to the columns of the fetched rows.

    Returns
    -------
    pandas.DataFrame
        One row per fetched tuple, with ``column_names`` as the column labels.

    Raises
    ------
    Exception
        Any error raised while executing the query or fetching its rows is
        printed and then re-raised. Previously the error was only printed and
        the function went on to call ``fetchall()`` on the already-closed
        cursor, which hid the real cause behind a second exception.
    """
    cursor = conn.cursor()
    try:
        cursor.execute(select_query)
        # Get list of tuples
        tuples = cursor.fetchall()
    except Exception as error:
        # Database driver errors are ordinary Exception subclasses, so one
        # handler covers them as well.
        print("Error: %s" % error)
        raise
    finally:
        # Close the cursor exactly once, on success and on failure alike
        cursor.close()

    # Create pandas dataframe
    return pd.DataFrame(tuples, columns=column_names)

In [242]:
# Open the database connection and load the whr_2019 table into a DataFrame
conn = connect(param_dic)

# Labels applied, in order, to the columns returned by "select *"
# NOTE(review): this assumes the table's column order matches this list — confirm
column_names = [
    "country",
    "happinessrank",
    "happinessscore",
    "gdp",
    "family",
    "lifeexpectancy",
    "freedom",
    "generosity",
    "trust",
    "lat",
    "lng",
    "alcohol_liperyear",
]
df = postgresql_to_dataframe(conn, "select * from whr_2019", column_names)
df.head()

Connecting to the PostgreSQL database...
Connection successful.


Unnamed: 0,country,happinessrank,happinessscore,gdp,family,lifeexpectancy,freedom,generosity,trust,lat,lng,alcohol_liperyear
0,Brazil,32,6.3,1.004,1.439,0.802,0.39,0.099,0.086,-14.235004,-51.92528,19.3
1,Uruguay,33,6.293,1.124,1.465,0.891,0.523,0.127,0.15,-32.522779,-55.765835,15.7
2,Singapore,34,6.262,1.572,1.463,1.141,0.556,0.271,0.453,1.352083,103.819836,2.9
3,El Salvador,35,6.253,0.794,1.242,0.789,0.43,0.093,0.074,13.794185,-88.89653,12.8
4,Italy,36,6.223,1.294,1.488,1.039,0.231,0.158,0.03,41.87194,12.56738,12.0


## Clean Data

In [204]:
# What data is missing?
# Unsupervised learning models can't handle missing data,
# so report the number of null values found in every column
for column, null_count in df.isnull().sum().items():
    print(f"Column {column} has {null_count} null values")

Column country has 0 null values
Column happinessrank has 0 null values
Column happinessscore has 0 null values
Column gdp has 0 null values
Column family has 0 null values
Column lifeexpectancy has 0 null values
Column freedom has 0 null values
Column generosity has 0 null values
Column trust has 0 null values
Column lat has 0 null values
Column lng has 0 null values
Column alcohol_liperyear has 5 null values


In [239]:
# Since only 5 countries have null values, drop the rows that contain them
df = df.dropna()
# Keep an independent snapshot of the cleaned frame (all columns still present).
# This line used to select the rows containing nulls *after* they had been
# dropped, so df_clean was always an empty DataFrame.
df_clean = df.copy()
df.head()


Unnamed: 0,country,happinessrank,happinessscore,gdp,family,lifeexpectancy,freedom,generosity,trust,lat,lng,alcohol_liperyear
0,Brazil,32,6.3,1.004,1.439,0.802,0.39,0.099,0.086,-14.235004,-51.92528,19.3
1,Uruguay,33,6.293,1.124,1.465,0.891,0.523,0.127,0.15,-32.522779,-55.765835,15.7
2,Singapore,34,6.262,1.572,1.463,1.141,0.556,0.271,0.453,1.352083,103.819836,2.9
3,El Salvador,35,6.253,0.794,1.242,0.789,0.43,0.093,0.074,13.794185,-88.89653,12.8
4,Italy,36,6.223,1.294,1.488,1.039,0.231,0.158,0.03,41.87194,12.56738,12.0


In [206]:
# Export country, latitude and longitude to an Excel file; the file is read back
# later in this notebook to attach country names and coordinates to the clusters.
# to_excel also writes the DataFrame index, which appears as an "Unnamed: 0"
# column when the file is re-imported.
df2 = df[["country","lat","lng"]]
df2.to_excel("country_lat_lng.xlsx")

In [207]:
# Confirm that no null values remain after dropping the incomplete rows
for column, null_count in df.isnull().sum().items():
    print(f"Column {column} has {null_count} null values")

Column country has 0 null values
Column happinessrank has 0 null values
Column happinessscore has 0 null values
Column gdp has 0 null values
Column family has 0 null values
Column lifeexpectancy has 0 null values
Column freedom has 0 null values
Column generosity has 0 null values
Column trust has 0 null values
Column lat has 0 null values
Column lng has 0 null values
Column alcohol_liperyear has 0 null values


In [208]:
# Remove the columns that are not used for clustering: the country name,
# the happiness rank/score and the coordinates
non_feature_columns = ["country", "happinessrank", "happinessscore", "lat", "lng"]
df = df.drop(columns=non_feature_columns)
df.head()

Unnamed: 0,gdp,family,lifeexpectancy,freedom,generosity,trust,alcohol_liperyear
0,1.004,1.439,0.802,0.39,0.099,0.086,19.3
1,1.124,1.465,0.891,0.523,0.127,0.15,15.7
2,1.572,1.463,1.141,0.556,0.271,0.453,2.9
3,0.794,1.242,0.789,0.43,0.093,0.074,12.8
4,1.294,1.488,1.039,0.231,0.158,0.03,12.0


In [209]:
# Check data types: every remaining column is reported as object,
# so the values are converted to float in the next step
df.dtypes

gdp                  object
family               object
lifeexpectancy       object
freedom              object
generosity           object
trust                object
alcohol_liperyear    object
dtype: object

In [210]:
# Change data types to float.
# Convert every row: the earlier hard-coded df[0:153] slice only kept the first
# 153 rows and would silently discard data if the table ever grew beyond that.
# astype raises by default when a value cannot be converted.
df = df.astype(float)
df.tail()

Unnamed: 0,gdp,family,lifeexpectancy,freedom,generosity,trust,alcohol_liperyear
148,0.673,0.799,0.508,0.372,0.105,0.093,15.1
149,0.094,1.125,0.357,0.269,0.212,0.053,7.1
150,0.569,0.808,0.232,0.352,0.154,0.09,21.7
151,0.71,1.181,0.555,0.525,0.566,0.172,17.6
153,1.231,1.477,0.713,0.489,0.185,0.016,14.2


In [211]:
# Check data types again and ensure every column is now float64
df.dtypes

gdp                  float64
family               float64
lifeexpectancy       float64
freedom              float64
generosity           float64
trust                float64
alcohol_liperyear    float64
dtype: object

## Export data to CSV and create DataFrame from cleaned CSV

In [212]:
# Save the cleaned, all-float DataFrame to a new CSV file for the clustering steps below.
# index=False keeps the original row labels out of the file, so rows are
# numbered from 0 again when the file is loaded.
output_file_path = "../Resources/UML_2019.csv"
df.to_csv(output_file_path, index=False)

In [213]:
# Clustering Exercise
# Load the cleaned UML_2019.csv file written in the previous step
file_path = "../Resources/UML_2019.csv"
df_2019 = pd.read_csv(file_path)
df_2019

Unnamed: 0,gdp,family,lifeexpectancy,freedom,generosity,trust,alcohol_liperyear
0,1.004,1.439,0.802,0.390,0.099,0.086,19.3
1,1.124,1.465,0.891,0.523,0.127,0.150,15.7
2,1.572,1.463,1.141,0.556,0.271,0.453,2.9
3,0.794,1.242,0.789,0.430,0.093,0.074,12.8
4,1.294,1.488,1.039,0.231,0.158,0.030,12.0
...,...,...,...,...,...,...,...
144,0.673,0.799,0.508,0.372,0.105,0.093,15.1
145,0.094,1.125,0.357,0.269,0.212,0.053,7.1
146,0.569,0.808,0.232,0.352,0.154,0.090,21.7
147,0.710,1.181,0.555,0.525,0.566,0.172,17.6


In [214]:
# Remove the columns not used for clustering, leaving gdp, family and lifeexpectancy
df_2019 = df_2019.drop(
    columns=["freedom", "generosity", "trust", "alcohol_liperyear"]
)
df_2019.head()

Unnamed: 0,gdp,family,lifeexpectancy
0,1.004,1.439,0.802
1,1.124,1.465,0.891
2,1.572,1.463,1.141
3,0.794,1.242,0.789
4,1.294,1.488,1.039


## K-means Algorithm


In [215]:
# Initialize the K-means model with K=5 clusters (the number of clusters is
# revisited with the elbow curve further down); random_state is set to a fixed value
model = KMeans(n_clusters=5, random_state=8)
model

KMeans(n_clusters=5, random_state=8)

In [216]:
# Fit the model on the feature columns of df_2019
model.fit(df_2019)

KMeans(n_clusters=5, random_state=8)

In [217]:
# Get the cluster predicted for each row of the data the model was fitted on
predictions = model.predict(df_2019)
print(predictions)

[0 0 3 2 3 3 3 0 2 0 0 3 2 0 0 3 2 3 0 0 3 3 0 0 3 2 0 2 0 0 2 3 1 0 2 0 2
 0 0 1 0 0 2 0 0 0 0 0 1 2 0 2 2 2 2 2 2 2 2 2 4 2 2 2 2 0 1 1 4 2 4 1 2 2
 1 2 1 1 1 1 2 1 2 1 3 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3 3 0 3 0 2 3
 3 1 0 1 1 3 0 4 2 4 2 1 1 1 2 1 4 1 4 4 1 1 4 1 4 2 4 4 1 4 1 4 4 1 1 1 2
 0]


In [218]:
# Add a new class column to df_2019 holding the cluster label of each row
df_2019["class"] = model.labels_
df_2019.head()

Unnamed: 0,gdp,family,lifeexpectancy,class
0,1.004,1.439,0.802,0
1,1.124,1.465,0.891,0
2,1.572,1.463,1.141,3
3,0.794,1.242,0.789,2
4,1.294,1.488,1.039,3


In [219]:
# Create a scatterplot of df_2019: gdp against lifeexpectancy, grouped by cluster label
df_2019.hvplot.scatter(x="gdp", y="lifeexpectancy", by="class")

In [220]:
# Plot the clusters against all three features; marker colour and symbol
# follow the cluster label, marker size follows lifeexpectancy
fig = px.scatter_3d(
    df_2019,
    x="gdp",
    y="family",
    z="lifeexpectancy",
    color="class",
    symbol="class",
    size="lifeexpectancy",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

## Trial and Error

In [221]:
# 2D scatter plot of gdp against family (not grouped by cluster)
df_2019.hvplot.scatter(x="gdp", y="family")

In [222]:
# Function to cluster a dataset and label its rows
def test_cluster_amount(df, clusters):
    """Fit K-means with ``clusters`` clusters and store the labels in ``df["class"]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns to cluster. The frame is modified in place: a
        "class" column is added, or overwritten if it already exists.
    clusters : int
        Number of clusters passed to ``KMeans`` as ``n_clusters``.

    Returns
    -------
    None
    """
    # Leave out labels from a previous run so that they are not used as a
    # feature; fitting on the whole frame made repeated calls feed the old
    # "class" values back into the clustering.
    features = df.drop(columns=["class"], errors="ignore")

    model = KMeans(n_clusters=clusters, random_state=5)

    # Fitting model
    model.fit(features)

    # Add (or refresh) the class column on the caller's DataFrame
    df["class"] = model.labels_

In [223]:
# Preview df_2019, which at this point carries the class labels assigned earlier
df_2019.head()

Unnamed: 0,gdp,family,lifeexpectancy,class
0,1.004,1.439,0.802,0
1,1.124,1.465,0.891,0
2,1.572,1.463,1.141,3
3,0.794,1.242,0.789,2
4,1.294,1.488,1.039,3


In [224]:
# Cluster df_2019 into 5 clusters (test_cluster_amount writes the labels into
# the class column of the DataFrame it is given), then plot gdp against
# lifeexpectancy grouped by cluster.
# NOTE(review): df_2019 already has a class column from the earlier fit when it
# is passed in here — confirm how that column should be treated.
test_cluster_amount(df_2019, 5)
df_2019.hvplot.scatter(x="gdp", y="lifeexpectancy", by="class")

In [225]:
# Plot the DataFrame with family as the third axis
fig = px.scatter_3d(
    df_2019,
    x="gdp",
    y="lifeexpectancy",
    z="family",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

## Elbow Curve

In [226]:
# Load UML_2019.csv again into a fresh DataFrame for the elbow-curve analysis
file_path = "../Resources/UML_2019.csv"
df_ElbowCurve = pd.read_csv(file_path)
df_ElbowCurve

Unnamed: 0,gdp,family,lifeexpectancy,freedom,generosity,trust,alcohol_liperyear
0,1.004,1.439,0.802,0.390,0.099,0.086,19.3
1,1.124,1.465,0.891,0.523,0.127,0.150,15.7
2,1.572,1.463,1.141,0.556,0.271,0.453,2.9
3,0.794,1.242,0.789,0.430,0.093,0.074,12.8
4,1.294,1.488,1.039,0.231,0.158,0.030,12.0
...,...,...,...,...,...,...,...
144,0.673,0.799,0.508,0.372,0.105,0.093,15.1
145,0.094,1.125,0.357,0.269,0.212,0.053,7.1
146,0.569,0.808,0.232,0.352,0.154,0.090,21.7
147,0.710,1.181,0.555,0.525,0.566,0.172,17.6


In [227]:
# Remove the columns not used for clustering, leaving gdp, family and lifeexpectancy
df_ElbowCurve = df_ElbowCurve.drop(
    columns=["freedom", "generosity", "trust", "alcohol_liperyear"]
)
df_ElbowCurve.head()

Unnamed: 0,gdp,family,lifeexpectancy
0,1.004,1.439,0.802
1,1.124,1.465,0.891
2,1.572,1.463,1.141
3,0.794,1.242,0.789
4,1.294,1.488,1.039


In [228]:
# Range of K values to test, and a list collecting the inertia for each of them
k = list(range(1, 11))
inertia = []
# Fit one model per K value and record its inertia
for i in k:
    km = KMeans(n_clusters=i, random_state=0).fit(df_ElbowCurve)
    inertia.append(km.inertia_)



KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [229]:
# Define a DataFrame pairing each tested K with its inertia and plot it as the
# Elbow Curve using hvPlot, with one x-axis tick per tested K
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [230]:
def get_clusters(k, data):
    """Cluster ``data`` into ``k`` groups and return a labelled copy.

    Parameters
    ----------
    k : int
        Number of clusters passed to ``KMeans`` as ``n_clusters``.
    data : pandas.DataFrame
        Feature columns to cluster; the caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with an added "class" column holding the cluster
        label of each row.
    """
    # Work on a copy so the caller's DataFrame stays untouched
    labelled = data.copy()

    # Initialize and fit the K-Means model
    model = KMeans(n_clusters=k, random_state=0)
    model.fit(labelled)

    # labels_ already holds the cluster of every fitted row, so the separate
    # predict() call, whose result was never used, has been removed
    labelled["class"] = model.labels_

    return labelled

In [231]:
# Run the function for k=2; the result is a labelled copy, df_ElbowCurve itself is unchanged
two_clusters = get_clusters(2, df_ElbowCurve)
two_clusters.head()

Unnamed: 0,gdp,family,lifeexpectancy,class
0,1.004,1.439,0.802,0
1,1.124,1.465,0.891,0
2,1.572,1.463,1.141,0
3,0.794,1.242,0.789,0
4,1.294,1.488,1.039,0


In [232]:
# Run the function for k=3; the result is a labelled copy, df_ElbowCurve itself is unchanged
three_clusters = get_clusters(3, df_ElbowCurve)
three_clusters

Unnamed: 0,gdp,family,lifeexpectancy,class
0,1.004,1.439,0.802,2
1,1.124,1.465,0.891,1
2,1.572,1.463,1.141,1
3,0.794,1.242,0.789,2
4,1.294,1.488,1.039,1
...,...,...,...,...
144,0.673,0.799,0.508,0
145,0.094,1.125,0.357,0
146,0.569,0.808,0.232,0
147,0.710,1.181,0.555,2


In [233]:
# Import the country, latitude and longitude data exported earlier in this notebook
country_lat_lng = pd.read_excel("country_lat_lng.xlsx")
country_lat_lng

Unnamed: 0.1,Unnamed: 0,country,lat,lng
0,0,Brazil,-14.235004,-51.925280
1,1,Uruguay,-32.522779,-55.765835
2,2,Singapore,1.352083,103.819836
3,3,El Salvador,13.794185,-88.896530
4,4,Italy,41.871940,12.567380
...,...,...,...,...
144,148,Congo [Republic],-0.228021,15.827659
145,149,Congo [DRC],-4.038333,21.758664
146,150,Côte d'Ivoire,7.539989,-5.547080
147,151,Myanmar [Burma],21.913965,95.956223


In [234]:
# Drop the "Unnamed: 0" column (the index values written by the Excel export)
country_lat_lng = country_lat_lng.drop(columns=["Unnamed: 0"])
country_lat_lng

Unnamed: 0,country,lat,lng
0,Brazil,-14.235004,-51.925280
1,Uruguay,-32.522779,-55.765835
2,Singapore,1.352083,103.819836
3,El Salvador,13.794185,-88.896530
4,Italy,41.871940,12.567380
...,...,...,...
144,Congo [Republic],-0.228021,15.827659
145,Congo [DRC],-4.038333,21.758664
146,Côte d'Ivoire,7.539989,-5.547080
147,Myanmar [Burma],21.913965,95.956223


In [235]:
# Add the country names to the clustering results.
# The assignment aligns the two frames on their index, so it relies on both
# having the same row order.
# NOTE(review): this holds only while both frames descend from the same cleaned
# rows and keep the default 0..n-1 index — confirm if either load step changes.
three_clusters["country"] = country_lat_lng["country"]

In [236]:
# Merge latitudes and longitudes into the clustering results, matching rows on country
three_clusters = pd.merge(three_clusters,country_lat_lng, on = "country")
three_clusters

Unnamed: 0,gdp,family,lifeexpectancy,class,country,lat,lng
0,1.004,1.439,0.802,2,Brazil,-14.235004,-51.925280
1,1.124,1.465,0.891,1,Uruguay,-32.522779,-55.765835
2,1.572,1.463,1.141,1,Singapore,1.352083,103.819836
3,0.794,1.242,0.789,2,El Salvador,13.794185,-88.896530
4,1.294,1.488,1.039,1,Italy,41.871940,12.567380
...,...,...,...,...,...,...,...
144,0.673,0.799,0.508,0,Congo [Republic],-0.228021,15.827659
145,0.094,1.125,0.357,0,Congo [DRC],-4.038333,21.758664
146,0.569,0.808,0.232,0,Côte d'Ivoire,7.539989,-5.547080
147,0.710,1.181,0.555,2,Myanmar [Burma],21.913965,95.956223


In [238]:
# Export the clustering results to our SQL database
from config import db_password
# Connection string format: postgresql://[user]:[password]@[location]:[port]/[database]
# (this reminder used to be a bare string literal, i.e. a statement with no
# effect, and it showed the legacy "postgres://" scheme instead of the
# "postgresql://" scheme actually used below)
db_string = f"postgresql://postgres:{db_password}@whr.csnc4l4qvlqd.us-east-2.rds.amazonaws.com/postgres"
engine = create_engine(db_string)
# if_exists="replace" overwrites the three_clusters table if it is already present
three_clusters.to_sql(name='three_clusters', con=engine, if_exists = "replace")

In [141]:
# Plotting the 2D-Scatter of the two-cluster result with x="gdp" and y="family"
two_clusters.hvplot.scatter(x="gdp", y="family", by="class")

In [142]:
# Plot the 3D-scatter of the two-cluster result with x="gdp", y="family" and
# z="lifeexpectancy"; marker colour and symbol follow the cluster label
fig = px.scatter_3d(
    two_clusters,
    x="gdp",
    y="family",
    z="lifeexpectancy",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [143]:
# Plotting the 2D-Scatter of the three-cluster result with x="gdp" and y="family"
three_clusters.hvplot.scatter(x="gdp", y="family", by="class")

In [144]:
# Plot the 3D-scatter of the three-cluster result with x="gdp", y="family" and
# z="lifeexpectancy"; marker colour and symbol follow the cluster label
fig = px.scatter_3d(
    three_clusters,
    x="gdp",
    y="family",
    z="lifeexpectancy",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()


### Description of preliminary data preprocessing
  * The first step in the preliminary data preprocessing was to connect to the PostgreSQL server hosted on AWS and load the `whr_2019` table into a DataFrame.
  * That DataFrame was then cleaned so that it could be used for Machine Learning.
  * The next step was to clean the data. The first decision was how Null Values in the database would be handled. A for loop along with the isnull() function was used to count them. Only 5 null values were found, all in the alcohol_liperyear column, and the rows containing them were dropped from the DataFrame, since removing those rows has minimal impact.
  * Cleaning the DataFrame also involved dropping columns that cannot be used for machine learning ("country", "happinessrank", "happinessscore", "lat", and "lng") and converting the remaining columns’ datatype to float64.
  * Once the DataFrame was cleaned, it was exported as a CSV file, UML_2019.csv. This CSV was used to run the Machine Learning.
### Description of preliminary feature engineering and preliminary feature selection, including the decision-making process
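  * For the Unsupervised Machine Learning process, only gdp, family and lifeexpectancy were kept as clustering features; freedom, generosity, trust and alcohol_liperyear were dropped before the models were fitted. Three approaches were then run: the K-means Algorithm, Trial and Error of the K-means Algorithm, and the Elbow Curve.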
### Description of how data was split into training and testing sets
  * For the purposes of Unsupervised learning, the data was not split into training and testing sets.
### Explanation of model choice, including limitations and benefits
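  * The models displayed very similar clustering results when plotted with x=gdp against y=lifeexpectancy or y=family (alcohol_liperyear was dropped before clustering, so it is not one of the plotted features). However, because the Elbow Curve indicates the optimal number of clusters (3), the decision was made to go with this model.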