# Using Unsupervised Learning to Find Clusters in the 2019 World Happiness Report

### Load Data from AWS

In [10]:
# Import libraries
# Initial imports
import os
import sys
from pathlib import Path

import hvplot.pandas
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import psycopg2
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [11]:
# Connection parameters
# SECURITY: credentials should not be stored in the notebook. Every value is
# read from an environment variable when it is set; the literals are kept only
# as backward-compatible fallbacks.
# FIXME: remove the fallback literals (and rotate the password) once the
# WHR_DB_* environment variables are configured.
param_dic = {
    "host"      : os.environ.get("WHR_DB_HOST", "whr.csnc4l4qvlqd.us-east-2.rds.amazonaws.com"),
    "database"  : os.environ.get("WHR_DB_NAME", "postgres"),
    "user"      : os.environ.get("WHR_DB_USER", "postgres"),
    "password"  : os.environ.get("WHR_DB_PASSWORD", "UCBwhr2021"),
}

# Create connect function to connect to PostgresSQL server
def connect(param_dic):
    """Open a psycopg2 connection using the given keyword parameters.

    Progress messages are printed. If the connection attempt raises, the
    error is printed and sys.exit(1) is called.

    Args:
        param_dic: dict unpacked as keyword arguments to psycopg2.connect.

    Returns:
        The connection object returned by psycopg2.connect.
    """
    try:
        print("Connecting to the PostgreSQL database...")
        connection = psycopg2.connect(**param_dic)
    except (Exception, psycopg2.DatabaseError) as exc:
        print(exc)
        sys.exit(1)
    else:
        # Only reached when the connection attempt did not raise
        print ("Connection successful.")
        return connection
    

In [12]:
# Function to get the dataframe
def postgresql_to_dataframe(conn, select_query, column_names):
    """Run a SELECT query and return its rows as a pandas DataFrame.

    Args:
        conn: open database connection exposing ``cursor()``.
        select_query: SQL text passed to ``cursor.execute``.
        column_names: labels applied, in order, to the result columns.

    Returns:
        DataFrame with one row per fetched tuple.

    Raises:
        Exception: any error raised while executing the query or fetching
            its rows is printed and then re-raised unchanged.
    """
    cursor = conn.cursor()
    try:
        cursor.execute(select_query)
        # Get list of tuples
        tuples = cursor.fetchall()
    except Exception as error:
        # Report the real failure and propagate it; fetching from a cursor
        # whose query failed would only hide the original error.
        print("Error: %s" % error)
        raise
    finally:
        # Always release the cursor, on success and on failure
        cursor.close()

    # Create pandas dataframe
    return pd.DataFrame(tuples, columns=column_names)

In [13]:
# Open a connection to the database described by param_dic
conn = connect(param_dic)

# Labels applied, in order, to the columns returned by the query below
column_names = [
    "country",
    "happinessrank",
    "happinessscore",
    "gdp",
    "family",
    "lifeexpectancy",
    "freedom",
    "generosity",
    "trust",
    "lat",
    "lng",
    "alcohol_liperyear",
]
df = postgresql_to_dataframe(
    conn,
    select_query="select * from whr_2019",
    column_names=column_names,
)
df.head()

Connecting to the PostgreSQL database...
Connection successful.


Unnamed: 0,country,happinessrank,happinessscore,gdp,family,lifeexpectancy,freedom,generosity,trust,lat,lng,alcohol_liperyear
0,Brazil,32,6.3,1.004,1.439,0.802,0.39,0.099,0.086,-14.235004,-51.92528,19.3
1,Uruguay,33,6.293,1.124,1.465,0.891,0.523,0.127,0.15,-32.522779,-55.765835,15.7
2,Singapore,34,6.262,1.572,1.463,1.141,0.556,0.271,0.453,1.352083,103.819836,2.9
3,El Salvador,35,6.253,0.794,1.242,0.789,0.43,0.093,0.074,13.794185,-88.89653,12.8
4,Italy,36,6.223,1.294,1.488,1.039,0.231,0.158,0.03,41.87194,12.56738,12.0


### Clean data

In [14]:
# Unsupervised learning models can't handle missing data,
# so report how many null values each column contains.
null_counts = df.isnull().sum()
for column, n_missing in null_counts.items():
    print(f"Column {column} has {n_missing} null values")

Column country has 0 null values
Column happinessrank has 0 null values
Column happinessscore has 0 null values
Column gdp has 0 null values
Column family has 0 null values
Column lifeexpectancy has 0 null values
Column freedom has 0 null values
Column generosity has 0 null values
Column trust has 0 null values
Column lat has 0 null values
Column lng has 0 null values
Column alcohol_liperyear has 5 null values


In [15]:
# Only 5 countries have null values, so drop those rows entirely
df = df.dropna(axis="index", how="any")

# Sanity check: select any rows that still contain a null value.
# Despite its name, df_clean holds the leftover rows with nulls, so it is
# expected to display as an empty frame.
df_clean = df.loc[df.isna().any(axis="columns")]
df_clean

Unnamed: 0,country,happinessrank,happinessscore,gdp,family,lifeexpectancy,freedom,generosity,trust,lat,lng,alcohol_liperyear


In [17]:
# Remove the country name, happiness rank/score and coordinate columns so that
# only the remaining numeric columns are kept for clustering.
# The result is reassigned (no inplace=True) and errors="ignore" skips columns
# that are already gone, so this cell can be re-run without raising KeyError.
df = df.drop(
    columns=["country", "happinessrank", "happinessscore", "lat", "lng"],
    errors="ignore",
)
df.head()

Unnamed: 0,gdp,family,lifeexpectancy,freedom,generosity,trust,alcohol_liperyear
0,1.004,1.439,0.802,0.39,0.099,0.086,19.3
1,1.124,1.465,0.891,0.523,0.127,0.15,15.7
2,1.572,1.463,1.141,0.556,0.271,0.453,2.9
3,0.794,1.242,0.789,0.43,0.093,0.074,12.8
4,1.294,1.488,1.039,0.231,0.158,0.03,12.0


In [19]:
# Check the data type of each remaining column
df.dtypes

gdp                  object
family               object
lifeexpectancy       object
freedom              object
generosity           object
trust                object
alcohol_liperyear    object
dtype: object

In [20]:
# Change data types to float.
# Every row is converted: no hard-coded row limit is applied, so rows are not
# silently dropped if the table grows.
# errors="raise" makes the cell fail loudly on any value that cannot be
# converted to float.
df = df.astype(float, errors="raise")
df.tail()

Unnamed: 0,gdp,family,lifeexpectancy,freedom,generosity,trust,alcohol_liperyear
148,0.673,0.799,0.508,0.372,0.105,0.093,15.1
149,0.094,1.125,0.357,0.269,0.212,0.053,7.1
150,0.569,0.808,0.232,0.352,0.154,0.09,21.7
151,0.71,1.181,0.555,0.525,0.566,0.172,17.6
153,1.231,1.477,0.713,0.489,0.185,0.016,14.2


In [21]:
# Check data types again and ensure every column is now float
df.dtypes

gdp                  float64
family               float64
lifeexpectancy       float64
freedom              float64
generosity           float64
trust                float64
alcohol_liperyear    float64
dtype: object

## Export data to CSV and create DataFrame from cleaned CSV

In [23]:
# Persist the cleaned DataFrame to a new CSV file (without the index) for future use
output_file_path = "../Resources/UML_2019.csv"
df.to_csv(path_or_buf=output_file_path, index=False)

In [24]:
# Clustering Exercise
# Load the UML_2019.csv file and preview its first 10 rows
file_path = "../Resources/UML_2019.csv"
df_2019 = pd.read_csv(filepath_or_buffer=file_path)
df_2019.head(n=10)

Unnamed: 0,gdp,family,lifeexpectancy,freedom,generosity,trust,alcohol_liperyear
0,1.004,1.439,0.802,0.39,0.099,0.086,19.3
1,1.124,1.465,0.891,0.523,0.127,0.15,15.7
2,1.572,1.463,1.141,0.556,0.271,0.453,2.9
3,0.794,1.242,0.789,0.43,0.093,0.074,12.8
4,1.294,1.488,1.039,0.231,0.158,0.03,12.0
5,1.362,1.368,0.871,0.536,0.255,0.11,22.1
6,1.246,1.504,0.881,0.334,0.121,0.014,16.6
7,1.206,1.438,0.884,0.483,0.117,0.05,17.1
8,0.745,1.529,0.756,0.631,0.322,0.24,12.8
9,1.238,1.515,0.818,0.291,0.043,0.042,18.9


### K-means Algorithm

In [25]:
# Initialize a K-means model with K=5 clusters; random_state is fixed so runs are repeatable
model = KMeans(n_clusters=5, random_state=8)
model

KMeans(n_clusters=5, random_state=8)

In [26]:
# Fit the model on every column currently in df_2019
model.fit(df_2019)

KMeans(n_clusters=5, random_state=8)

In [27]:
# Get the predicted cluster label for each row of df_2019 and print them
predictions = model.predict(df_2019)
print(predictions)

[1 1 2 0 0 4 1 1 0 1 1 1 1 0 1 1 0 2 1 1 1 1 0 0 0 0 1 0 1 1 0 1 4 1 1 1 4
 4 0 1 0 1 0 3 2 1 1 4 4 4 1 3 4 2 4 2 0 4 2 4 0 1 4 3 0 0 4 2 2 3 2 4 4 3
 2 3 0 0 2 0 3 2 0 2 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1 1 1 1 0 1 1 0 1 1 0 0 2
 2 4 1 0 2 0 1 1 0 2 0 0 3 4 0 1 0 0 1 4 0 3 4 4 1 4 1 1 0 4 4 3 0 0 2 4 1
 0]


In [28]:
# Add a new class column to df_2019 holding the fitted model's cluster labels
df_2019["class"] = model.labels_
df_2019.head()

Unnamed: 0,gdp,family,lifeexpectancy,freedom,generosity,trust,alcohol_liperyear,class
0,1.004,1.439,0.802,0.39,0.099,0.086,19.3,1
1,1.124,1.465,0.891,0.523,0.127,0.15,15.7,1
2,1.572,1.463,1.141,0.556,0.271,0.453,2.9,2
3,0.794,1.242,0.789,0.43,0.093,0.074,12.8,0
4,1.294,1.488,1.039,0.231,0.158,0.03,12.0,0


In [34]:
# Create a scatter plot of df_2019: gdp vs. freedom, with points grouped by class
df_2019.hvplot.scatter(x="gdp", y="freedom", by="class")

In [31]:
# Plot the clusters in 3D using three features (gdp, alcohol_liperyear, trust);
# color and symbol both encode the class, marker size encodes generosity
fig = px.scatter_3d(
    df_2019,
    x="gdp",
    y="alcohol_liperyear",
    z="trust",
    color="class",
    symbol="class",
    size="generosity",
    width=800,
)
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

### Trial and Error

In [47]:
# 2D scatter plot of gdp vs. family, without grouping by class
df_2019.hvplot.scatter(x="gdp", y="family")


In [48]:
# Function to cluster a dataset and store the resulting labels on it
def test_cluster_amount(df, clusters):
    """Fit K-means with `clusters` clusters and write the labels to df["class"].

    Args:
        df: DataFrame of features. It is modified in place: a "class" column
            is added, or overwritten if it already exists.
        clusters: number of clusters, passed to KMeans as n_clusters.

    Returns:
        None.
    """
    # Leave out a "class" column left by an earlier run so that previous
    # cluster labels are not fed back into the model as a feature.
    features = df.drop(columns=["class"], errors="ignore")

    model = KMeans(n_clusters=clusters, random_state=5)

    # Fitting model
    model.fit(features)

    # Add (or refresh) the class column on the caller's DataFrame
    df["class"] = model.labels_

In [49]:
# Preview the first rows of df_2019, including its current class column
df_2019.head()

Unnamed: 0,gdp,family,lifeexpectancy,freedom,generosity,trust,alcohol_liperyear,class
0,1.004,1.439,0.802,0.39,0.099,0.086,19.3,2
1,1.124,1.465,0.891,0.523,0.127,0.15,15.7,0
2,1.572,1.463,1.141,0.556,0.271,0.453,2.9,3
3,0.794,1.242,0.789,0.43,0.093,0.074,12.8,0
4,1.294,1.488,1.039,0.231,0.158,0.03,12.0,0


In [53]:
# Cluster df_2019 into 5 groups (the function takes a DataFrame and the number
# of clusters as arguments), then plot gdp vs. family grouped by class.
test_cluster_amount(df_2019, 5)
df_2019.hvplot.scatter(x="gdp", y="family", by="class")

In [56]:
# Plot df_2019 in 3D, adding alcohol_liperyear as the third axis;
# color and symbol both encode the class
fig = px.scatter_3d(
    df_2019,
    x="gdp",
    y="family",
    z="alcohol_liperyear",
    color="class",
    symbol="class",
    width=800,
)
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

### Elbow Curve

In [57]:
# Reload UML_2019.csv into a fresh DataFrame for the elbow analysis and preview 10 rows
file_path = "../Resources/UML_2019.csv"
df_ElbowCurve = pd.read_csv(filepath_or_buffer=file_path)
df_ElbowCurve.head(n=10)

Unnamed: 0,gdp,family,lifeexpectancy,freedom,generosity,trust,alcohol_liperyear
0,1.004,1.439,0.802,0.39,0.099,0.086,19.3
1,1.124,1.465,0.891,0.523,0.127,0.15,15.7
2,1.572,1.463,1.141,0.556,0.271,0.453,2.9
3,0.794,1.242,0.789,0.43,0.093,0.074,12.8
4,1.294,1.488,1.039,0.231,0.158,0.03,12.0
5,1.362,1.368,0.871,0.536,0.255,0.11,22.1
6,1.246,1.504,0.881,0.334,0.121,0.014,16.6
7,1.206,1.438,0.884,0.483,0.117,0.05,17.1
8,0.745,1.529,0.756,0.631,0.322,0.24,12.8
9,1.238,1.515,0.818,0.291,0.043,0.042,18.9


In [58]:
# Range of K values we want to test, and an empty list for the inertia of each
k = list(range(1, 11))
inertia = []

# Fit one K-means model per K value and record its inertia
for i in k:
    km = KMeans(n_clusters=i, random_state=0).fit(df_ElbowCurve)
    inertia.append(km.inertia_)

In [59]:
# Collect the K values and their inertias in a DataFrame,
# then draw the Elbow Curve as an hvPlot line chart
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(data=elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [60]:
def get_clusters(k, data):
    """Return a copy of `data` with a "class" column of K-means cluster labels.

    Args:
        k: number of clusters, passed to KMeans as n_clusters.
        data: DataFrame of features. It is not modified; a copy is labelled
            and returned.

    Returns:
        A copy of `data` whose "class" column holds the fitted model's labels.
    """
    # Create a copy of the DataFrame so the caller's data is left untouched
    data = data.copy()

    # Leave out any existing "class" column so that labels from a previous
    # run are not used as a clustering feature.
    features = data.drop(columns=["class"], errors="ignore")

    # Initialize and fit the K-Means model
    model = KMeans(n_clusters=k, random_state=0)
    model.fit(features)

    # The labels from the fit are stored directly; no separate predict call
    # on the training data is needed.
    data["class"] = model.labels_

    return data

In [61]:
# Run the function for k=2 and preview the labelled copy of df_ElbowCurve
two_clusters = get_clusters(2, df_ElbowCurve)
two_clusters.head()

Unnamed: 0,gdp,family,lifeexpectancy,freedom,generosity,trust,alcohol_liperyear,class
0,1.004,1.439,0.802,0.39,0.099,0.086,19.3,0
1,1.124,1.465,0.891,0.523,0.127,0.15,15.7,1
2,1.572,1.463,1.141,0.556,0.271,0.453,2.9,1
3,0.794,1.242,0.789,0.43,0.093,0.074,12.8,1
4,1.294,1.488,1.039,0.231,0.158,0.03,12.0,1


In [62]:
# Run the function for k=3 and preview the labelled copy of df_ElbowCurve
three_clusters = get_clusters(3, df_ElbowCurve)
three_clusters.head()

Unnamed: 0,gdp,family,lifeexpectancy,freedom,generosity,trust,alcohol_liperyear,class
0,1.004,1.439,0.802,0.39,0.099,0.086,19.3,1
1,1.124,1.465,0.891,0.523,0.127,0.15,15.7,1
2,1.572,1.463,1.141,0.556,0.271,0.453,2.9,2
3,0.794,1.242,0.789,0.43,0.093,0.074,12.8,1
4,1.294,1.488,1.039,0.231,0.158,0.03,12.0,1


In [63]:
# Plot the two-cluster result as a 2D scatter with x="gdp" and y="alcohol_liperyear", grouped by class
two_clusters.hvplot.scatter(x="gdp", y="alcohol_liperyear", by="class")

In [67]:
# Plot the two-cluster result as a 3D scatter with x="gdp", y="family" and
# z="alcohol_liperyear"; color and symbol both encode the class
fig = px.scatter_3d(
    two_clusters,
    x="gdp", y="family", z="alcohol_liperyear",
    color="class", symbol="class",
    width=800,
)
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()