<a href="https://colab.research.google.com/github/thhelen/-Predictive-Modeling-for-State-Fragility-Assessment/blob/main/clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
import plotly.express as px
from ipywidgets import interact, IntSlider

In [36]:
# Location of the cleaned, combined training CSV. The '/content' prefix looks like
# the Colab runtime's working directory (the notebook carries an "Open in Colab"
# badge) — adjust the path when running elsewhere.
file_path = '/content/combined_train_clean.csv'
data = pd.read_csv(file_path)
# Show the first rows as a quick sanity check of the loaded columns.
data.head()

Unnamed: 0.1,Unnamed: 0,Country,Year,Rank,Total,C1: Security Apparatus,C2: Factionalized Elites,C3: Group Grievance,E1: Economy,E2: Economic Inequality,E3: Human Flight and Brain Drain,P1: State Legitimacy,P2: Public Services,P3: Human Rights,S1: Demographic Pressures,S2: Refugees and IDPs,X1: External Intervention,Change from Previous Year
0,0,Sudan,2006,1st,112.3,9.8,9.1,9.7,7.5,9.2,9.1,9.5,9.5,9.8,9.6,9.7,9.8,
1,1,Congo Democratic Republic,2006,2nd,110.1,9.8,9.6,9.1,8.1,9.0,8.0,9.0,9.0,9.5,9.5,9.5,10.0,
2,2,Cote d'Ivoire,2006,3rd,109.2,9.8,9.8,9.8,9.0,8.0,8.5,10.0,8.5,9.4,8.8,7.6,10.0,
3,3,Iraq,2006,4th,109.0,9.8,9.7,9.8,8.2,8.7,9.1,8.5,8.3,9.7,8.9,8.3,10.0,
4,4,Zimbabwe,2006,5th,108.9,9.4,8.5,8.5,9.8,9.2,9.0,8.9,9.5,9.5,9.7,8.9,8.0,


In [37]:
# Lookup of continent name -> list of country names belonging to it.
# Some countries are listed under more than one spelling (for example
# "Cote d'Ivoire" / 'Côte d\'Ivoire', 'Guinea Bissau' / 'Guinea-Bissau',
# 'Congo Democratic Republic' / 'DR Congo'); presumably this is so that variant
# spellings in the dataset's 'Country' column still resolve — TODO confirm.
continent_mapping = {
    'Africa': [
        'Nigeria','Congo Democratic Republic',"Cote d'Ivoire", 'Ethiopia','Guinea Bissau', 'Egypt', 'DR Congo', 'Tanzania',
        'South Africa', 'Kenya', 'Uganda', 'Sudan', 'Algeria',
        'Morocco', 'Angola', 'Ghana', 'Mozambique', 'Madagascar',
        'Côte d\'Ivoire', 'Cameroon', 'Niger', 'Mali', 'Burkina Faso',
        'Malawi', 'Zambia', 'Chad', 'Somalia', 'Senegal',
        'Zimbabwe', 'Guinea', 'Rwanda', 'Benin', 'Burundi',
        'Tunisia', 'South Sudan', 'Togo', 'Sierra Leone', 'Libya',
        'Congo', 'Central African Republic', 'Liberia', 'Mauritania', 'Eritrea',
        'Gambia', 'Botswana', 'Namibia', 'Gabon', 'Lesotho',
        'Guinea-Bissau', 'Equatorial Guinea', 'Mauritius', 'Eswatini', 'Djibouti',
        'Comoros', 'Cabo Verde', 'Sao Tome & Principe', 'Seychelles'
    ],'Asia': [
        'India', 'China', 'Indonesia', 'Pakistan', 'Bangladesh',
        'Japan', 'Philippines', 'Vietnam', 'Iran', 'Turkey',
        'Thailand', 'Myanmar', 'South Korea', 'Iraq', 'Afghanistan',
        'Saudi Arabia', 'Uzbekistan', 'Yemen', 'Malaysia', 'Nepal',
        'North Korea', 'Syria', 'Sri Lanka', 'Kazakhstan', 'Cambodia',
        'Jordan', 'Azerbaijan', 'Tajikistan', 'United Arab Emirates', 'Israel',
        'Laos', 'Kyrgyzstan', 'Turkmenistan', 'Singapore', 'State of Palestine',
        'Lebanon', 'Oman', 'Kuwait', 'Georgia', 'Mongolia',
        'Armenia', 'Qatar', 'Bahrain', 'Timor-Leste', 'Cyprus',
        'Bhutan', 'Maldives', 'Brunei'
    ],'Europe': [
        'Russia', 'Germany', 'United Kingdom', 'France', 'Italy',
        'Spain', 'Poland', 'Ukraine', 'Romania', 'Netherlands',
        'Belgium', 'Sweden', 'Czech Republic (Czechia)', 'Greece', 'Portugal',
        'Hungary', 'Belarus', 'Austria', 'Switzerland', 'Serbia',
        'Bulgaria', 'Denmark', 'Slovakia', 'Finland', 'Norway',
        'Ireland', 'Croatia', 'Moldova', 'Bosnia and Herzegovina', 'Albania',
        'Lithuania', 'Slovenia', 'North Macedonia', 'Latvia', 'Estonia',
        'Luxembourg', 'Montenegro', 'Malta', 'Iceland', 'Andorra',
        'Liechtenstein', 'Monaco', 'San Marino', 'Holy See'
    ],'Oceania': [
        'Australia', 'Papua New Guinea', 'New Zealand', 'Fiji',
        'Solomon Islands', 'Micronesia', 'Vanuatu', 'Samoa',
        'Kiribati', 'Tonga', 'Marshall Islands', 'Palau',
        'Nauru', 'Tuvalu'
    ],'North America': [
        'United States', 'Mexico', 'Canada', 'Guatemala', 'Haiti',
        'Dominican Republic', 'Cuba', 'Honduras', 'Nicaragua', 'El Salvador',
        'Costa Rica', 'Panama', 'Puerto Rico', 'Jamaica', 'Trinidad and Tobago',
        'Bahamas', 'Belize', 'Guadeloupe', 'Martinique', 'Barbados',
        'Curaçao', 'Saint Lucia', 'Grenada', 'Aruba', 'Saint Vincent and the Grenadines',
        'United States Virgin Islands', 'Antigua and Barbuda', 'Dominica', 'Cayman Islands', 'Bermuda',
        'Greenland', 'Saint Kitts and Nevis', 'Turks and Caicos Islands', 'Sint Maarten', 'Saint Martin',
        'British Virgin Islands', 'Caribbean Netherlands', 'Anguilla', 'Saint Barthélemy', 'Saint Pierre and Miquelon',
        'Montserrat'
    ],'South America': [
        'Argentina', 'Bolivia', 'Brazil', 'Chile', 'Colombia',
        'Ecuador', 'Falkland Islands', 'French Guiana', 'Guyana',
        'Paraguay', 'Peru', 'Suriname', 'Uruguay', 'Venezuela'
    ]}

In [38]:
# Invert continent -> [countries] into a flat country -> continent lookup.
# If a country name were listed twice, the entry seen last would win.
country_to_continent = dict(
    (country, continent)
    for continent, countries in continent_mapping.items()
    for country in countries
)

# Tag every row with its continent; names missing from the lookup become NaN.
data['Continent'] = data['Country'].map(country_to_continent)

In [39]:
# Distinct country names that did not resolve to a continent (Continent is NaN).
unique_nan_countries = data.loc[data['Continent'].isna(), 'Country'].unique()

unique_nan_countries

array([], dtype=object)

In [40]:
# Re-display the first rows to confirm the new 'Continent' column is present.
data.head()

Unnamed: 0.1,Unnamed: 0,Country,Year,Rank,Total,C1: Security Apparatus,C2: Factionalized Elites,C3: Group Grievance,E1: Economy,E2: Economic Inequality,E3: Human Flight and Brain Drain,P1: State Legitimacy,P2: Public Services,P3: Human Rights,S1: Demographic Pressures,S2: Refugees and IDPs,X1: External Intervention,Change from Previous Year,Continent
0,0,Sudan,2006,1st,112.3,9.8,9.1,9.7,7.5,9.2,9.1,9.5,9.5,9.8,9.6,9.7,9.8,,Africa
1,1,Congo Democratic Republic,2006,2nd,110.1,9.8,9.6,9.1,8.1,9.0,8.0,9.0,9.0,9.5,9.5,9.5,10.0,,Africa
2,2,Cote d'Ivoire,2006,3rd,109.2,9.8,9.8,9.8,9.0,8.0,8.5,10.0,8.5,9.4,8.8,7.6,10.0,,Africa
3,3,Iraq,2006,4th,109.0,9.8,9.7,9.8,8.2,8.7,9.1,8.5,8.3,9.7,8.9,8.3,10.0,,Asia
4,4,Zimbabwe,2006,5th,108.9,9.4,8.5,8.5,9.8,9.2,9.0,8.9,9.5,9.5,9.7,8.9,8.0,,Africa


In [41]:
# Raw indicator columns, grouped three per category.
security_indicators = [
    'C1: Security Apparatus',
    'C2: Factionalized Elites',
    'C3: Group Grievance',
]
economy_indicators = [
    'E1: Economy',
    'E2: Economic Inequality',
    'E3: Human Flight and Brain Drain',
]
politics_indicators = [
    'P1: State Legitimacy',
    'P2: Public Services',
    'P3: Human Rights',
]
society_indicators = [
    'S1: Demographic Pressures',
    'S2: Refugees and IDPs',
    'X1: External Intervention',
]

# One derived column per category: the row-wise mean of that category's indicators.
data['Average Security'] = data.loc[:, security_indicators].mean(axis='columns')
data['Average Economy'] = data.loc[:, economy_indicators].mean(axis='columns')
data['Average Politics'] = data.loc[:, politics_indicators].mean(axis='columns')
data['Average Society'] = data.loc[:, society_indicators].mean(axis='columns')

In [42]:
# Preview the four derived category averages next to the identifying columns.
data[['Country','Continent', 'Year', 'Average Security', 'Average Economy', 'Average Politics', 'Average Society']].head()

Unnamed: 0,Country,Continent,Year,Average Security,Average Economy,Average Politics,Average Society
0,Sudan,Africa,2006,9.533333,8.6,9.6,9.7
1,Congo Democratic Republic,Africa,2006,9.5,8.366667,9.166667,9.666667
2,Cote d'Ivoire,Africa,2006,9.8,8.5,9.3,8.8
3,Iraq,Asia,2006,9.766667,8.666667,8.833333,9.066667
4,Zimbabwe,Africa,2006,8.8,9.333333,9.3,8.866667


In [43]:
def cluster_and_visualize(data, year):
    """Cluster one year's countries on the four category averages and plot them.

    The rows for ``year`` are standardised, projected onto two principal
    components, split into three KMeans clusters and drawn as an interactive
    Plotly scatter in which each cluster gets a traffic-light colour.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Year', 'Country', 'Continent' and the four
        'Average ...' columns listed in ``features`` below.
    year : int
        Value of the 'Year' column to select.

    Returns
    -------
    None
        The figure is displayed via ``fig.show()``.

    Raises
    ------
    ValueError
        If fewer than three rows with complete feature values exist for
        ``year`` (three clusters cannot be formed).
    """
    features = ['Average Security', 'Average Economy', 'Average Politics', 'Average Society']

    # Rows with missing feature values cannot be projected by PCA, so drop them up front.
    filtered_data = data[data['Year'] == year].dropna(subset=features).copy()
    if len(filtered_data) < 3:
        raise ValueError(
            f"Need at least 3 rows with complete indicator data for year {year}; "
            f"found {len(filtered_data)}."
        )

    scaled = StandardScaler().fit_transform(filtered_data.loc[:, features].values)

    # Performing PCA to reduce dimensions to two
    pca_results = PCA(n_components=2).fit_transform(scaled)
    filtered_data['PCA-2d-one'] = pca_results[:, 0]
    filtered_data['PCA-2d-two'] = pca_results[:, 1]

    # KMeans clustering. n_init is pinned because its default differs between
    # scikit-learn releases, which would otherwise change the clusters found.
    kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
    filtered_data['Cluster'] = kmeans.fit_predict(pca_results)

    # Rank clusters by their mean raw indicator score (lowest first). Ranking on
    # the first principal component is unreliable because the sign of a PCA axis
    # is arbitrary, which could swap the green and red clusters.
    severity_order = (
        filtered_data.groupby('Cluster')[features].mean().mean(axis=1).sort_values().index
    )
    rank_of_cluster = {cluster_id: str(rank) for rank, cluster_id in enumerate(severity_order)}

    # String labels make Plotly Express treat the colour column as categorical,
    # so color_discrete_map is honoured (numeric columns get a continuous scale).
    filtered_data['Cluster_Label'] = filtered_data['Cluster'].map(rank_of_cluster)
    color_map = {'0': 'green', '1': 'yellow', '2': 'red'}

    fig = px.scatter(filtered_data, x='PCA-2d-one', y='PCA-2d-two',
                     color='Cluster_Label',
                     hover_data=['Continent', 'Country'],
                     text='Country',
                     color_discrete_map=color_map,
                     category_orders={'Cluster_Label': ['0', '1', '2']})

    fig.update_layout(
        title=f'Clustering of Countries Based on Indicators in {year}',
        showlegend=False
    )

    fig.update_traces(marker=dict(size=10),
                      textposition='top center',
                      textfont=dict(size=10))

    fig.show()

In [44]:
cluster_and_visualize(data, 2023)

In [45]:
# The features for clustering are 'C1: Security Apparatus', 'C2: Factionalized Elites', 'C3: Group Grievance'.

def cluster_and_visualize_2(data, year):
    """Cluster one year's countries on the three cohesion indicators and plot them.

    The rows for ``year`` are standardised, projected onto two principal
    components, split into three KMeans clusters and drawn as an interactive
    Plotly scatter in which each cluster gets a traffic-light colour.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Year', 'Country' and the three 'C*' indicator columns
        listed in ``features`` below.
    year : int
        Value of the 'Year' column to select.

    Returns
    -------
    None
        The figure is displayed via ``fig.show()``.

    Raises
    ------
    ValueError
        If fewer than three rows with complete feature values exist for
        ``year`` (three clusters cannot be formed).
    """
    features = ['C1: Security Apparatus', 'C2: Factionalized Elites', 'C3: Group Grievance']

    # Rows with missing feature values cannot be projected by PCA, so drop them up front.
    filtered_data = data[data['Year'] == year].dropna(subset=features).copy()
    if len(filtered_data) < 3:
        raise ValueError(
            f"Need at least 3 rows with complete indicator data for year {year}; "
            f"found {len(filtered_data)}."
        )

    scaled = StandardScaler().fit_transform(filtered_data.loc[:, features].values)

    pca_results = PCA(n_components=2).fit_transform(scaled)
    filtered_data['PCA-2d-one'] = pca_results[:, 0]
    filtered_data['PCA-2d-two'] = pca_results[:, 1]

    # KMeans clustering. n_init is pinned because its default differs between
    # scikit-learn releases, which would otherwise change the clusters found.
    kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
    filtered_data['Cluster'] = kmeans.fit_predict(pca_results)

    # Rank clusters by their mean raw indicator score (lowest first). Ranking on
    # the first principal component is unreliable because the sign of a PCA axis
    # is arbitrary, which could swap the green and red clusters.
    ordered_clusters = (
        filtered_data.groupby('Cluster')[features].mean().mean(axis=1).sort_values().index
    )
    color_order = {old_label: str(new_label) for new_label, old_label in enumerate(ordered_clusters)}

    # String labels make Plotly Express treat the colour column as categorical,
    # so color_discrete_map is honoured (numeric columns get a continuous scale).
    filtered_data['Cluster_Label'] = filtered_data['Cluster'].map(color_order)
    color_map = {'0': 'green', '1': 'yellow', '2': 'red'}

    fig = px.scatter(filtered_data, x='PCA-2d-one', y='PCA-2d-two',
                     color='Cluster_Label',
                     hover_data=['Country'],
                     text='Country',
                     color_discrete_map=color_map,
                     category_orders={'Cluster_Label': ['0', '1', '2']})

    fig.update_layout(title=f'Clustering of Countries Based on COHESION INDICATORS in {year}',
        showlegend=False
    )

    fig.update_traces(marker=dict(size=10),
                      textposition='top center',
                      textfont=dict(size=10))

    fig.show()

In [46]:
cluster_and_visualize_2(data, 2023)

In [47]:
def cluster_and_visualize_3(data, year):
    """Cluster one year's countries on the three economic indicators and plot them.

    The rows for ``year`` are standardised, projected onto two principal
    components, split into three KMeans clusters and drawn as an interactive
    Plotly scatter in which each cluster gets a traffic-light colour.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Year', 'Country' and the three 'E*' indicator columns
        listed in ``features`` below.
    year : int
        Value of the 'Year' column to select.

    Returns
    -------
    None
        The figure is displayed via ``fig.show()``.

    Raises
    ------
    ValueError
        If fewer than three rows with complete feature values exist for
        ``year`` (three clusters cannot be formed).
    """
    features = ['E1: Economy', 'E2: Economic Inequality', 'E3: Human Flight and Brain Drain']

    # Rows with missing feature values cannot be projected by PCA, so drop them up front.
    filtered_data = data[data['Year'] == year].dropna(subset=features).copy()
    if len(filtered_data) < 3:
        raise ValueError(
            f"Need at least 3 rows with complete indicator data for year {year}; "
            f"found {len(filtered_data)}."
        )

    scaled = StandardScaler().fit_transform(filtered_data.loc[:, features].values)

    pca_results = PCA(n_components=2).fit_transform(scaled)
    filtered_data['PCA-2d-one'] = pca_results[:, 0]
    filtered_data['PCA-2d-two'] = pca_results[:, 1]

    # KMeans clustering. n_init is pinned because its default differs between
    # scikit-learn releases, which would otherwise change the clusters found.
    kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
    filtered_data['Cluster'] = kmeans.fit_predict(pca_results)

    # Rank clusters by their mean raw indicator score (lowest first). Ranking on
    # the first principal component is unreliable because the sign of a PCA axis
    # is arbitrary, which could swap the green and red clusters.
    ordered_clusters = (
        filtered_data.groupby('Cluster')[features].mean().mean(axis=1).sort_values().index
    )
    color_order = {old_label: str(new_label) for new_label, old_label in enumerate(ordered_clusters)}

    # String labels make Plotly Express treat the colour column as categorical,
    # so color_discrete_map is honoured (numeric columns get a continuous scale).
    filtered_data['Cluster_Label'] = filtered_data['Cluster'].map(color_order)
    color_map = {'0': 'green', '1': 'yellow', '2': 'red'}

    fig = px.scatter(filtered_data, x='PCA-2d-one', y='PCA-2d-two',
                     color='Cluster_Label',
                     hover_data=['Country'],
                     text='Country',
                     color_discrete_map=color_map,
                     category_orders={'Cluster_Label': ['0', '1', '2']})

    fig.update_layout(title=f'Clustering of Countries Based on ECONOMIC INDICATORS in {year}',
        showlegend=False
    )

    fig.update_traces(marker=dict(size=10),
                      textposition='top center',
                      textfont=dict(size=10))
    fig.show()

In [48]:
cluster_and_visualize_3(data, 2023)

In [49]:
def cluster_and_visualize_4(data, year):
    """Cluster one year's countries on the three political indicators and plot them.

    The rows for ``year`` are standardised, projected onto two principal
    components, split into three KMeans clusters and drawn as an interactive
    Plotly scatter in which each cluster gets a traffic-light colour.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Year', 'Country' and the three 'P*' indicator columns
        listed in ``features`` below.
    year : int
        Value of the 'Year' column to select.

    Returns
    -------
    None
        The figure is displayed via ``fig.show()``.

    Raises
    ------
    ValueError
        If fewer than three rows with complete feature values exist for
        ``year`` (three clusters cannot be formed).
    """
    features = ['P1: State Legitimacy', 'P2: Public Services', 'P3: Human Rights']

    # Rows with missing feature values cannot be projected by PCA, so drop them up front.
    filtered_data = data[data['Year'] == year].dropna(subset=features).copy()
    if len(filtered_data) < 3:
        raise ValueError(
            f"Need at least 3 rows with complete indicator data for year {year}; "
            f"found {len(filtered_data)}."
        )

    scaled = StandardScaler().fit_transform(filtered_data.loc[:, features].values)

    pca_results = PCA(n_components=2).fit_transform(scaled)
    filtered_data['PCA-2d-one'] = pca_results[:, 0]
    filtered_data['PCA-2d-two'] = pca_results[:, 1]

    # KMeans clustering. n_init is pinned because its default differs between
    # scikit-learn releases, which would otherwise change the clusters found.
    kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
    filtered_data['Cluster'] = kmeans.fit_predict(pca_results)

    # Rank clusters by their mean raw indicator score (lowest first). Ranking on
    # the first principal component is unreliable because the sign of a PCA axis
    # is arbitrary, which could swap the green and red clusters.
    ordered_clusters = (
        filtered_data.groupby('Cluster')[features].mean().mean(axis=1).sort_values().index
    )
    color_order = {old_label: str(new_label) for new_label, old_label in enumerate(ordered_clusters)}

    # String labels make Plotly Express treat the colour column as categorical,
    # so color_discrete_map is honoured (numeric columns get a continuous scale).
    filtered_data['Cluster_Label'] = filtered_data['Cluster'].map(color_order)
    color_map = {'0': 'green', '1': 'yellow', '2': 'red'}

    fig = px.scatter(filtered_data, x='PCA-2d-one', y='PCA-2d-two',
                     color='Cluster_Label',
                     hover_data=['Country'],
                     text='Country',
                     color_discrete_map=color_map,
                     category_orders={'Cluster_Label': ['0', '1', '2']})

    fig.update_layout(title=f'Clustering of Countries Based on POLITICAL INDICATORS in {year}',
        showlegend=False
    )

    fig.update_traces(marker=dict(size=10),
                      textposition='top center',
                      textfont=dict(size=10))
    fig.show()

In [50]:
cluster_and_visualize_4(data, 2023)

In [51]:
def cluster_and_visualize_5(data, year):
    """Cluster one year's countries on the three social indicators and plot them.

    The rows for ``year`` are standardised, projected onto two principal
    components, split into three KMeans clusters and drawn as an interactive
    Plotly scatter in which each cluster gets a traffic-light colour.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Year', 'Country' and the 'S1', 'S2' and 'X1' indicator
        columns listed in ``features`` below.
    year : int
        Value of the 'Year' column to select.

    Returns
    -------
    None
        The figure is displayed via ``fig.show()``.

    Raises
    ------
    ValueError
        If fewer than three rows with complete feature values exist for
        ``year`` (three clusters cannot be formed).
    """
    features = ['S1: Demographic Pressures', 'S2: Refugees and IDPs', 'X1: External Intervention']

    # Rows with missing feature values cannot be projected by PCA, so drop them up front.
    filtered_data = data[data['Year'] == year].dropna(subset=features).copy()
    if len(filtered_data) < 3:
        raise ValueError(
            f"Need at least 3 rows with complete indicator data for year {year}; "
            f"found {len(filtered_data)}."
        )

    scaled = StandardScaler().fit_transform(filtered_data.loc[:, features].values)

    pca_results = PCA(n_components=2).fit_transform(scaled)
    filtered_data['PCA-2d-one'] = pca_results[:, 0]
    filtered_data['PCA-2d-two'] = pca_results[:, 1]

    # KMeans clustering. n_init is pinned because its default differs between
    # scikit-learn releases, which would otherwise change the clusters found.
    kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
    filtered_data['Cluster'] = kmeans.fit_predict(pca_results)

    # Rank clusters by their mean raw indicator score (lowest first). Ranking on
    # the first principal component is unreliable because the sign of a PCA axis
    # is arbitrary, which could swap the green and red clusters.
    ordered_clusters = (
        filtered_data.groupby('Cluster')[features].mean().mean(axis=1).sort_values().index
    )
    color_order = {old_label: str(new_label) for new_label, old_label in enumerate(ordered_clusters)}

    # String labels make Plotly Express treat the colour column as categorical,
    # so color_discrete_map is honoured (numeric columns get a continuous scale).
    filtered_data['Cluster_Label'] = filtered_data['Cluster'].map(color_order)
    color_map = {'0': 'green', '1': 'yellow', '2': 'red'}

    fig = px.scatter(filtered_data, x='PCA-2d-one', y='PCA-2d-two',
                     color='Cluster_Label',
                     hover_data=['Country'],
                     text='Country',
                     color_discrete_map=color_map,
                     category_orders={'Cluster_Label': ['0', '1', '2']})

    fig.update_layout(title=f'Clustering of Countries Based on SOCIAL INDICATORS in {year}',
        showlegend=False
    )

    fig.update_traces(marker=dict(size=10),
                      textposition='top center',
                      textfont=dict(size=10))

    fig.show()

In [52]:
cluster_and_visualize_5(data, 2023)