In [1]:
import joblib
import pandas as pd
from sklearn.base import clone
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering, MeanShift, DBSCAN
from sklearn.decomposition import SparsePCA
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
import warnings
warnings.filterwarnings('ignore')

#### Load cleaned, complete dataset

In [3]:
# Read the cleaned dataset from a CSV located in the current working directory.
dataset_path = "complete_World_development_mesurement.csv"
df = pd.read_csv(dataset_path)

# Show the available column labels.
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Birth Rate', 'Business Tax Rate',
       'CO2 Emissions', 'Country', 'Days to Start Business',
       'Ease of Business', 'Energy Usage', 'GDP', 'Health Exp % GDP',
       'Health Exp/Capita', 'Hours to do Tax', 'Infant Mortality Rate',
       'Internet Usage', 'Lending Interest', 'Life Expectancy Female',
       'Life Expectancy Male', 'Mobile Phone Usage', 'Number of Records',
       'Population 0-14', 'Population 15-64', 'Population 65+',
       'Population Total', 'Population Urban', 'Tourism Inbound',
       'Tourism Outbound'],
      dtype='object')

In [4]:
# Shared preprocessing recipe, applied to each feature subset below:
#   1. 'scaler' - standardise every column with StandardScaler
#   2. 'spca'   - reduce to two sparse principal components (fixed seed)
scaler_step = ('scaler', StandardScaler())
spca_step = ('spca', SparsePCA(n_components=2, alpha=0.01, random_state=42))

preprocess = Pipeline(steps=[scaler_step, spca_step])

### Economic data

In [5]:
# Columns that make up the economic view of each record. The two 'Unnamed'
# index columns present in df are not dropped; they are simply not selected.
economic_columns = [
    'Business Tax Rate',
    'Days to Start Business',
    'Ease of Business',
    'Energy Usage',
    'GDP',
    'Hours to do Tax',
    'Internet Usage',
    'Lending Interest',
    'Mobile Phone Usage',
    'Population 15-64',
    'Population Total',
    'Tourism Inbound',
    'Tourism Outbound',
]

# Second name bound to the loaded frame (same object, not a copy).
old_df = df

economic_df = df[economic_columns]
economic_df

Unnamed: 0,Business Tax Rate,Days to Start Business,Ease of Business,Energy Usage,GDP,Hours to do Tax,Internet Usage,Lending Interest,Mobile Phone Usage,Population 15-64,Population Total,Tourism Inbound,Tourism Outbound
0,729.000,25.000,151.0,26998.0,5.479006e+10,451.0,0.0,0.0825,0.0,0.619,31719449.0,1.020000e+08,1.930000e+08
1,521.000,93.500,178.0,7499.0,9.129595e+09,272.0,0.0,1.0320,0.0,0.499,13924930.0,3.400000e+07,1.460000e+08
2,732.000,31.000,175.0,1983.0,2.359122e+09,270.0,0.0,0.0000,0.0,0.517,6949366.0,7.700000e+07,5.000000e+07
3,171.000,92.000,65.0,1836.0,5.788312e+09,140.0,0.0,0.1550,0.1,0.587,1755375.0,2.270000e+08,2.090000e+08
4,448.500,17.000,154.0,0.0,2.610959e+09,270.0,0.0,0.0000,0.0,0.505,11607944.0,2.300000e+07,3.000000e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2569,279.000,694.000,165.0,705.0,5.012121e+09,199.0,0.3,0.1170,1.1,0.656,534541.0,7.900000e+07,5.800000e+07
2570,291.375,41.000,63.0,18221.5,2.343634e+10,210.0,0.6,0.0770,1.4,0.705,1337439.0,5.525000e+08,1.505000e+08
2571,464.000,5.000,4.0,2132446.0,1.624460e+13,175.0,0.8,0.0330,1.0,0.667,313873685.0,2.000920e+11,1.265730e+11
2572,419.000,40.625,85.0,3132.0,5.000435e+10,310.0,0.5,0.1120,1.5,0.639,3395253.0,2.222000e+09,7.075000e+08


In [6]:
# Fit the shared pipeline on the economic features and keep the resulting
# two-component projection.
scaled_economic_data = preprocess.fit_transform(economic_df)

# Display (projected shape, input shape) side by side.
(scaled_economic_data.shape, economic_df.shape)

((2574, 2), (2574, 13))

### Health data

In [7]:
# Columns that make up the health view of each record.
health_columns = [
    'Birth Rate',
    'CO2 Emissions',
    'Energy Usage',
    'Health Exp % GDP',
    'Health Exp/Capita',
    'Infant Mortality Rate',
    'Internet Usage',
    'Life Expectancy Female',
    'Life Expectancy Male',
    'Mobile Phone Usage',
    'Population 0-14',
    'Population 15-64',
    'Population 65+',
    'Population Total',
    'Population Urban',
]

health_df = df[health_columns]

In [8]:
# Project the health features to two components.
# NOTE: this refits the shared `preprocess` object in place, so the state it
# learned from any earlier fit is replaced by the health-data fit.
scaled_health_data = preprocess.fit_transform(health_df)

# Display (projected shape, input shape) side by side.
(scaled_health_data.shape, health_df.shape)

((2574, 2), (2574, 15))

### Population data

In [9]:
# Columns that make up the population view of each record.
population_columns = [
    'Birth Rate',
    'Infant Mortality Rate',
    'Internet Usage',
    'Life Expectancy Female',
    'Life Expectancy Male',
    'Mobile Phone Usage',
    'Population 0-14',
    'Population 15-64',
    'Population 65+',
    'Population Total',
    'Population Urban',
    'Tourism Inbound',
    'Tourism Outbound',
]

population_df = df[population_columns]

In [10]:
# Project the population features to two components.
# NOTE: this refits the shared `preprocess` object in place, so after this
# cell it holds the scaler / SparsePCA state learned from population_df.
scaled_population_data = preprocess.fit_transform(population_df)

# Display (projected shape, input shape) side by side.
(scaled_population_data.shape, population_df.shape)

((2574, 2), (2574, 13))

In [11]:
# Candidate clustering algorithms, keyed by a short display name.
# The partition-based models share one cluster count and one seed.
n_clusters = 3
seed = 42

models = {
    'KMeans': KMeans(n_clusters=n_clusters, random_state=seed),
    'GMM': GaussianMixture(n_components=n_clusters, random_state=seed),
    'Agglomerative': AgglomerativeClustering(n_clusters=n_clusters, linkage='ward'),
    'Spectral': SpectralClustering(
        n_clusters=n_clusters,
        affinity='nearest_neighbors',
        random_state=seed,
    ),
    'MeanShift': MeanShift(),
    'DBSCAN': DBSCAN(eps=0.5, min_samples=5),
}

# Train every candidate on the 2-D economic projection. scikit-learn's fit()
# returns the estimator itself, so fitted_models references the same objects
# that are stored in `models`.
fitted_models = {}
for name in models:
    model = models[name]
    fitted_models[name] = model.fit(scaled_economic_data)

In [12]:
# Collect everything a consumer needs (preprocessing + projected data +
# fitted models) in one dictionary.
#
# The shared `preprocess` object is refitted in place by every
# fit_transform call in this notebook, so by this point it carries the
# scaler / SparsePCA state of whichever dataset was transformed last, not of
# the economic data the clustering models were trained on. Saving it as-is
# would pair the models with a transformer fitted on different features.
# Fit a fresh clone on economic_df so the stored transformer maps new
# economic records into the same space the models were fitted in (the
# pipeline's fixed random_state keeps the refit reproducible).
economic_preprocess = clone(preprocess).fit(economic_df)

pipeline_bundle = {
    'preprocess':      economic_preprocess,     # fitted on economic_df
    'economic_data':   scaled_economic_data,
    'population_data': scaled_population_data,
    'health_data' :    scaled_health_data,
    'models':          models                   # fitted in place on scaled_economic_data
}

In [13]:
# Serialise the bundle to the working directory; the call's return value
# (the list of files written) is shown as the cell output.
# NOTE: the file is pickle-based, so it should only be loaded from a trusted source.
bundle_path = 'clustering_pipeline.pkl'
joblib.dump(pipeline_bundle, bundle_path)

['clustering_pipeline.pkl']