In [None]:
# Colab-only cell: imports the `files` helper and calls its upload(); the return value is kept in `uploaded`.
# `files` is reused further down for the files.download(...) calls.
# NOTE(review): presumably this is how 'hotspot_meanimputed.csv' reaches the working directory — confirm.
from google.colab import files
uploaded = files.upload()

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Read 2020-2020 data (1205 records). Read mean imputed file or the original file with 0 as imputation
# NOTE(review): the year range "2020-2020" above looks like a typo — confirm the actual span of the data.
# The path is relative, so the CSV has to be present in the current working directory.
df_hotspot = pd.read_csv('hotspot_meanimputed.csv')

In [None]:
# Display the loaded frame as the cell output for a quick visual check.
df_hotspot

In [None]:
# Variable Selection
# Level: ADMIN_2_Admnistratif
# Population variables: 'population_totale', 'population_6-59_month'
# Malnutrition prevalence variables: 'gam_prevalence', 'sam_prevalence','mam_prevalence'
# Burden related variables: 'gam_burden', 'sam_burden', 'mam_burden'
# Nutrition + disease variables: 'diarrhee', 'malaria_fever', 'vita', 'deworming','measles', 'ari_cough'
# Conflict +food insecurity variables: 'INFORM_Conflict_Probability', 'Food_Insecurity_Probability', 'Political_violence', 'Conflict_probability', 'Human', 'HAZARD','Recent_Shocks', 'Food_Security','Uprooted_people','Conflict_Intensity'
# Disaster variables: 'Physical_exposure_to_flood', 'Land_Degradation', 'Droughts_probability_and_historical_impact', 'Natural',
# Socio economic variables: 'Development_&_Deprivation', 'Inequality', 'Aid_Dependency','Socio-Economic_Vulnerability',  'Health_Conditions'
# Gov+Infrastructure variables: 'Governance', 'Institutional', 'Communication', 'Physical_infrastructure', 'Access_to_health_care', 'Infrastructure'
# Outcome class: 'priority_level_validated_by_the_clusters'

# Model 1
### Malnutrition prevalence

In [None]:
# Model 1 input: identifier columns plus the three malnutrition prevalence indicators.
# The burden columns ('gam_burden', 'sam_burden', 'mam_burden') are not included in this model.
df_hotspot_mdl1 = (
    df_hotspot[['year', 'priority_level_validated_by_the_clusters', 'ADMIN_2_Admnistratif',
                'gam_prevalence', 'sam_prevalence', 'mam_prevalence']]
    # Impute missing values with 0. The non-inplace form returns a new, independent frame,
    # which avoids pandas' SettingWithCopyWarning on a column-subset of df_hotspot.
    .fillna(0)
)
# Optional: restrict the model to a single year.
#df_hotspot_mdl1 = df_hotspot_mdl1[df_hotspot_mdl1['year'] == 2020]

In [None]:
# List the distinct values of the validated priority level column in the Model 1 frame.
df_hotspot_mdl1['priority_level_validated_by_the_clusters'].unique()

In [None]:
# Normalize data
# Features are selected by name rather than by position so that the 'clusteringlabels'
# column added later in the notebook can never leak into the features when this cell is re-run.
mdl1_features = ['gam_prevalence', 'sam_prevalence', 'mam_prevalence']
scaler = MinMaxScaler()  # rescales every feature to the [0, 1] range
mdl1_data = scaler.fit_transform(df_hotspot_mdl1[mdl1_features])
mdl1_data

In [None]:
# Clustering process
# Select number of classes = 4 to match the four classes provided by the expert
# random_state pins the centroid initialisation so the cluster labels and the SSE are
# reproducible across runs; n_init is set explicitly because its default differs
# between scikit-learn releases. KMeans is imported in the notebook's import cell.
mdl1 = KMeans(n_clusters=4, n_init=10, random_state=42)
mdl1.fit(mdl1_data)
# inertia_ is the within-cluster sum of squared distances (SSE) of the fitted model.
print(f'SSE Mdl1: {mdl1.inertia_}')

In [None]:
# Unsupervised clusters
# Use this result to validate priority level provided by expert and clustering solution provided by the kmeans algorithm
# assign() returns a new frame with the label column added (or overwritten), which keeps
# this cell idempotent and avoids SettingWithCopyWarning from writing into a derived frame.
df_hotspot_mdl1 = df_hotspot_mdl1.assign(clusteringlabels=mdl1.labels_)
# Written as UTF-8 with BOM; the frame's index is included as the first CSV column.
df_hotspot_mdl1.to_csv('Clustering prevalence.csv', encoding = 'utf-8-sig')
files.download('Clustering prevalence.csv')

In [None]:
# Show the distinct cluster ids assigned by the Model 1 k-means fit.
np.unique(mdl1.labels_)

In [None]:
# Unsupervised clusters
# Use this result to validate priority level provided by expert and clustering solution provided by the kmeans algorithm
# NOTE(review): this cell repeats the Model 1 export a few cells above and triggers a second
# download of the same file — it can probably be deleted; confirm before removing.
# assign() returns a new frame with the label column added (or overwritten), which keeps
# this cell idempotent and avoids SettingWithCopyWarning from writing into a derived frame.
df_hotspot_mdl1 = df_hotspot_mdl1.assign(clusteringlabels=mdl1.labels_)
df_hotspot_mdl1.to_csv('Clustering prevalence.csv', encoding = 'utf-8-sig')
files.download('Clustering prevalence.csv')

# Model 2
### Nutrition + disease variables

In [None]:
# Model 2 input: identifier columns plus the nutrition + disease indicators.
df_hotspot_mdl2 = (
    df_hotspot[['year', 'priority_level_validated_by_the_clusters', 'ADMIN_2_Admnistratif',
                'diarrhee', 'malaria_fever', 'vita', 'deworming', 'measles', 'ari_cough']]
    # Impute missing values with 0. The non-inplace form returns a new, independent frame,
    # which avoids pandas' SettingWithCopyWarning on a column-subset of df_hotspot.
    .fillna(0)
)
# Optional: restrict the model to a single year.
#df_hotspot_mdl2 = df_hotspot_mdl2[df_hotspot_mdl2['year'] == 2020]

In [None]:
# Display the Model 2 frame as the cell output for a quick visual check.
df_hotspot_mdl2

In [None]:
# Normalize data
# All six nutrition/disease indicators are scaled. They are selected by name because a
# positional slice such as iloc[:, 3:8] stops one column short and silently drops 'ari_cough'.
mdl2_features = ['diarrhee', 'malaria_fever', 'vita', 'deworming', 'measles', 'ari_cough']
scaler = MinMaxScaler()  # rescales every feature to the [0, 1] range
mdl2_data = scaler.fit_transform(df_hotspot_mdl2[mdl2_features])
mdl2_data

In [None]:
# Clustering process
# Select number of classes = 4 to match the four classes provided by the expert
# random_state pins the centroid initialisation so the cluster labels and the SSE are
# reproducible across runs; n_init is set explicitly because its default differs
# between scikit-learn releases.
mdl2 = KMeans(n_clusters=4, n_init=10, random_state=42)
mdl2.fit(mdl2_data)
# inertia_ is the within-cluster sum of squared distances (SSE); the label names Model 2.
print(f'SSE Mdl2: {mdl2.inertia_}')

In [None]:
# Unsupervised clusters
# Use this result to validate priority level provided by expert and clustering solution provided by the kmeans algorithm
# assign() returns a new frame with the label column added (or overwritten), which keeps
# this cell idempotent and avoids SettingWithCopyWarning from writing into a derived frame.
df_hotspot_mdl2 = df_hotspot_mdl2.assign(clusteringlabels=mdl2.labels_)
# Written as UTF-8 with BOM; the frame's index is included as the first CSV column.
df_hotspot_mdl2.to_csv('Nutrition + disease data.csv', encoding = 'utf-8-sig')
files.download('Nutrition + disease data.csv')

# Model 3
### Conflict + Food Insecurity

In [None]:
# Conflict +food insecurity variables: 'INFORM_Conflict_Probability', 'Food_Insecurity_Probability', 'Political_violence', 'Conflict_probability', 'Human', 'HAZARD','Recent_Shocks', 'Food_Security','Uprooted_people','Conflict_Intensity'

In [None]:
# Model 3 input: identifier columns plus the conflict + food insecurity indicators.
# NOTE(review): 'Human' is listed with the conflict/food-insecurity variables in the
# variable overview comment but is not selected here — confirm that this is intended.
df_hotspot_mdl3 = (
    df_hotspot[['year', 'priority_level_validated_by_the_clusters', 'ADMIN_2_Admnistratif',
                'INFORM_Conflict_Probability', 'Food_Insecurity_Probability', 'Political_violence',
                'Conflict_probability', 'HAZARD', 'Recent_Shocks', 'Food_Security',
                'Uprooted_people', 'Conflict_Intensity']]
    # Data type conversion: values that cannot be parsed as numbers become NaN (errors='coerce').
    # assign() works on a new frame, so nothing is written into a column-subset of df_hotspot.
    .assign(
        Food_Security=lambda d: pd.to_numeric(d['Food_Security'], errors='coerce'),
        Food_Insecurity_Probability=lambda d: pd.to_numeric(d['Food_Insecurity_Probability'], errors='coerce'),
    )
    # Impute missing values (including the NaNs produced by the coercion above) with 0.
    .fillna(0)
)
# Optional: restrict the model to a single year.
#df_hotspot_mdl3 = df_hotspot_mdl3[df_hotspot_mdl3['year'] == 2020]

In [None]:
# Display the Model 3 frame as the cell output for a quick visual check.
df_hotspot_mdl3

In [None]:
# Normalize data
# Features are selected by name rather than by position so that the 'clusteringlabels'
# column added later in the notebook can never leak into the features when this cell is re-run.
mdl3_features = ['INFORM_Conflict_Probability', 'Food_Insecurity_Probability', 'Political_violence',
                 'Conflict_probability', 'HAZARD', 'Recent_Shocks', 'Food_Security',
                 'Uprooted_people', 'Conflict_Intensity']
scaler = MinMaxScaler()  # rescales every feature to the [0, 1] range
mdl3_data = scaler.fit_transform(df_hotspot_mdl3[mdl3_features])
mdl3_data

In [None]:
# Clustering process
# Select number of classes = 4 to match the four classes provided by the expert
# random_state pins the centroid initialisation so the cluster labels and the SSE are
# reproducible across runs; n_init is set explicitly because its default differs
# between scikit-learn releases.
mdl3 = KMeans(n_clusters=4, n_init=10, random_state=42)
mdl3.fit(mdl3_data)
# inertia_ is the within-cluster sum of squared distances (SSE) of the fitted model.
print(f'SSE Mdl3: {mdl3.inertia_}')

In [None]:
# Unsupervised clusters
# Use this result to validate priority level provided by expert and clustering solution provided by the kmeans algorithm
# assign() returns a new frame with the label column added (or overwritten), which keeps
# this cell idempotent and avoids SettingWithCopyWarning from writing into a derived frame.
df_hotspot_mdl3 = df_hotspot_mdl3.assign(clusteringlabels=mdl3.labels_)
# Written as UTF-8 with BOM; the frame's index is included as the first CSV column.
df_hotspot_mdl3.to_csv('Conflict + Food Insecurity.csv', encoding = 'utf-8-sig')
files.download('Conflict + Food Insecurity.csv')

# Model 4
### Disaster related variables

In [None]:
# Model 4 input: identifier columns plus the disaster related indicators.
df_hotspot_mdl4 = (
    df_hotspot[['year', 'priority_level_validated_by_the_clusters', 'ADMIN_2_Admnistratif',
                'Physical_exposure_to_flood', 'Land_Degradation',
                'Droughts_probability_and_historical_impact', 'Natural']]
    # Impute missing values with 0. The non-inplace form returns a new, independent frame,
    # which avoids pandas' SettingWithCopyWarning on a column-subset of df_hotspot.
    .fillna(0)
)
# Optional: restrict the model to a single year.
#df_hotspot_mdl4 = df_hotspot_mdl4[df_hotspot_mdl4['year'] == 2020]

In [None]:
# Normalize data
# All four disaster indicators are scaled. They are selected by name because a positional
# slice such as iloc[:, 3:6] stops one column short and silently drops 'Natural'.
mdl4_features = ['Physical_exposure_to_flood', 'Land_Degradation',
                 'Droughts_probability_and_historical_impact', 'Natural']
scaler = MinMaxScaler()  # rescales every feature to the [0, 1] range
mdl4_data = scaler.fit_transform(df_hotspot_mdl4[mdl4_features])
mdl4_data

In [None]:
# Clustering process
# Select number of classes = 4 to match the four classes provided by the expert
# random_state pins the centroid initialisation so the cluster labels and the SSE are
# reproducible across runs; n_init is set explicitly because its default differs
# between scikit-learn releases.
mdl4 = KMeans(n_clusters=4, n_init=10, random_state=42)
mdl4.fit(mdl4_data)
# inertia_ is the within-cluster sum of squared distances (SSE); the label names Model 4.
print(f'SSE Mdl4: {mdl4.inertia_}')

In [None]:
# Unsupervised clusters
# Use this result to validate priority level provided by expert and clustering solution provided by the kmeans algorithm
# assign() returns a new frame with the label column added (or overwritten), which keeps
# this cell idempotent and avoids SettingWithCopyWarning from writing into a derived frame.
df_hotspot_mdl4 = df_hotspot_mdl4.assign(clusteringlabels=mdl4.labels_)
# Written as UTF-8 with BOM; the frame's index is included as the first CSV column.
df_hotspot_mdl4.to_csv('Disaster variables.csv', encoding = 'utf-8-sig')
files.download('Disaster variables.csv')

# Model 5
### Socio economic + Govt + infrastructure variables

In [None]:
# Model 5 input: identifier columns plus the socio economic, governance and infrastructure indicators.
df_hotspot_mdl5 = (
    df_hotspot[['year', 'priority_level_validated_by_the_clusters', 'ADMIN_2_Admnistratif',
                'Development_&_Deprivation', 'Inequality', 'Aid_Dependency',
                'Socio-Economic_Vulnerability', 'Health_Conditions',
                'Governance', 'Institutional', 'Communication', 'Physical_infrastructure',
                'Access_to_health_care', 'Infrastructure']]
    # Impute missing values with 0. The non-inplace form returns a new, independent frame,
    # which avoids pandas' SettingWithCopyWarning on a column-subset of df_hotspot.
    .fillna(0)
)
# Optional: restrict the model to a single year.
#df_hotspot_mdl5 = df_hotspot_mdl5[df_hotspot_mdl5['year'] == 2020]

In [None]:
# Min-max scale the Model 5 feature columns (positions 3 through 13 of the frame).
from sklearn.preprocessing import MinMaxScaler
mdl5_feature_block = df_hotspot_mdl5.iloc[:, 3:14]
scaler = MinMaxScaler()
# Fit the per-column min/max first, then apply the scaling to the same columns.
scaler.fit(mdl5_feature_block)
mdl5_data = scaler.transform(mdl5_feature_block)
mdl5_data

In [None]:
# Clustering process
# Select number of classes = 4 to match the four classes provided by the expert
# random_state pins the centroid initialisation so the cluster labels and the SSE are
# reproducible across runs; n_init is set explicitly because its default differs
# between scikit-learn releases.
mdl5 = KMeans(n_clusters=4, n_init=10, random_state=42)
mdl5.fit(mdl5_data)
# inertia_ is the within-cluster sum of squared distances (SSE) of the fitted model.
print(f'SSE Mdl5: {mdl5.inertia_}')

In [None]:
# Unsupervised clusters
# Use this result to validate priority level provided by expert and clustering solution provided by the kmeans algorithm
# assign() returns a new frame with the label column added (or overwritten), which keeps
# this cell idempotent and avoids SettingWithCopyWarning from writing into a derived frame.
df_hotspot_mdl5 = df_hotspot_mdl5.assign(clusteringlabels=mdl5.labels_)
# Written as UTF-8 with BOM; the frame's index is included as the first CSV column.
df_hotspot_mdl5.to_csv('Socio economic + Govt + infrastructure.csv', encoding = 'utf-8-sig')
files.download('Socio economic + Govt + infrastructure.csv')

# Model 6
### ALL VARIABLES (EXCEPT MALNUTRITION)

In [None]:
# Model 6 input: identifier columns plus every indicator group except malnutrition.
# .copy() makes the selection an independent frame, so the column conversions and the
# in-place fillna in the following cells do not raise SettingWithCopyWarning.
df_hotspot_mdl = df_hotspot[[
    'year', 'priority_level_validated_by_the_clusters', 'ADMIN_2_Admnistratif',
    # nutrition + disease
    'diarrhee', 'malaria_fever', 'vita', 'deworming', 'measles', 'ari_cough',
    # conflict + food insecurity
    'INFORM_Conflict_Probability', 'Food_Insecurity_Probability', 'Political_violence',
    'Conflict_probability', 'HAZARD', 'Recent_Shocks', 'Food_Security',
    'Uprooted_people', 'Conflict_Intensity',
    # disaster
    'Physical_exposure_to_flood', 'Land_Degradation',
    'Droughts_probability_and_historical_impact', 'Natural',
    # socio economic
    'Development_&_Deprivation', 'Inequality', 'Aid_Dependency',
    'Socio-Economic_Vulnerability', 'Health_Conditions',
    # governance + infrastructure
    'Governance', 'Institutional', 'Communication', 'Physical_infrastructure',
    'Access_to_health_care', 'Infrastructure',
]].copy()


In [None]:
# Display the Model 6 frame as the cell output for a quick visual check.
df_hotspot_mdl

In [None]:
# Data type conversion
# Values that cannot be parsed as numbers become NaN (errors='coerce').
# assign() returns a new frame instead of writing into one derived from df_hotspot,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
df_hotspot_mdl = df_hotspot_mdl.assign(
    Food_Security=lambda d: pd.to_numeric(d['Food_Security'], errors='coerce'),
    Food_Insecurity_Probability=lambda d: pd.to_numeric(d['Food_Insecurity_Probability'], errors='coerce'),
)

In [None]:
# Show the dtype of every column in the Model 6 frame.
df_hotspot_mdl.dtypes

In [None]:
# Impute missing values with 0. Rebinding to the returned frame (instead of inplace=True)
# avoids SettingWithCopyWarning on a frame derived from df_hotspot.
df_hotspot_mdl = df_hotspot_mdl.fillna(0)
# Optional: restrict the model to a single year.
#df_hotspot_mdl = df_hotspot_mdl[df_hotspot_mdl['year'] == 2020]

In [None]:
# List the distinct values of the validated priority level column in the Model 6 frame.
df_hotspot_mdl['priority_level_validated_by_the_clusters'].unique()

In [None]:
# Normalize data
# Everything except the identifier columns is a feature. 'clusteringlabels' is excluded as
# well so that re-running this cell after the labels have been attached does not feed them
# back in as a feature; errors='ignore' covers the first run, when that column does not exist yet.
mdl_non_features = ['year', 'priority_level_validated_by_the_clusters',
                    'ADMIN_2_Admnistratif', 'clusteringlabels']
mdl_features = df_hotspot_mdl.drop(columns=mdl_non_features, errors='ignore')
scaler = MinMaxScaler()  # rescales every feature to the [0, 1] range
mdl_data = scaler.fit_transform(mdl_features)
mdl_data

In [None]:
# Clustering process
# Select number of classes = 4 to match the four classes provided by the expert
# random_state pins the centroid initialisation so the cluster labels and the SSE are
# reproducible across runs; n_init is set explicitly because its default differs
# between scikit-learn releases. KMeans is imported in the notebook's import cell.
mdl = KMeans(n_clusters=4, n_init=10, random_state=42)
mdl.fit(mdl_data)
# inertia_ is the within-cluster sum of squared distances (SSE) of the fitted model.
print(f'SSE Mdl: {mdl.inertia_}')

In [None]:
# Unsupervised clusters
# Use this result to validate priority level provided by expert and clustering solution provided by the kmeans algorithm
# assign() returns a new frame with the label column added (or overwritten), which keeps
# this cell idempotent and avoids SettingWithCopyWarning from writing into a derived frame.
df_hotspot_mdl = df_hotspot_mdl.assign(clusteringlabels=mdl.labels_)
# Written as UTF-8 with BOM; the frame's index is included as the first CSV column.
df_hotspot_mdl.to_csv('All variables(except malnutrition).csv', encoding = 'utf-8-sig')
files.download('All variables(except malnutrition).csv')