<a href="https://colab.research.google.com/github/uscudum/ML-ANS-Clustering-KMeans-Country/blob/main/Clustering_Localidades.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Aprendizaje no supervisado - Clustering

### Dataset:
https://www.kaggle.com/datasets/rohan0301/unsupervised-learning-on-country-data

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [3]:
# Read the country indicators CSV into a DataFrame and preview its first rows
df = pd.read_csv('Country-data.csv')
print(df.head(n=5))

               country  child_mort  exports  health  imports  income  \
0          Afghanistan        90.2     10.0    7.58     44.9    1610   
1              Albania        16.6     28.0    6.55     48.6    9930   
2              Algeria        27.3     38.4    4.17     31.4   12900   
3               Angola       119.0     62.3    2.85     42.9    5900   
4  Antigua and Barbuda        10.3     45.5    6.03     58.9   19100   

   inflation  life_expec  total_fer   gdpp  
0       9.44        56.2       5.82    553  
1       4.49        76.3       1.65   4090  
2      16.10        76.5       2.89   4460  
3      22.40        60.1       6.16   3530  
4       1.44        76.8       2.13  12200  


In [4]:
# Build the feature matrix from every column except 'country',
# whose values are names (strings) rather than numbers
X = df.drop('country', axis=1)
print(X.head(n=5))

   child_mort  exports  health  imports  income  inflation  life_expec  \
0        90.2     10.0    7.58     44.9    1610       9.44        56.2   
1        16.6     28.0    6.55     48.6    9930       4.49        76.3   
2        27.3     38.4    4.17     31.4   12900      16.10        76.5   
3       119.0     62.3    2.85     42.9    5900      22.40        60.1   
4        10.3     45.5    6.03     58.9   19100       1.44        76.8   

   total_fer   gdpp  
0       5.82    553  
1       1.65   4090  
2       2.89   4460  
3       6.16   3530  
4       2.13  12200  


In [5]:
# Standardize the features so that columns on large scales (e.g. income, gdpp)
# do not dominate the Euclidean distances used by K-Means
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

# Build and fit the model.
# - n_init is set explicitly to the value this run was already using by default,
#   which silences the n_init warning and keeps results stable across
#   scikit-learn versions.
# - random_state is fixed because K-Means starts from random centroids: without
#   a seed the cluster ids (0/1/2) can change on every run, making the cluster
#   numbers read in later cells non-reproducible.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(X_std)

  super()._check_params_vs_input(X, default_n_init=10)


In [6]:
# Attach the cluster label assigned to each row as a new 'cluster' column
df = df.assign(cluster=kmeans.labels_)

In [7]:
# Show the DataFrame, which now includes the 'cluster' column
# (pandas abbreviates the middle rows in the printed text)
print(df)

                 country  child_mort  exports  health  imports  income  \
0            Afghanistan        90.2     10.0    7.58     44.9    1610   
1                Albania        16.6     28.0    6.55     48.6    9930   
2                Algeria        27.3     38.4    4.17     31.4   12900   
3                 Angola       119.0     62.3    2.85     42.9    5900   
4    Antigua and Barbuda        10.3     45.5    6.03     58.9   19100   
..                   ...         ...      ...     ...      ...     ...   
162              Vanuatu        29.2     46.6    5.25     52.7    2950   
163            Venezuela        17.1     28.5    4.91     17.6   16500   
164              Vietnam        23.3     72.0    6.84     80.2    4490   
165                Yemen        56.3     30.0    5.18     34.4    4480   
166               Zambia        83.1     37.0    5.89     30.9    3280   

     inflation  life_expec  total_fer   gdpp  cluster  
0         9.44        56.2       5.82    553        2  

In [8]:
# Inspect the country -> cluster assignment for the first 60 rows
country_clusters = df[['country', 'cluster']]
print(country_clusters.head(60))

                     country  cluster
0                Afghanistan        2
1                    Albania        0
2                    Algeria        0
3                     Angola        2
4        Antigua and Barbuda        0
5                  Argentina        0
6                    Armenia        0
7                  Australia        1
8                    Austria        1
9                 Azerbaijan        0
10                   Bahamas        0
11                   Bahrain        1
12                Bangladesh        0
13                  Barbados        0
14                   Belarus        0
15                   Belgium        1
16                    Belize        0
17                     Benin        2
18                    Bhutan        0
19                   Bolivia        0
20    Bosnia and Herzegovina        0
21                  Botswana        2
22                    Brazil        0
23                    Brunei        1
24                  Bulgaria        0
25          

In [9]:
# Select the row for Uruguay to see which cluster it was assigned to
is_uruguay = df['country'] == 'Uruguay'
print(df.loc[is_uruguay])

     country  child_mort  exports  health  imports  income  inflation  \
160  Uruguay        10.6     26.3    8.35     25.4   17100       4.91   

     life_expec  total_fer   gdpp  cluster  
160        76.4       2.08  11900        0  


In [11]:
# Same Uruguay filter, keeping only the country name and its cluster
print(df.loc[df['country'] == 'Uruguay', ['country', 'cluster']])

     country  cluster
160  Uruguay        0
