In [57]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plot
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid
from sklearn.cluster import KMeans
from sklearn import metrics

<h1> part 1 from email </h1>
<h2> determine which features contribute to the components </h2>

In [63]:
# Data import
df=pd.read_csv('atussum_1921-reduced.csv')


In [61]:
# Create feature list and target
features = []
for i in range(1,17):
    features.append('t%02d'%i)

features.append('t18')
features.append('t50')

target = ['TESEX']

In [64]:
# Data preprocessing
# rows for 2019
df_19 = df[df['TUYEAR'] == 2019]
# rows for 2021
df_21 = df[df['TUYEAR'] == 2021]

# selecting cols that only contains the feature
x_19 = df_19.loc[:, features]
x_21 = df_21.loc[:, features]
# normaliza x data returned nparray
x_19_scaled = StandardScaler().fit_transform(x_19.values)
x_21_scaled = StandardScaler().fit_transform(x_21.values)
# reassemble nparray back to dataframe
x_19_df = pd.DataFrame(x_19_scaled,index=x_19.index, columns=x_19.columns)
x_21_df = pd.DataFrame(x_21_scaled,index=x_21.index, columns=x_21.columns)

# selecting target TODO 
y = df.loc[:,target].values

# Deleting original df because we don't need it anymore
del df

In [65]:
# Perfrom pca
pca = PCA(n_components=2)
pca_2 = PCA(n_components=2)

# 2019
pca_19 = pca.fit_transform(x_19)
print(pca.explained_variance_ratio_)
pca_19_df = pd.DataFrame(data = pca_19
             , columns = ['principal component 1', 'principal component 2'])
# 2021
pca_21 = pca_2.fit_transform(x_21)
print(pca_2.explained_variance_ratio_)
pca_21_df = pd.DataFrame(data = pca_21
             , columns = ['principal component 1', 'principal component 2'])

[0.43633019 0.21009205]
[0.44347902 0.2161825 ]


In [66]:
# Analysis results from pca
dataset_pca_19 = pd.DataFrame(abs(pca.components_),columns=x_19_df.columns,index=['PC_1', 'PC_2'])
dataset_pca_21 = pd.DataFrame(abs(pca_2.components_),columns=x_21_df.columns,index=['PC_1', 'PC_2'])

<h2> year 2019 </h2>

In [67]:
# Filter out the feature has higher impact to the component with threshold 0.3
print("\n*************** Most important features *************************")
print('As per PC 1:\n', (dataset_pca_19[dataset_pca_19 > 0.3].iloc[0]).dropna())   
print('\n\nAs per PC 2:\n', (dataset_pca_19[dataset_pca_19 > 0.3].iloc[1]).dropna())
print("\n******************************************************************")


*************** Most important features *************************
As per PC 1:
 t05    0.767699
t12    0.625132
Name: PC_1, dtype: float64


As per PC 2:
 t01    0.356535
t02    0.411126
t05    0.470627
t12    0.681885
Name: PC_2, dtype: float64

******************************************************************


<h2> year 2021 </h2>

In [68]:
# Filter out the feature has higher impact to the component with threshold 0.3
print("\n*************** Most important features *************************")
print('As per PC 1:\n', (dataset_pca_21[dataset_pca_21 > 0.3].iloc[0]).dropna())   
print('\n\nAs per PC 2:\n', (dataset_pca_21[dataset_pca_21 > 0.3].iloc[1]).dropna())
print("\n******************************************************************")


*************** Most important features *************************
As per PC 1:
 t05    0.773376
t12    0.620138
Name: PC_1, dtype: float64


As per PC 2:
 t01    0.301223
t02    0.472248
t05    0.460071
t12    0.676540
Name: PC_2, dtype: float64

******************************************************************


Selecting no of clusters based on silhoutte score method 

In [69]:
def get_optimum_n_clusters(data):
  n_clusters_candidates = [3,4,5,6,7,8,9,10,12,15,20]
  parameter_grid = ParameterGrid({'n_clusters': n_clusters_candidates})

  kmeans_model = KMeans() 
  silhouette_scores = []
  best_score = -1

  for p in parameter_grid:
    kmeans_model.set_params(**p)
    kmeans_model.fit(data)

    ss = metrics.silhouette_score(data, kmeans_model.labels_)
    silhouette_scores += [ss]

    if ss > best_score:
      best_score = ss
      best_grid = p

  return best_grid['n_clusters']

optimum_clusters_19 = get_optimum_n_clusters(pca_19_df)
optimum_clusters_21 = get_optimum_n_clusters(pca_21_df)

Applying K-means 

In [None]:
# Applyting k means to 2019 data 
kmeans_19 = KMeans(n_clusters=optimum_clusters_19)
kmeans_19.fit(pca_19_df)

#Applying k means to 2021 data 
kmeans_21 = KMeans(n_clusters=optimum_clusters_21)
kmeans_21.fit(pca_21_df)

Adding K-means labels to df_19 and df_21 for visualizations 

In [71]:
df_19.loc[:, 'cluster_labels'] = kmeans_19.labels_
df_21.loc[:, 'cluster_labels'] = kmeans_21.labels_