In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind
from sklearn import metrics
from scipy import stats
from statistics import mode

#Importing libraries for model creation
from sklearn.mixture import GaussianMixture
from sklearn.mixture import BayesianGaussianMixture
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import VotingClassifier 

#Importing pre-processing
from sklearn import preprocessing
#Decomposition
from sklearn.decomposition import PCA
import os
# Print the full path of every file found under the read-only Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tabular-playground-series-jul-2022/sample_submission.csv
/kaggle/input/tabular-playground-series-jul-2022/data.csv


## Problem Approach

In this notebook we're tasked with clustering an unlabeled dataset, with the evaluation metric being the Rand score. For our model we compare a Gaussian Mixture with a Bayesian Gaussian Mixture and end up settling on the Bayesian Gaussian Mixture for the final submission. The overall approach is as follows:

* Data Loading
* Data Normalization
* Feature Selection
* Determining Optimal Number of Clusters
* Training models: Gaussian Mixture, Bayesian Gaussian Mixture, Mini-Batch K-Means
* Ensembling predictions
* Submitting result from ensembled model

## Loading Data

In [2]:
# Read the competition data and replace any missing values with 0.
# index_col=False stops pandas from using the first column as the index.
df = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jul-2022/data.csv',
    index_col=False,
).fillna(0)

## Data Scaling

A Gaussian Mixture model assumes that each variable follows a Gaussian distribution. Examining the dataset below, we can see this isn't the case, with many variables being skewed. To correct this I used the sklearn PowerTransformer with the Yeo-Johnson method; unlike a Box-Cox transform, this method doesn't require all data points to be positive, and it resulted in a better score than other transformers.

In [3]:
# Drop the row identifier so that only the feature columns are transformed.
df_copy = df.drop(columns=['id'])
feature_values = df_copy.values

# Fit a Yeo-Johnson power transform (with standardization) on the features.
scaler = preprocessing.PowerTransformer(method='yeo-johnson', standardize=True)
scaler.fit(feature_values)

# Re-wrap the transformed array so it keeps the original row index and column names.
scaled_df = pd.DataFrame(
    scaler.transform(feature_values),
    columns=df_copy.columns,
    index=df_copy.index,
)
# p_vals = []
# for col in df_copy.columns:
#     pre_transform = stats.shapiro(df[col]).pvalue
#     post_transform = stats.shapiro(scaled_df[col]).pvalue
#     p_vals.append([col, pre_transform, post_transform])

# p_val_df = pd.DataFrame(p_vals, columns = ['Variable', 'Pre-Transform', 'Post-Transform'])
# print(p_val_df.sort_values(by=['Pre-Transform']))


# melted_df_pre = df_copy.melt(value_vars = df_copy.columns,
#                     value_name = 'Value', var_name = 'Variable')
# melted_df_post = scaled_df.melt(value_vars = df_copy.columns,
#                     value_name = 'Value', var_name = 'Variable')
# melted_df_pre['Transform'] = 'No Transform'
# melted_df_post['Transform'] = 'yeo-johnson'
# melted_df = pd.concat([melted_df_pre, melted_df_post], ignore_index = True)


In [4]:
# #melted_df.head(n = 10)
# sns.set(rc = {'figure.figsize':(15,12)})
# v = sns.FacetGrid(melted_df, col='Variable', hue = 'Transform', height=2.5, col_wrap=5, sharex = False)
# v.map(sns.histplot, 'Value', alpha = 0.5).add_legend()
# v.tight_layout

## Filtering for Best Variables

Basing the approach off the notebook here: https://www.kaggle.com/code/ricopue/tps-jul22-clusters-and-lgb

we're going to take a subset of our factors to use for model training.




In [5]:
# sns.set(rc = {'figure.figsize':(15,12)})
# sns.set_style('white')
# heatmap = sns.heatmap(scaled_df.corr(), annot=False, cmap='BrBG',)
# heatmap.set_title('Variable Correlation', fontdict={'fontsize':26}, pad=16);
# Preview the first five rows of the transformed features.
scaled_df.head(n=5)

Unnamed: 0,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,...,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28
0,-0.38923,-0.917652,0.647948,0.590717,-0.824836,0.734685,2.25947,-0.977987,1.383372,1.039938,...,-0.472922,-0.753925,-0.76311,-0.707876,0.911477,-0.678852,0.768543,0.960344,1.042536,0.694234
1,-0.688368,-0.458647,0.653182,0.995359,-1.64403,0.864898,-0.085604,-0.977987,-0.875405,-0.179925,...,-0.423594,-0.088164,-1.777545,-0.535582,0.453824,1.031505,-0.117686,-0.550783,0.367242,-1.636652
2,0.805709,0.319397,-1.166935,-0.622421,0.108371,0.785018,1.990489,0.021718,1.017648,-0.394246,...,-0.408425,-1.598612,1.194423,2.203065,0.086974,-1.519163,-0.568662,0.9789,-0.926277,-2.296373
3,-0.500469,0.223997,0.262677,0.234061,0.417047,-1.218768,0.144455,0.286548,-1.213526,0.917564,...,0.620278,1.283827,0.532884,0.731623,-1.218086,0.826492,-1.173592,-0.395085,-0.100021,0.326682
4,-0.670427,-1.044482,-0.270854,-1.833338,-0.285955,-1.849243,0.787627,0.7569,0.187543,-0.394246,...,-1.614933,-0.432406,0.321899,0.228337,-1.482684,0.847999,-0.613935,1.164389,-0.374203,-1.160058


In [6]:
# Keep only features f_00..f_06 and f_22..f_28 for model training.
best_data = [f'f_{i:02d}' for i in [*range(0, 7), *range(22, 29)]]
scaled_df = scaled_df.loc[:, best_data]

## Parameter Tuning

The first parameter we need to identify is the number of clusters to predict. We'll do this by taking a subset of the data (to reduce training time) and training a model for each cluster count from 2 to 14.

To compare model performance we'll use the Silhouette score explained here: https://en.wikipedia.org/wiki/Silhouette_(clustering). 

Given that increasing the number of groups will naturally lead to a lower silhouette score we'll use the elbow method explained here: https://en.wikipedia.org/wiki/Elbow_method_(clustering) to look at when the rate of change reduces as we increase the number of clusters.

The best leaderboard score was a result of using n = 7 clusters

In [7]:
# sample_df = scaled_df.sample(n = 5000)
# clusters = range(2,15)
# scores = []

# for i in clusters:
#     gm = GaussianMixture(n_components=i, n_init=5, init_params='kmeans',
#                         verbose = 0)
#     gm_prediction = gm.fit_predict(sample_df)
#     # Calculate Silhoutte Score and append to a list
#     score = metrics.silhouette_score(sample_df, gm_prediction, metric='euclidean')
#     scores.append(score)
#     print('Number of Clusters: ', i, ' Score: ', score)
  

# plt.plot(clusters, scores, 'bo-')
# plt.xlabel('Number of Clusters')
# plt.ylabel('Silhouette Score')
# plt.title('Silhouette Score by Cluster Count')
# plt.show()

## Training Full Models
Using the identified number of clusters we'll train 3 models: a BayesianGaussianMixture, a GaussianMixture, and a MiniBatchKMeans.

In [8]:
!pip install scikit-learn-extra


Collecting scikit-learn-extra
  Downloading scikit_learn_extra-0.2.0-cp37-cp37m-manylinux2010_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
Installing collected packages: scikit-learn-extra
Successfully installed scikit-learn-extra-0.2.0
[0m

In [9]:
from sklearn_extra.cluster import KMedoids


In [None]:


# Earlier model experiments, left disabled:
# model_1 = BayesianGaussianMixture(n_components=6, n_init=5, verbose = 0.5,tol = 0.0001, max_iter = 200).fit(scaled_df)
# model_2 = GaussianMixture(n_components=6, n_init=5, verbose = 0.5,tol = 0.0001, max_iter = 200).fit(scaled_df)
# model_3 = MiniBatchKMeans(n_clusters=6).fit(scaled_df)
# score_1 = metrics.silhouette_score(scaled_df, model_1.predict(scaled_df), metric='euclidean')
# print(score_1)

# Fit a 6-cluster KMedoids model (PAM method, fixed seed) on the selected features.
# NOTE(review): KMedoids presumably works from pairwise distances over all rows, so
# memory may grow quadratically with the row count -- confirm it fits for this dataset.
model_4 = KMedoids(n_clusters=6, max_iter=2, method='pam', random_state=42)
model_4.fit(scaled_df)

# Silhouette score (euclidean) of the KMedoids cluster assignments.
kmedoids_labels = model_4.predict(scaled_df)
score_4 = metrics.silhouette_score(scaled_df, kmedoids_labels, metric='euclidean')
print(score_4)

## Ensembling Results

In [None]:
# Storing models
# The BayesianGaussianMixture fit is commented out in the training cell, so `model_1`
# and `score_1` do not exist on a fresh kernel. Fit and score it here so this cell
# runs under Restart & Run All.
model_1 = BayesianGaussianMixture(n_components=6, n_init=5, tol=0.0001,
                                  max_iter=200, random_state=42).fit(scaled_df)

# Predict once per model and reuse the labels for both scoring and voting.
bgm_labels = model_1.predict(scaled_df)
kmedoids_labels = model_4.predict(scaled_df)
score_1 = metrics.silhouette_score(scaled_df, bgm_labels, metric='euclidean')

predictions = pd.DataFrame({'BGM': bgm_labels, 'KMedoids': kmedoids_labels})

# Vote weight per model. The silhouette score lies in [-1, 1] and higher is better, so
# shift it to [0, 2]: weights stay non-negative and the better-separated model gets the
# larger vote. (The previous inverse-score weighting favoured the worse model, failed
# for scores <= 0, and referenced an undefined `score_3` instead of the KMedoids score.)
model_weights = {'BGM': score_1 + 1.0, 'KMedoids': score_4 + 1.0}

# Vectorized weighted vote: vote_totals[row, label] accumulates the weight of every
# model that assigned `label` to `row`. Each statement in the loop touches one column
# of labels, so every (row, label) pair is unique within a single update.
label_matrix = predictions[list(model_weights)].to_numpy()
row_idx = np.arange(len(label_matrix))
vote_totals = np.zeros((len(label_matrix), label_matrix.max() + 1))
for col, weight in enumerate(model_weights.values()):
    vote_totals[row_idx, label_matrix[:, col]] += weight

# argmax picks the label with the largest total weight; ties resolve to the lowest
# label id instead of raising (statistics.mode raises on ties before Python 3.8).
# NOTE(review): cluster ids from different models are arbitrary and are not aligned
# here, and with only two voters the result always equals the heavier model's label.
# Consider aligning labels and adding a third model before relying on this ensemble.
predictions['Weighted_Pred'] = vote_totals.argmax(axis=1)
predictions.head(n=10)
    
    


## Visualizing Results

Using principal component analysis we can reduce our dataset to 2 dimensions and visualize the clustering below. Unfortunately we can't capture all the variance of the dataset in two dimensions, as shown by plotting the explained variance of each principal component, so our 2D clustering visualization isn't perfect.

In [None]:
prediction = predictions['Weighted_Pred']
# Silhouette score of the ensembled labels; kept as a notebook variable, the plots
# below do not use it.
score = metrics.silhouette_score(scaled_df, prediction, metric='euclidean')

# Full PCA feeds the per-component variance plot; the 2-component PCA feeds the scatter.
# Only the fitted estimators are needed here, so call fit() rather than discarding the
# output of fit_transform().
pca = PCA()
pca.fit(scaled_df)
pca2 = PCA(n_components=2)
pca2.fit(scaled_df)

# Visualizing the share of variance explained by each principal component.
# explained_variance_ratio_ is plotted so the values match the 'Variance ratio' axis
# label (the raw explained_variance_ values are not ratios).
variance = pca.explained_variance_ratio_
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(range(len(variance)), variance, alpha=0.5, align='center', label='individual variance')
ax.legend()
ax.set_ylabel('Variance ratio')
ax.set_xlabel('Principal components')
plt.show()

# Visualizing clustering
sns.set(rc = {'figure.figsize':(8,6)})
scaled_pca = pd.DataFrame(pca2.transform(scaled_df), columns = ['PCA1', 'PCA2'])
# Assign labels by position so a non-default index on `prediction` cannot misalign rows.
scaled_pca['Group Prediction'] = prediction.to_numpy()
# Trailing semicolon suppresses the repr of the value returned by .set().
sns.scatterplot(data = scaled_pca, x = 'PCA1', y = 'PCA2', hue = 'Group Prediction').set(title = 'Clustering Group Visualization with PCA');


## Making Prediction and Writing to File

In [None]:
# Pair each row id with its ensembled cluster label and write the submission file.
labels = df['id']
submission = pd.DataFrame(
    np.column_stack((labels, prediction)),
    columns=['Id', 'Predicted'],
)
submission.to_csv('ensemble.csv', index=False)
