# Code for testing using KMeans clusters as a new feature (Feature generation)


## Auth

In [1]:
from dotenv import load_dotenv

load_dotenv()

from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()

## Download Data

In [2]:
# !kaggle competitions download -c playground-series-s3e22
# !unzip -o playground-series-s3e22.zip
# !kaggle datasets download yasserh/horse-survival-dataset
# !unzip -o horse-survival-dataset.zip
# !rm -rf playground-series-s3e22.zip horse-survival-dataset.zip

## Globals

In [3]:
_TRAIN_FILE = "train.csv"
_TEST_FILE = "test.csv"
_ORIGINAL_FILE = "horse.csv"
_SAMPLE_SUBMISSION_FILE = "sample_submission.csv"

_SEED = 42

## Clusterers

In [4]:
class BaseClusterer:
  X_columns = None
  n_clusters = None
  
  def __init__(self):
    pass

  def fit(self, X_train):
    raise NotImplementedError()

  def get_cluster_numbers(self, X):
    raise NotImplementedError()

In [5]:
import hdbscan

class HDBSCANClusterer(BaseClusterer):
  def __init__(self):
    self.hdbscan = None
    self.X_columns = None

  def fit(self, X_train):
    # Initialize X_columns
    self.X_columns = X_train.columns
    
    # Initialize the KNN model
    self.hdbscan = hdbscan.HDBSCAN(
      cluster_selection_method='leaf', 
      min_cluster_size=10,
      prediction_data=True
    )

    # Train the KNN model
    self.hdbscan.fit(X_train)

    # Update number of clusters
    self.n_clusters = self.hdbscan.labels_.max() + 1

  def get_cluster_numbers(self, X):
    if not self.hdbscan:
      raise Exception("HDBSCANClusterer not initialised!")
    
    cluster_numbers = hdbscan.approximate_predict(self.hdbscan, X)[0]
    return cluster_numbers

In [6]:
from sklearn.cluster import KMeans

class KMeansClusterer(BaseClusterer):
  def __init__(self, n_clusters=8):
    self.kmeans = None
    self.X_columns = None
    self.n_clusters = n_clusters

  def fit(self, X_train):
    # Initialize X_columns
    self.X_columns = X_train.columns
    
    # Initialize the KNN model
    self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=_SEED)

    # Train the KNN model
    self.kmeans.fit(X_train)

  def get_cluster_numbers(self, X):
    if not self.kmeans:
      raise Exception("KMeansClusterer not initialised!")
    
    cluster_numbers = self.kmeans.predict(X)
    return cluster_numbers

## Data Preprocessing

In [7]:
import pandas as pd
from imblearn.over_sampling import SMOTE


def preprocess_data(df, clusterer:BaseClusterer=None, train=True):
    # Separate features and target
    if train:
        cols_to_drop = ["outcome", "id"]
    else:
        cols_to_drop = ["id"]

    # Simple handling of NA values: drop rows with missing values
    if train:
        df = df.dropna()

    X = df.drop(columns=cols_to_drop)

    if "outcome" in df.columns:
        y = df["outcome"]
    else:
        y = None
    
    # One hot encoding
    X = pd.get_dummies(X)

    if not train:
        # Reindex test columns to include all categorical features encoded during training
        X = X.reindex(columns=clusterer.X_columns, fill_value=0)

    # Only Use Important Features
    selected_features = [
        'rectal_temp', 'pulse', 'respiratory_rate', 'nasogastric_reflux_ph',
        'packed_cell_volume', 'total_protein', 'abdomo_protein', 'lesion_1',
        'surgery_no', 'surgery_yes', 'temp_of_extremities_cool',
        'peripheral_pulse_reduced', 'capillary_refill_time_more_3_sec',
        'pain_depressed', 'pain_mild_pain', 'pain_severe_pain', 'peristalsis_absent',
        'abdominal_distention_moderate', 'nasogastric_reflux_more_1_liter',
        'rectal_exam_feces_absent', 'abdomen_distend_large',
        'abdomo_appearance_serosanguious', 'surgical_lesion_no',
        'surgical_lesion_yes', 'cp_data_no', 'mucous_membrane_normal_pink',
        'abdomo_appearance_cloudy', 'capillary_refill_time_less_3_sec',
        'peripheral_pulse_normal', 'nasogastric_tube_slight',
        'mucous_membrane_pale_pink', 'pain_extreme_pain',
        'mucous_membrane_pale_cyanotic', 'abdomen_distend_small', 'cp_data_yes',
        'abdominal_distention_slight', 'temp_of_extremities_normal',
        'mucous_membrane_bright_red', 'abdominal_distention_severe',
        'abdomo_appearance_clear', 'rectal_exam_feces_decreased',
        'peristalsis_hypomotile', 'age_young', 'nasogastric_reflux_less_1_liter',
        'rectal_exam_feces_normal', 'temp_of_extremities_cold', 'abdomen_firm',
        'pain_alert', 'nasogastric_tube_significant',
        'mucous_membrane_dark_cyanotic', 'peristalsis_normal', 'abdomen_normal',
        'mucous_membrane_bright_pink', 'age_adult', 'peripheral_pulse_absent',
        'rectal_exam_feces_increased'
    ]
    X = X[selected_features]

    # SMOTE Oversampling of minority classes (During training stage)
    if train:
        smote = SMOTE(random_state=_SEED)
        X, y = smote.fit_resample(X, y)

    # Generate Cluster Numbers
    if clusterer:
        if train:
            clusterer.fit(X)
        cluster_numbers = clusterer.get_cluster_numbers(X)
        X["cluster_number"] = cluster_numbers

    return X, y



## Experiment

In [8]:
import pandas as pd


train_df = pd.read_csv(_TRAIN_FILE)
test_df = pd.read_csv(_TEST_FILE)
original_df = pd.read_csv(_ORIGINAL_FILE)

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score


def run(df: pd.DataFrame, experiment_name: str, clusterer:BaseClusterer=None):
    print(f"========== {experiment_name} ==========")

    X, y = preprocess_data(df, clusterer)

    forest = RandomForestClassifier(random_state=_SEED)

    k_folds = KFold(n_splits=20)

    scores = cross_val_score(forest, X, y, cv=k_folds, scoring="f1_micro")
    f1_score_micro_avg = scores.mean()

    print("F1 Score (Micro-Averaged):", f1_score_micro_avg)

    return f1_score_micro_avg

In [10]:
# To store experiment results
result_f1_scores = []
result_clusterers = []
result_n_clusters = []

In [11]:
#############################################
############### No Clustering ###############
#############################################

base_f1_score_micro_avg = run(pd.concat((train_df, original_df), axis=0), "Without KMeans Clusters (0 clusters)")

# Save results to array
result_f1_scores.append(base_f1_score_micro_avg)
result_clusterers.append("-")
result_n_clusters.append(0)

F1 Score (Micro-Averaged): 0.8090895341802783


In [12]:
#################################################
############### HDBSCAN Custering ###############
#################################################

hdbscan_clusterer = HDBSCANClusterer()

f1_score_micro_avg = run(
  pd.concat((train_df, original_df), axis=0), 
  f"With HDBSCAN Clusters", 
  clusterer=hdbscan_clusterer
)

# Save results to array
result_f1_scores.append(f1_score_micro_avg)
result_clusterers.append("HDBSCAN")
result_n_clusters.append(hdbscan_clusterer.n_clusters)

F1 Score (Micro-Averaged): 0.8177858439201451


In [13]:
#################################################
############### KMeans Clustering ###############
#################################################

N_CLUSTERS = [
  3, 6, 8, 12, 16, 24
]

for n_clusters in N_CLUSTERS:
  f1_score_micro_avg = run(
    pd.concat((train_df, original_df), axis=0), 
    f"With KMeans Clusters ({n_clusters} clusters)", 
    clusterer=KMeansClusterer(n_clusters=n_clusters)
  )

  # Save results to array
  result_f1_scores.append(f1_score_micro_avg)
  result_clusterers.append("KMeans")
  result_n_clusters.append(n_clusters)

F1 Score (Micro-Averaged): 0.8142468239564428
F1 Score (Micro-Averaged): 0.8073805202661827
F1 Score (Micro-Averaged): 0.813475499092559
F1 Score (Micro-Averaged): 0.8091349062310951
F1 Score (Micro-Averaged): 0.8091802782819117
F1 Score (Micro-Averaged): 0.8081820931639443


In [14]:
pd.DataFrame({
  "clusterer": result_clusterers,
  "n_clusters": result_n_clusters,
  "f1_score_micro_avg": result_f1_scores
}).sort_values(by="f1_score_micro_avg", ascending=False).reset_index(drop=True)

Unnamed: 0,clusterer,n_clusters,f1_score_micro_avg
0,HDBSCAN,21,0.817786
1,KMeans,3,0.814247
2,KMeans,8,0.813475
3,KMeans,16,0.80918
4,KMeans,12,0.809135
5,-,0,0.80909
6,KMeans,24,0.808182
7,KMeans,6,0.807381


# Submit

In [15]:
clusterer = HDBSCANClusterer()

X, y = preprocess_data(pd.concat((train_df, original_df), axis=0), clusterer=clusterer)
forest = RandomForestClassifier(random_state=_SEED)
forest.fit(X, y)

X_submit, _ = preprocess_data(test_df, clusterer=clusterer, train=False)
X_submit = X_submit.reindex(columns=X.columns, fill_value=0)

y_pred_submit = forest.predict(X_submit)

In [16]:
save_df = pd.DataFrame({"id": test_df["id"], "outcome": y_pred_submit})
save_df.to_csv("submission.csv", index=False, header=True)

In [17]:
# !kaggle competitions submit -c playground-series-s3e22 -f submission.csv -m ""