In [200]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
import sklearn.feature_selection

from sklearn.preprocessing import scale
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler


import sklearn.metrics as sm
from sklearn.metrics import accuracy_score
from sklearn import datasets
from sklearn.metrics import confusion_matrix, classification_report

# Read the two CSV files from the data/ directory into DataFrames.
npf_train = pd.read_csv(filepath_or_buffer="data/npf_train.csv")
npf_test = pd.read_csv(filepath_or_buffer="data/npf_test_hidden.csv")

In [210]:
def preprosessing(npf, scaler=None):
    '''Select the non-'std' feature columns of an npf_*.csv frame.

    Drops the bookkeeping columns 'date', 'id' and 'partlybad', removes every
    remaining column whose name contains 'std', and returns what is left
    (column names unchanged, values unscaled) with 'class4' appended as the
    last column.

    Parameters
    ----------
    npf : pandas.DataFrame
        Frame holding the columns 'date', 'id', 'partlybad' and 'class4' next
        to the feature columns. It is not modified.
    scaler : callable, optional
        Accepted only for backward compatibility and never called. The
        previous implementation invoked it and threw the result away, so the
        returned values were never scaled. Scale the returned frame
        explicitly if scaled features are needed.

    Returns
    -------
    pandas.DataFrame
        The unscaled non-'std' feature columns followed by 'class4'.

    Raises
    ------
    KeyError
        If 'date', 'id', 'partlybad' or 'class4' is missing from ``npf``.
    '''
    # 'date' and 'id' are not features; per the original author's note
    # 'partlybad' only ever held False. 'class4' is re-attached below.
    features = npf.drop(['date', 'id', 'partlybad', 'class4'], axis=1)

    # Keep everything that is not a standard-deviation column.
    std_columns = [name for name in features.columns if 'std' in name]
    X_means = features.drop(std_columns, axis=1)

    # drop() returned a new frame, so adding the label leaves ``npf`` untouched.
    X_means['class4'] = npf['class4']
    return X_means

In [136]:
def best_feature_columns(X_train, y_train, n):
    '''Names of the n columns of X_train kept by SelectKBest.

    A SelectKBest selector with k=n (scoring function left at its default)
    is fitted on X_train / y_train; the positions it reports as selected are
    translated back into the corresponding column names of X_train.
    '''
    selector = sklearn.feature_selection.SelectKBest(k=n).fit(X_train, y_train)
    kept_positions = selector.get_support(indices=True)
    return [X_train.columns[position] for position in kept_positions]

Loading the data and doing some preprocessing.

In [218]:
# Reduce the training frame to its feature columns plus the 'class4' label
npf = preprosessing(npf_train, scale)

# Feature matrix: every column except the label
X_npf = npf.drop(columns='class4')

# Target: the categorical label encoded as integer category codes
y = npf['class4'].astype('category').cat.codes

Scaling the data to zero mean and unit variance.

In [222]:
    # NOTE(review): this whole cell is indented one level (it looks copied out of
    # a function body). It evidently ran in the notebook, but it would be an
    # IndentationError in a plain .py script.

    # Strip the '.mean' suffix from the column names. Names without that suffix
    # are kept unchanged instead of blindly losing their last five characters.
    mean_suffix = '.mean'
    cols = [col[:-len(mean_suffix)] if col.endswith(mean_suffix) else col
            for col in X_npf.columns]

    # Standardise every column to zero mean and unit variance
    X_np = scale(X_npf)

    # Back to a DataFrame, now carrying the shortened column names
    X = pd.DataFrame(X_np, columns=cols)

Selecting 20 best features

In [223]:
# Names of the 20 columns of the standardised frame chosen by best_feature_columns
features = best_feature_columns(X, y, n=20)

Now doing the clustering, first with K-Means. We set the number of clusters to 4 because we know that there are 4 classes to distinguish.

In [224]:
# Four clusters, 50 restarts, fixed seed.
# Note: the model is fitted on every column of X, not only on X[features].
kmeans_cluster = KMeans(n_clusters=4, n_init=50, random_state=42)
kmeans_cluster.fit(X)


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=50, n_jobs=None, precompute_distances='auto',
       random_state=42, tol=0.0001, verbose=0)

In [225]:
# Compare the raw cluster indices directly with the integer class codes
acc = accuracy_score(y_true=y, y_pred=kmeans_cluster.labels_)
acc

0.23255813953488372

Hmm, quite a bad accuracy. The cluster indices are arbitrary, so they are probably permuted relative to the class codes.

In [226]:
import scipy
def find_permutation(n_clusters, real_labels, labels):
    '''Map each cluster index to the most common true label inside that cluster.

    Despite the name, the result is a majority-vote mapping and not necessarily
    a permutation: several clusters may be assigned the same label, and a
    label that is never the majority of any cluster does not appear at all.

    The vote is computed with NumPy instead of ``scipy.stats.mode(...)[0][0]``.
    That expression depends on the array-shaped result of older SciPy releases
    and on ``scipy.stats`` having been imported elsewhere.

    Parameters
    ----------
    n_clusters : int
        Number of clusters; cluster indices 0 .. n_clusters - 1 are examined.
    real_labels : array-like
        True label of every sample (list, ndarray or pandas Series).
    labels : array-like
        Cluster index of every sample, aligned with ``real_labels``.

    Returns
    -------
    list
        ``result[i]`` is the most frequent true label among the samples of
        cluster ``i``; on ties the smallest label wins. A cluster without any
        sample gets -1.
    '''
    real_labels = np.asarray(real_labels)
    labels = np.asarray(labels)

    permutation = []
    for i in range(n_clusters):
        members = real_labels[labels == i]
        if members.size == 0:
            # No sample was assigned to this cluster, so there is nothing to vote on.
            permutation.append(-1)
            continue
        # np.unique returns the values sorted, and argmax picks the first maximum,
        # so ties resolve to the smallest label.
        values, counts = np.unique(members, return_counts=True)
        permutation.append(values[np.argmax(counts)])
    return permutation

In [227]:
# Majority class of each of the four K-Means clusters
permutation = find_permutation(n_clusters=4, real_labels=y, labels=kmeans_cluster.labels_)
print(permutation)

[3, 3, 0, 3]


In [228]:
from collections import Counter

# Distinct class codes in y and the number of samples carrying each of them
class_counts = Counter(y)
print(class_counts.keys())
print(class_counts.values())


dict_keys([3, 2, 0, 1])
dict_values([215, 83, 106, 26])


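Hmm, why does the mapping miss value 1? `find_permutation` assigns each cluster the most common true label among its points, so it is a majority vote rather than a true permutation: several clusters can receive the same label, and a small class (label 1 has only 26 samples) may never be the majority in any cluster.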

In [229]:
# Replace every cluster index by the class that `permutation` assigns to it
new_labels = [permutation[cluster_id] for cluster_id in kmeans_cluster.labels_]
print("Accuracy score is", accuracy_score(y, new_labels))


Accuracy score is 0.6069767441860465


In [230]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the relabelled clusters
report = classification_report(y, new_labels)
print(report)

              precision    recall  f1-score   support

           0       0.41      0.55      0.47       106
           1       0.00      0.00      0.00        26
           2       0.00      0.00      0.00        83
           3       0.70      0.94      0.81       215

    accuracy                           0.61       430
   macro avg       0.28      0.37      0.32       430
weighted avg       0.45      0.61      0.52       430



Maybe trying a different scaling, or not scaling at all, would help.

In [238]:
# Three variants of the same feature matrix, all keeping the original column names
minmax_values = MinMaxScaler().fit_transform(X_npf)
standard_values = StandardScaler().fit_transform(X_npf)

X_minmax = pd.DataFrame(minmax_values, columns=X_npf.columns)
X_standard = pd.DataFrame(standard_values, columns=X_npf.columns)
X_noscaling = X_npf  # same object as X_npf, not a copy

Feature selection

In [258]:
# 20 best columns for each variant, evaluated in the order min-max, standard, unscaled
minmax_features, standard_features, noscaling_features = (
    best_feature_columns(frame, y, 20)
    for frame in (X_minmax, X_standard, X_noscaling)
)

In [265]:
# K-Means (4 clusters) on the min-max scaled data, selected columns only

cluster = KMeans(n_clusters=4, n_init=50, random_state=42)
minmax_cluster = cluster.fit(X_minmax[minmax_features])
minmax_cluster_ids = minmax_cluster.labels_

# Majority class per cluster, then relabel every sample accordingly
permutation = find_permutation(4, y, minmax_cluster_ids)
new_labels = [permutation[cluster_id] for cluster_id in minmax_cluster_ids]
print("Accuracy score is for MinMax scaling", accuracy_score(y, new_labels))
permutation

Accuracy score is for MinMax scaling 0.5651162790697675


[0, 0, 3, 3]

In [266]:
# K-Means (4 clusters) on the StandardScaler output, selected columns only

cluster = KMeans(n_clusters=4, n_init=50, random_state=42)
stamdard_cluster = cluster.fit(X_standard[standard_features])
standard_cluster_ids = stamdard_cluster.labels_

# Majority class per cluster, then relabel every sample accordingly
permutation = find_permutation(4, y, standard_cluster_ids)
new_labels = [permutation[cluster_id] for cluster_id in standard_cluster_ids]
print("Accuracy score is for Standard scaling", accuracy_score(y, new_labels))
permutation

Accuracy score is for Standard scaling 0.586046511627907


[2, 3, 0, 3]

In [267]:
# K-Means (4 clusters) on the unscaled data, selected columns only

cluster = KMeans(n_clusters=4, n_init=50, random_state=42)
nofeatures_cluster = cluster.fit(X_noscaling[noscaling_features])
noscaling_cluster_ids = nofeatures_cluster.labels_

# Majority class per cluster, then relabel every sample accordingly
permutation = find_permutation(4, y, noscaling_cluster_ids)
new_labels = [permutation[cluster_id] for cluster_id in noscaling_cluster_ids]
print("Accuracy score is for no scaling", accuracy_score(y, new_labels))
permutation

Accuracy score is for no scaling 0.5790697674418605


[2, 3, 0, 3]

Well, this is kind of odd: class Ia is totally missing from the results. Let's try hierarchical clustering.

In [306]:
from sklearn.cluster import AgglomerativeClustering

# Ward-linkage agglomerative clustering on the zero-mean / unit-variance frame X,
# restricted to the selected feature columns.
# The distance keyword is left at its default: Ward linkage only works with
# Euclidean distance, and newer scikit-learn renamed `affinity` to `metric`,
# so omitting it keeps the call valid across versions.
cluster = AgglomerativeClustering(n_clusters=4, linkage='ward')
data = X[features].values
cluster.fit(data)

# Majority class per cluster, then relabel every sample accordingly
permutation = find_permutation(4, y, cluster.labels_)
new_labels = [permutation[label] for label in cluster.labels_]
# The message used to say "no scaling", copied from another cell, although X is scaled.
print("Accuracy score is for zero mean and unit variance scaling", accuracy_score(y, new_labels))
permutation

Accuracy score is for no scaling 0.5720930232558139


[3, 2, 3, 0]

In [307]:
# Ward-linkage agglomerative clustering on the min-max scaled data, selected columns only.
# The distance keyword is left at its default: Ward linkage only works with
# Euclidean distance, and newer scikit-learn renamed `affinity` to `metric`.
cluster = AgglomerativeClustering(n_clusters=4, linkage='ward')
data = X_minmax[minmax_features].values
cluster.fit(data)

# Majority class per cluster, then relabel every sample accordingly
permutation = find_permutation(4, y, cluster.labels_)
new_labels = [permutation[label] for label in cluster.labels_]
# The message used to say "no scaling", copied from another cell, although the data is min-max scaled.
print("Accuracy score is for MinMax scaling", accuracy_score(y, new_labels))
permutation

Accuracy score is for no scaling 0.586046511627907


[3, 0, 0, 2]

In [308]:
# Ward-linkage agglomerative clustering on the StandardScaler output, selected columns only.
# The distance keyword is left at its default: Ward linkage only works with
# Euclidean distance, and newer scikit-learn renamed `affinity` to `metric`.
cluster = AgglomerativeClustering(n_clusters=4, linkage='ward')
data = X_standard[standard_features].values
cluster.fit(data)

# Majority class per cluster, then relabel every sample accordingly
permutation = find_permutation(4, y, cluster.labels_)
new_labels = [permutation[label] for label in cluster.labels_]
# The message used to say "no scaling", copied from another cell, although the data is standardised.
print("Accuracy score is for Standard scaling", accuracy_score(y, new_labels))
permutation

Accuracy score is for no scaling 0.5720930232558139


[3, 2, 3, 0]

In [309]:
# Ward-linkage agglomerative clustering on the unscaled data, selected columns only.
# The distance keyword is left at its default: Ward linkage only works with
# Euclidean distance, and newer scikit-learn renamed `affinity` to `metric`.
cluster = AgglomerativeClustering(n_clusters=4, linkage='ward')
data = X_noscaling[noscaling_features].values
cluster.fit(data)

# Majority class per cluster, then relabel every sample accordingly
permutation = find_permutation(4, y, cluster.labels_)
new_labels = [permutation[label] for label in cluster.labels_]
print("Accuracy score is for no scaling", accuracy_score(y, new_labels))
permutation

Accuracy score is for no scaling 0.5767441860465117


[3, 2, 3, 0]

Maybe the classes simply cannot be separated well by clustering on these features. I'd go with the K-Means clustering.