In [312]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
import sklearn.feature_selection

from sklearn.preprocessing import scale
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler


import sklearn.metrics as sm
from sklearn.metrics import accuracy_score
from sklearn import datasets
from sklearn.metrics import confusion_matrix, classification_report

npf_train = pd.read_csv("data/npf_train.csv")
npf_test = pd.read_csv("data/npf_test_hidden.csv")

The function below is just for preprosessing the data. It drops date,id,partlybad columns and std's

In [313]:
def preprosessing(npf,scaler):
    '''Preprosessing function for npf_*.csv files'''

    # Dropping features 'partlybad','id' and 'date' because we won't need them. Feature 'partlybad' was only False 
    X = npf.drop(['date','id','partlybad','class4'],axis=1)
    

    #Dropping std's
    X_means = X.drop([c for c in npf.columns if 'std' in c],axis=1)

    X_means['class4'] = npf['class4']
    return X_means

In [314]:
def best_feature_columns(X_train, y_train, n):
    '''Return n best feature columns'''
    select = sklearn.feature_selection.SelectKBest(k=n)
    selected_features = select.fit(X_train, y_train)
    indices_selected = selected_features.get_support(indices=True)
    colnames_selected = [X_train.columns[i] for i in indices_selected]

    return colnames_selected

Loading the data and doing some preprosessing.

In [336]:
npf = preprosessing(npf_train,scale)
X_npf = npf.drop('class4',axis=1)
#y = npf['class4']

# Changing the categorical values to integers
y = npf['class4'].astype('category').cat.codes

Scaling the data for zero mean and unit variance

In [337]:
    # Removing .means from all column names
    cols = [col[:-5] for col in X_npf.columns]

    # Normalizing for zero mean and unit variance
    X_np = scale(X_npf)

    X = pd.DataFrame(X_np, columns=cols)

Selecting 20 best features

In [338]:
features = best_feature_columns(X,y,20)
#features = X.columns

Now doing the clustering. Fist using K-Means clustering. We set 4 clusters as parameter because we know that we have 4 classes to distinguish

In [352]:
kmeans_cluster = KMeans(4,n_init=50,random_state=3)
kmeans_cluster.fit(X)
#kmeans_cluster = KMeans(n_clusters=4, n_init=50,random_state=42).fit(X[features])


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=50, n_jobs=None, precompute_distances='auto',
       random_state=3, tol=0.0001, verbose=0)

In [353]:
acc = accuracy_score(y, kmeans_cluster.labels_)
acc

0.16279069767441862

Hmm quite bad accuracy. The labels are probably permuted. Function find_permutation find the right permutation to assing the correct labels

In [348]:
import scipy
def find_permutation(n_clusters, real_labels, labels):
    permutation = []
    for i in range(n_clusters):
        idx = labels == i
        new_label=scipy.stats.mode(real_labels[idx])[0][0]  # Choose the most common label among data points in the cluster
        permutation.append(new_label)
    return permutation

So here is the correct permutation for the cluster labels

In [354]:
permutation = find_permutation(4,y,kmeans_cluster.labels_)
print(permutation)

[0, 3, 3, 3]


Frankly it distinguishes only labels 0 and 3  
0 = II  
1 = Ia  
2 = Ib  
3 = nonevent

Checking how the clustering events are distributed

In [350]:
from collections import Counter
print(Counter(y).keys())
print(Counter(y).values())


dict_keys([3, 2, 0, 1])
dict_values([215, 83, 106, 26])


Okay so event and nonevents are evenly distribute 215 / 215 = 50/50  
0 = II = 24.7 %  
1 = Ia = 06.0 %  
2 = Ib = 19.3 %  
3 = nonevent = 50.0 %

But anyways now we have the new accuracy score

In [351]:
new_labels = [ permutation[label] for label in kmeans_cluster.labels_]   # permute the labels
print("Accuracy score is", accuracy_score(y, new_labels))


Accuracy score is 0.6069767441860465


Okay so this is already a lot better. Let's do a confusion matrix to double check the results


In [345]:
df = pd.DataFrame({'Labels': y, 'Clusters':kmeans_cluster.labels_})
ct = pd.crosstab(df['Labels'],df['Clusters'])
ct

Clusters,0,1,2,3
Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,35,13,58,0
1,3,7,16,0
2,22,6,55,0
3,89,111,12,3


I'm not getting much out of this confusion matrix

In [230]:
from sklearn.metrics import classification_report
print(classification_report(y,new_labels))

              precision    recall  f1-score   support

           0       0.41      0.55      0.47       106
           1       0.00      0.00      0.00        26
           2       0.00      0.00      0.00        83
           3       0.70      0.94      0.81       215

    accuracy                           0.61       430
   macro avg       0.28      0.37      0.32       430
weighted avg       0.45      0.61      0.52       430



Even the classification report doesn't promise good results. Maybe trying different scaling or not scaling at all would help. Let's try MinMax, Standard and no scaling at all

In [355]:
X_minmax = pd.DataFrame(MinMaxScaler().fit_transform(X_npf), columns=X_npf.columns)
X_standard = pd.DataFrame(StandardScaler().fit_transform(X_npf), columns=X_npf.columns)
X_noscaling = X_npf

Feature selection for different scalers

In [356]:
minmax_features = best_feature_columns(X_minmax,y,20)
standard_features = best_feature_columns(X_standard,y,20)
noscaling_features = best_feature_columns(X_noscaling,y,20)

In [357]:
# Minmax scaling result

cluster = KMeans(4,n_init=50,random_state=42)
minmax_cluster = cluster.fit(X_minmax[minmax_features])
permutation = find_permutation(4,y,minmax_cluster.labels_)
new_labels = [ permutation[label] for label in minmax_cluster.labels_] 
print("Accuracy score is for MinMax scaling", accuracy_score(y, new_labels))
permutation

Accuracy score is for MinMax scaling 0.5651162790697675


[0, 0, 3, 3]

In [358]:
# Standard scaling result

cluster = KMeans(4,n_init=50,random_state=42)
stamdard_cluster = cluster.fit(X_standard[standard_features])
permutation = find_permutation(4,y,stamdard_cluster.labels_)
new_labels = [ permutation[label] for label in stamdard_cluster.labels_] 
print("Accuracy score is for Standard scaling", accuracy_score(y, new_labels))
permutation

Accuracy score is for Standard scaling 0.586046511627907


[2, 3, 0, 3]

In [359]:
# No scaling result

cluster = KMeans(4,n_init=50,random_state=42)
nofeatures_cluster = cluster.fit(X_noscaling[noscaling_features])
permutation = find_permutation(4,y,nofeatures_cluster.labels_)
new_labels = [ permutation[label] for label in nofeatures_cluster.labels_] 
print("Accuracy score is for no scaling", accuracy_score(y, new_labels))
permutation

Accuracy score is for no scaling 0.5790697674418605


[2, 3, 0, 3]

Well this is kind of odd? Class Ia is totally missing from the results? Let's try hieragical clustering

In [306]:
from sklearn.cluster import AgglomerativeClustering

# Zero mean and unit variance scaling
cluster = AgglomerativeClustering(n_clusters=4, affinity='euclidean',linkage='ward')
data = X[features].values
cluster.fit(data)
permutation = find_permutation(4,y,cluster.labels_)
new_labels = [ permutation[label] for label in cluster.labels_] 
print("Accuracy score is for no scaling", accuracy_score(y, new_labels))
permutation

Accuracy score is for no scaling 0.5720930232558139


[3, 2, 3, 0]

Trying the different scalers here too (MinMax, Standard and no scaling)

In [307]:
# Minmax scaling
cluster = AgglomerativeClustering(n_clusters=4, affinity='euclidean',linkage='ward')
data = X_minmax[minmax_features].values
cluster.fit(data)
permutation = find_permutation(4,y,cluster.labels_)
new_labels = [ permutation[label] for label in cluster.labels_] 
print("Accuracy score is for minmax scaling", accuracy_score(y, new_labels))
permutation

Accuracy score is for no scaling 0.586046511627907


[3, 0, 0, 2]

In [308]:
# Standard scaling
cluster = AgglomerativeClustering(n_clusters=4, affinity='euclidean',linkage='ward')
data = X_standard[standard_features].values
cluster.fit(data)
permutation = find_permutation(4,y,cluster.labels_)
new_labels = [ permutation[label] for label in cluster.labels_] 
print("Accuracy score is for standard scaling", accuracy_score(y, new_labels))
permutation

Accuracy score is for no scaling 0.5720930232558139


[3, 2, 3, 0]

In [309]:
# No scaling
cluster = AgglomerativeClustering(n_clusters=4, affinity='euclidean',linkage='ward')
data = X_noscaling[noscaling_features].values
cluster.fit(data)
permutation = find_permutation(4,y,cluster.labels_)
new_labels = [ permutation[label] for label in cluster.labels_] 
print("Accuracy score is for no scaling", accuracy_score(y, new_labels))
permutation

Accuracy score is for no scaling 0.5767441860465117


[3, 2, 3, 0]

The accurasy seems stuck in around 0.57. Maybe it is just that it can't be predicted so well. I'd go with the k-means clustering. 

Last, let's try different amount of features with k-means clustering. I will use StandardScaler as it got the best accuracy with most classes present. The following cell was to help trying to find the optimal amount of features with sklearns selectKBest() function. The cell returns the parameters from best accuracy score and prints the features that the model used

In [404]:
import random
random.seed(42)

best_score = 0
best_permu = []
n_features = 0
best_features = []
for i in range(1,len(X_noscaling.columns)):
    standard_features = best_feature_columns(X_standard,y,i)
    cluster = KMeans(4,n_init=50,random_state=42)
    standard_cluster = cluster.fit(X_standard[standard_features])
    permutation = find_permutation(4,y,standard_cluster.labels_)
    new_labels = [ permutation[label] for label in stamdard_cluster.labels_] 
    acc = accuracy_score(y, new_labels)

    if (acc > best_score):
        best_score = acc
        best_permu = permutation
        n_features = i
        best_features = standard_features

print("Best accuracy score is for {} features with acc of: {} the permutation being {}".format(n_features, best_score,best_permu))
print(best_features)

Best accuracy score is for 18 features with acc of: 0.586046511627907 the permutation being [2, 3, 0, 3]
['Glob.mean', 'NET.mean', 'PAR.mean', 'RGlob.mean', 'RHIRGA168.mean', 'RHIRGA336.mean', 'RHIRGA42.mean', 'RHIRGA504.mean', 'RHIRGA672.mean', 'RHIRGA84.mean', 'UV_A.mean', 'UV_B.mean']


0.586046511627907

I tried here to find a model which gets the best accuracy with 12 features as Lauri said that he used the same number of features. Didn't get any better result as is was still stuck to 0.58

In [405]:
n_features = 12
standard_features = best_feature_columns(X_standard,y,n_features)
cluster = KMeans(4,n_init=50,random_state=18)
standard_cluster = cluster.fit(X_standard[standard_features])
permutation = find_permutation(4,y,standard_cluster.labels_)
new_labels = [ permutation[label] for label in stamdard_cluster.labels_] 
acc = accuracy_score(y, new_labels)
print("ccuracy score is for {} features with acc of: {} the permutation being {}".format(n_features, acc,permutation))
standard_features

ccuracy score is for 12 features with acc of: 0.586046511627907 the permutation being [2, 3, 0, 3]


['Glob.mean',
 'NET.mean',
 'PAR.mean',
 'RGlob.mean',
 'RHIRGA168.mean',
 'RHIRGA336.mean',
 'RHIRGA42.mean',
 'RHIRGA504.mean',
 'RHIRGA672.mean',
 'RHIRGA84.mean',
 'UV_A.mean',
 'UV_B.mean']

# Now trying the supervised learning methods  

First logistic regression

Train and test sets are divided in 75/25 proportions automatically

In [425]:
# X being X_standard which is just standard scaled X values
y = npf['class4']

# Splitting the data

X_train, X_test, y_train, y_test = train_test_split(X_standard, y, random_state=42)

from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

print('Accuracy of Logistic regression classifier on training set: {:.2f}'
     .format(logreg.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
     .format(logreg.score(X_test, y_test)))

Accuracy of Logistic regression classifier on training set: 0.71
Accuracy of Logistic regression classifier on test set: 0.70


Trying the same with best 12 features. The features below were the best features which were given from the selectKBest() function

In [413]:
features = ['Glob.mean',
 'NET.mean',
 'PAR.mean',
 'RGlob.mean',
 'RHIRGA168.mean',
 'RHIRGA336.mean',
 'RHIRGA42.mean',
 'RHIRGA504.mean',
 'RHIRGA672.mean',
 'RHIRGA84.mean',
 'UV_A.mean',
 'UV_B.mean']

In [424]:


X_train, X_test, y_train, y_test = train_test_split(X_standard[features], y, random_state=42)

from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

print('Accuracy of Logistic regression classifier on training set: {:.2f}'
     .format(logreg.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
     .format(logreg.score(X_test, y_test)))

Accuracy of Logistic regression classifier on training set: 0.62
Accuracy of Logistic regression classifier on test set: 0.66


Oh, worse score. Maybe well just include all features. Let's try Decission Tree next

In [416]:
# Decission tree

from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(X_standard, y, random_state=42)

clf = DecisionTreeClassifier().fit(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

Accuracy of Decision Tree classifier on training set: 1.00
Accuracy of Decision Tree classifier on test set: 0.64


In [417]:
# K neighbour classifier
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print('Accuracy of K-NN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'
     .format(knn.score(X_test, y_test)))

Accuracy of K-NN classifier on training set: 0.75
Accuracy of K-NN classifier on test set: 0.67


In [418]:
# Linear Discriminant Analysis

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
print('Accuracy of LDA classifier on training set: {:.2f}'
     .format(lda.score(X_train, y_train)))
print('Accuracy of LDA classifier on test set: {:.2f}'
     .format(lda.score(X_test, y_test)))

Accuracy of LDA classifier on training set: 0.76
Accuracy of LDA classifier on test set: 0.64


In [419]:
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
print('Accuracy of GNB classifier on training set: {:.2f}'
     .format(gnb.score(X_train, y_train)))
print('Accuracy of GNB classifier on test set: {:.2f}'
     .format(gnb.score(X_test, y_test)))

Accuracy of GNB classifier on training set: 0.56
Accuracy of GNB classifier on test set: 0.55


In [426]:
# SVM

from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train, y_train)
print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(svm.score(X_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svm.score(X_test, y_test)))

Accuracy of SVM classifier on training set: 0.73
Accuracy of SVM classifier on test set: 0.70


# SVM and logistic regression gave the best accuracies

In [429]:
# This scores are for logreg

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
pred = logreg.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

[[17  0  1  5]
 [ 3  0  0  1]
 [13  3  5  4]
 [ 1  0  1 54]]
              precision    recall  f1-score   support

          II       0.50      0.74      0.60        23
          Ia       0.00      0.00      0.00         4
          Ib       0.71      0.20      0.31        25
    nonevent       0.84      0.96      0.90        56

    accuracy                           0.70       108
   macro avg       0.51      0.48      0.45       108
weighted avg       0.71      0.70      0.67       108



In [430]:
# This scores are for SVM

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
pred = svm.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

[[15  0  4  4]
 [ 2  0  0  2]
 [16  0  6  3]
 [ 1  0  0 55]]
              precision    recall  f1-score   support

          II       0.44      0.65      0.53        23
          Ia       0.00      0.00      0.00         4
          Ib       0.60      0.24      0.34        25
    nonevent       0.86      0.98      0.92        56

    accuracy                           0.70       108
   macro avg       0.48      0.47      0.45       108
weighted avg       0.68      0.70      0.67       108



# Thought process

Data preprocessing  
    - Dropped std's  
    - Used various different scaling types to see if there is a difference

In every machine learning method 4 clusters was selected as paremeter because we know that we have 4 different event to distinguish

## Unsupervised learning methods  

    - K-means  
        * Changed categorical labels to integers  
        * Scaled data to zero mean and unit variance
        * Selected 20 best feature with skleanr selectKBest()  
        * Score after permutation was 0.60, but only events of II and nonevent was distinguished  
        * Next I tried to do the same with different scaling methods  
            - MinMaxScaler, StandardScaler and no scaling at all  
        MinMax:     acc = 0.565  
        Standard:   acc = 0.586
        NoScaling:  acc = 0.576  

    - Hierachical clustering  
        * I performed the same test with AgglomerativeClustering to see if this unsupervised method performs better

        Scale:      acc = 0.572
        MinMax:     acc = 0.586
        Standard:   acc = 0.572
        NoScaling:  acc = 0.576

        * So no development here
        * Lastly I tried k-means with StandardScaler, as it had the best accuracy while most of the classes were present, and different amount of features.  

        There was no clear trend but with the binay classifier 12 features were the best to I tried it also and the result was not significantly better.  


## Supervised learning methods

Same preprosessing of the data except I didn't categorizise the labels and used only Standard scaler. Tested just a bunch of different supervised learning methods. Started with Logistic regression  

    - Logistic regression  
        * Data splitted in 75/25 ratio to train and test
        * Acc score = 0.70 on test set
        * Tried to reduce the amount of features to 12 best but it lowered the accuracy  
            - Acc score = 0.66 (12 features)
        * Decided to keep all the features along
    - Next just test different methods (all acc scores on test set)

        Decission there                 acc = 0.64
        K-NN                            acc = 0.67
        Linear discriminant analysis    acc = 0.64
        Gaussia NB                      acc = 0.55
        SVM                             acc = 0.70  

    - LogReg and SVM turned out to be the best from all supervised and unsupervised learning methods, but I chose LogReg to be the best. That is because it's weighted average and the recall values turned out to be a little better than SVM's



    
