[Notebook of task](https://github.com/DistributedSystemsGroup/Algorithmic-Machine-Learning/blob/master/Challenges/Plankton/plankton_challenge.ipynb)


# Classification of Plankton based on features 


In [1]:
%matplotlib inline
# For configuration and Jupyter
import os
import sys
import re
import random
import matplotlib
import implicit
import warnings
# For data manipulation
import pandas as pd
import numpy as np
# For visualization
import matplotlib.pyplot as plt
import seaborn as sns
# For progress bars
from tqdm import tqdm

# Directory holding the plankton/flowcam input files. File names are appended
# to this string directly, so the trailing slash must be kept.
basepath = "/mnt/datasets/plankton/flowcam/"

In [2]:
# Read the three input tables from the dataset directory, in this order:
# native features, skimage features, label metadata.
nativeDF, skimagDF, labelsDF = [
    pd.read_csv(basepath + file_name)
    for file_name in ('features_native.csv.gz', 'features_skimage.csv.gz', 'meta.csv')
]

In [3]:
# Columns of each feature table that get their missing values filled.
# The lists were taken from these checks, kept here for reference:
#   nativeDF.isnull().sum().sort_values(ascending=False)[:10]
#   skimagDF.isnull().sum().sort_values(ascending=False)[:6]

native_nan_cols = [
    'perimareaexc',
    'feretareaexc',
    'cdexc',
    'skeleton_area',
    'nb1_area',
    'symetrieh_area',
    'symetriev_area',
    'convarea_area',
    'nb2_area',
    'nb3_area',
]

skimage_nan_cols = [
    'moments_normalized4',
    'weighted_moments_normalized0',
    'moments_normalized1',
    'moments_normalized0',
    'weighted_moments_normalized1',
    'weighted_moments_normalized4',
]

In [4]:
# Replace missing feature values with 0 in the columns listed for each table.
nativeDF[native_nan_cols] = nativeDF[native_nan_cols].fillna(0)
skimagDF[skimage_nan_cols] = skimagDF[skimage_nan_cols].fillna(0)

# Cast the object ids to 64-bit integers; errors='ignore' leaves the column
# untouched if the conversion fails.
labelsDF['objid'] = labelsDF['objid'].astype(np.int64, errors='ignore')

# Give unlabeled rows an explicit placeholder name at both taxonomy levels.
for level_col, placeholder in (('level1', 'No_level1_name'),
                               ('level2', 'No_level2_name')):
    labelsDF[level_col] = labelsDF[level_col].fillna(placeholder)

# Sanity check: each line prints False when the table has no nulls left.
for frame in (nativeDF, skimagDF, labelsDF):
    print(frame.isnull().sum().any())

False
False
False


In [10]:
# Features are the native table; the target is the level2 label.
# NOTE(review): rows are paired by position — presumably nativeDF and labelsDF
# share the same row order; verify before trusting the scores.
X, y = nativeDF, labelsDF['level2']

In [11]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for validation. Both settings are named so the
# split can be reproduced and tuned in one place.
seed = 1          # random_state used for the shuffle
test_size = 0.20  # fraction of rows held out

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

In [7]:
from sklearn.preprocessing import StandardScaler

# PCA works best on a normalized feature set, so standardize first.
# The scaler is fitted on the training split only and then applied to the
# held-out split. X_train / X_test are overwritten by the scaled arrays.
sc = StandardScaler()
X_train, X_test = sc.fit_transform(X_train), sc.transform(X_test)

In [8]:
from sklearn.decomposition import PCA

# nativeDF has 65 attributes. Explained variance noted for different sizes:
#   10 elements: 83.4%, 20 elements: 96.1%, 30 elements: 99.0%
# A fractional n_components keeps enough components to explain that share
# of the variance (here 99%).
pca = PCA(n_components=0.99)
X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)

In [12]:
from sklearn.metrics import accuracy_score, log_loss, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

# Candidate models, all fitted and scored on the same train/test split.
# The stochastic estimators reuse the split's seed so that re-running the
# cell reproduces the same scores.
classifiers = [
    KNeighborsClassifier(3),
    DecisionTreeClassifier(random_state=seed),
    RandomForestClassifier(random_state=seed),
    AdaBoostClassifier(random_state=seed),
    MLPClassifier(alpha=1, random_state=seed),
    #GradientBoostingClassifier(random_state=seed)
    ]

# Logging for visual comparison: one row per classifier, collected in a list
# and turned into a DataFrame once after the loop.
log_cols = ["Classifier", "Accuracy", "Log Loss"]
log_rows = []

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__

    print("="*30)
    print(name)

    print('****Results****')
    # Hard label predictions on the held-out split.
    test_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, test_predictions)
    print("Accuracy: {:.4%}".format(acc))

    # Macro F1 restricted to the classes the model actually predicted;
    # classes that were never predicted do not enter the average.
    f1 = f1_score(y_test, test_predictions, average='macro', labels=np.unique(test_predictions))
    print("F1 Score: {:.4%}".format(f1))

    # Class probabilities are needed for the log loss.
    test_probabilities = clf.predict_proba(X_test)
    ll = log_loss(y_test, test_probabilities)
    print("Log Loss: {}".format(ll))

    log_rows.append([name, acc*100, ll])

# Build the comparison table in one go; DataFrame.append was removed in
# pandas 2.0 and growing a frame row by row is quadratic anyway.
log = pd.DataFrame(log_rows, columns=log_cols)

print("="*30)

KNeighborsClassifier
****Results****
Accuracy: 52.6107%
F1 Score: 10.3826%
Log Loss: 10.301894631505432
DecisionTreeClassifier
****Results****
Accuracy: 74.6521%
F1 Score: 35.0273%
Log Loss: 8.754851781067053
RandomForestClassifier
****Results****
Accuracy: 81.5340%
F1 Score: 45.3039%
Log Loss: 1.8496471134880161
AdaBoostClassifier
****Results****
Accuracy: 57.3950%
F1 Score: 29.5030%
Log Loss: 3.5269687553049835
MLPClassifier
****Results****
Accuracy: 57.0646%
F1 Score: 20.3746%
Log Loss: 14.827862083519669


In [None]:
# Without PCA, RandomForestClassifier is the best model, while KNeighbors and
# MLP perform poorly. With PCA, MLP and KNeighbors perform similarly to
# DecisionTree and RandomForest.
# NOTE(review): the cell execution counts (In [7]/In [8] shown after
# In [10]/In [11]) suggest the printed scores were produced without the
# scaling/PCA step — confirm with a Restart & Run All.

# Fit the best-performing model on the training split and predict the
# held-out split; seeded so the predictions are reproducible.
model = RandomForestClassifier(random_state=seed)
model.fit(X_train, y_train)
model_predictions = model.predict(X_test)
