# Multiclass Pipeline

#### Pipeline Parameters

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import metrics
from statistics import mode
import seaborn as sns
import tqdm
import openpyxl

import sys 
import joblib
import pickle
import itertools

sys.path.append('H:/Documents/PhD/3rd-year-project/classify-mosquitoes/src/')
import extract, split, config

np.random.seed(0)

In [None]:
# Segment size and overlap (in seconds); both are handed to
# split.split_tracks when the tracks are cut into segments.
segment_size = 6
segment_overlap = 5.5

# Trial ids, grouped visually by class.
# test_trials  : trials used to build df_train, which the cross-validation
#                folds later divide into per-fold train and test parts.
# target_trials: one class label per entry of test_trials (it matches the
#                TrialID -> Target mapping applied further down); it is not
#                referenced anywhere else in the visible code.
# hyp_trials   : held-out trials used for choosing the score threshold and
#                for feature selection.
test_trials = np.array([2,3 ,6,7,8, 11,12, 15,16])
target_trials = np.array([0,0, 1,1,1, 2,2, 3,3])
hyp_trials = np.array([0,1, 4,5, 9,10, 13,14])

# Paths
results_path = config.PATH + 'tuned model/multiclass/' # Result files (pickles, spreadsheets) are written here
data_path = results_path + 'data/' # Intermediate arrays and DataFrames are cached here

#### Feature Extraction

In [None]:
# Load the raw tracks plus their per-track targets and trial ids through the
# project loader.
tracks, trackTargets, tracksTrialId = extract.load(
    config.FILE, config.PATH, config.IS_RESISTANT, config.DATA_PATH
)

# Cache every raw array as an object-dtype .npy file so later cells can start
# from disk instead of re-running the extraction.
for file_name, values in (
    ('raw_tracks.npy', tracks),
    ('raw_trackTargets.npy', trackTargets),
    ('raw_tracksTrialId.npy', tracksTrialId),
):
    with open(data_path + file_name, 'wb') as handle:
        np.save(handle, np.array(values, dtype=object))

In [None]:
# Reload the cached raw arrays (pickled object arrays, hence allow_pickle).
tracks, trackTargets, tracksTrialId = (
    np.load(data_path + file_name, allow_pickle=True)
    for file_name in (
        'raw_tracks.npy',
        'raw_trackTargets.npy',
        'raw_tracksTrialId.npy',
    )
)

# Derive the per-point features for every track.
# NOTE(review): (0, 1) presumably selects the x/y position columns and 2 a
# time/frame column -- confirm against extract.generate_features.
tracks = extract.generate_features(tracks, (0, 1), 2)

with open(data_path + 'tracks_features.npy', 'wb') as handle:
    np.save(handle, tracks)

In [None]:
tracks = np.load(data_path + 'tracks_features.npy', allow_pickle=True)
bmodes = joblib.load('bmodess.dat')

# Append one extra column to every track: True where the value in column 16
# also occurs in column 0 of the matching bmodes entry, False otherwise.
# NOTE(review): column 16 is presumably a frame/time index -- confirm.
for track_id in range(len(tracks)):
    track = tracks[track_id]
    present = np.isin(track[:, 16], bmodes[track_id][:, 0])
    # Inserting at position len(track[0]) places the flag after the last column.
    tracks[track_id] = np.insert(track, len(track[0]), present, axis=1)

with open(data_path + 'tracks_features_gaps_marked.npy', 'wb') as handle:
    np.save(handle, np.array(tracks, dtype=object))

In [None]:
# Start from the cached gap-marked tracks and the raw targets / trial ids.
tracks, trackTargets, tracksTrialId = (
    np.load(data_path + file_name, allow_pickle=True)
    for file_name in (
        'tracks_features_gaps_marked.npy',
        'raw_trackTargets.npy',
        'raw_tracksTrialId.npy',
    )
)

# Cut every track into segments of segment_size with segment_overlap; the
# splitter also returns a group id per segment (trackGroup).
tracks, trackTargets, tracksTrialId, trackGroup = split.split_tracks(
    tracks,
    trackTargets,
    tracksTrialId,
    segment_size,
    segment_overlap,
)


In [None]:

# Persist the four segment-level arrays, one .npy file each.
for file_name, values in (
    ('tracks_split.npy', tracks),
    ('trackTargets_split.npy', trackTargets),
    ('trackGroup_split.npy', trackGroup),
    ('tracksTrialId_split.npy', tracksTrialId),
):
    with open(data_path + file_name, 'wb') as handle:
        np.save(handle, values)

In [None]:
# Names of the per-point features to summarise for every segment.
feature_columns = [
    'X Velocity',
    'Y Velocity',
    'X Acceleration',
    'Y Acceleration',
    'Velocity',
    'Acceleration',
    'Jerk',
    'Angular Velocity',
    'Angular Acceleration',
    'Angle of Flight',
    'Centroid Distance Function',
    'Persistence Velocity',
    'Turning Velocity',
]
# Passed to extract.track_stats together with feature_columns.
# NOTE(review): presumably indexes[i] is the track column holding
# feature_columns[i] -- confirm against extract.track_stats.
indexes = [12, 13, 14, 15, 3, 10, 17, 4, 11, 18, 19, 20, 21]
feature_stats = [
    'mean',
    'median',
    'std',
    '1st quartile',
    '3rd quartile',
    'kurtosis',
    'skewness',
    'number of local minima',
    'number of local maxima',
    'number of zero-crossings',
]

# One (initially empty) list per "<feature> (<statistic>)" column.
track_statistics = {
    f'{col} ({stat})': []
    for col in feature_columns
    for stat in feature_stats
}

# Collect the statistics of every segment, keyed by the column names above.
for track in tracks:
    data = extract.track_stats(track, indexes=indexes, columns=feature_columns)
    for stat_name in data:
        track_statistics[stat_name].append(data[stat_name])

# Statistics first, then the extra features computed by the project helper.
df = pd.DataFrame(data=track_statistics)
to_add = extract.add_other_features(tracks, (0, 1))
df = pd.concat([df, to_add], axis=1)

# Attach the trial id so rows can later be filtered by trial.
trial_id_frame = pd.DataFrame({'TrialID': tracksTrialId})
df = df.join(trial_id_frame)

df_target = pd.DataFrame({
    'Target': trackTargets,
    'TrialID': tracksTrialId,
    'TrackGroup': trackGroup,
})

for frame, file_name in ((df, 'df.pkl'), (df_target, 'df_target.pkl')):
    frame.to_pickle(data_path + file_name)

In [None]:
# Resume from the cached feature and target frames.
df, df_target = (
    pd.read_pickle(data_path + file_name)
    for file_name in ('df.pkl', 'df_target.pkl')
)
#tracks = np.load(data_path + 'tracks_split.npy', allow_pickle=True)

In [None]:
# Overwrite Target with the class implied by the trial id.
trials_per_class = {
    0: [0, 1, 2, 3],
    1: [4, 5, 6, 7, 8],
    2: [9, 10, 11, 12],
    3: [13, 14, 15, 16],
}
for class_label, class_trials in trials_per_class.items():
    df_target.loc[df_target['TrialID'].isin(class_trials), 'Target'] = class_label
df_target

In [None]:
def penalty_function(segment, n, m):
    """Score a segment from its per-position mask; higher means more zeros.

    The mask is walked once with a running counter ``k`` that starts at 0.
    Every position equal to 0 adds ``n * m ** k`` to the score and then
    increments ``k``; every other position decrements ``k`` (never below 0).
    With ``m > 1`` consecutive zeros therefore cost progressively more.

    Args:
        segment: Sized iterable of mask values (e.g. a 1-D array or list).
        n: Base penalty added for a zero position.
        m: Multiplicative growth factor applied per unit of ``k``.

    Returns:
        The accumulated penalty divided by the number of positions, or
        ``0.0`` for an empty segment (previously a ZeroDivisionError).
    """
    length = len(segment)
    if length == 0:
        # Nothing to penalise; also avoids dividing by zero below.
        return 0.0

    penalty_score = 0
    k = 0

    for position in segment:
        if position == 0:
            penalty_score += n * (m ** k)
            k += 1
        else:
            k = max(0, k - 1)

    return penalty_score / length

# Penalty score of every segment, computed from its last column (the flag
# column appended during gap marking), then cached to disk.
scores = np.array([
    penalty_function(segment[:, -1], n=1, m=1.05)
    for segment in tracks
])
joblib.dump(scores, data_path + 'scores.dat')

In [None]:
# Resume from disk: penalty scores plus the feature / target frames.
scores = joblib.load(data_path + 'scores.dat')
df, df_target = (
    pd.read_pickle(data_path + file_name)
    for file_name in ('df.pkl', 'df_target.pkl')
)

# Overwrite Target with the class implied by the trial id.
for class_label, class_trials in (
    (0, [0, 1, 2, 3]),
    (1, [4, 5, 6, 7, 8]),
    (2, [9, 10, 11, 12]),
    (3, [13, 14, 15, 16]),
):
    df_target.loc[df_target['TrialID'].isin(class_trials), 'Target'] = class_label

In [None]:
def run_score_threshold_mutual_info(df, df_target, scores):
    """Pick a penalty-score cut-off from per-feature mutual information.

    250 evenly spaced thresholds between 0 and ``max(scores)`` are tried.
    For each one, only the rows whose score is <= the threshold are kept,
    rows containing NaN are dropped, and the mutual information between
    every remaining feature and ``Target`` is estimated. For every feature
    the threshold giving the highest mutual information is remembered (ties
    go to the larger threshold). The result is the average of those
    per-feature thresholds, weighted by a softmax of
    ``mutual_info / (threshold + 1)``.

    Args:
        df: Feature frame; must contain a ``TrialID`` column (dropped before
            the mutual-information estimate).
        df_target: Frame aligned row-for-row with ``df``; must contain
            ``Target``.
        scores: 1-D array of penalty scores, positionally aligned with the
            rows of ``df``.

    Returns:
        The weighted-average score threshold (float).

    Raises:
        ValueError: If no threshold leaves any usable rows.
    """
    score_thresholds = np.linspace(0, max(scores), 250)
    # feature name -> best {'mutual_info', 'score_threshold'} seen so far.
    # Dict insertion order follows the column order, so the feature order
    # (and hence the float summation order below) is the same on every run;
    # the previous list(set(...)) order depended on string hashing.
    max_mutual_info_dict = {}
    for threshold in tqdm.tqdm(score_thresholds):
        mask = np.where(scores <= threshold)[0]
        df_temp = df.iloc[mask]
        df_target_temp = df_target.iloc[mask]

        # Drop rows with any NaN from both frames to keep them aligned.
        indexes = df_temp[df_temp.isna().any(axis=1)].index
        df_temp = df_temp.drop(index=indexes)
        df_target_temp = df_target_temp.drop(index=indexes)

        df_temp = extract.remove_nans(df_temp)

        df_temp = df_temp.drop(columns=['TrialID'])
        df_target_temp = df_target_temp['Target']

        if df_temp.empty:
            # No segment is clean enough at this threshold; the estimator
            # cannot run on zero samples, so move on to the next threshold.
            continue

        mutual_info_values = mutual_info_classif(df_temp, df_target_temp)

        for feature, mutual_info_value in zip(df_temp.columns, mutual_info_values):
            best = max_mutual_info_dict.get(feature)
            improves = best is None or best['mutual_info'] < mutual_info_value
            ties_at_larger_threshold = (
                best is not None
                and best['mutual_info'] == mutual_info_value
                and best['score_threshold'] < threshold
            )
            if improves or ties_at_larger_threshold:
                max_mutual_info_dict[feature] = {
                    'mutual_info': mutual_info_value,
                    'score_threshold': threshold
                }

    if not max_mutual_info_dict:
        raise ValueError('no score threshold left any rows to estimate mutual information on')

    best_per_feature = list(max_mutual_info_dict.values())
    corresponding_values = [best['score_threshold'] for best in best_per_feature]
    max_values = [best['mutual_info'] for best in best_per_feature]

    # Softmax weights: high mutual information at a low threshold counts most.
    exp = np.exp(np.array(max_values) / (np.array(corresponding_values) + 1))
    weights = exp / np.sum(exp)

    weighted_average_threshold = np.average(corresponding_values, weights=weights)

    return weighted_average_threshold

In [None]:
# Tune the score threshold on the held-out (hyp) trials only.
hyp_feature_rows = df['TrialID'].isin(hyp_trials)
hyp_target_rows = df_target['TrialID'].isin(hyp_trials)
score_threshold = run_score_threshold_mutual_info(
    df[hyp_feature_rows],
    df_target[hyp_target_rows],
    scores[hyp_target_rows],
)

In [None]:
# Keep only the segments whose penalty score is within the chosen threshold.
mask = np.where(scores <= score_threshold)[0]
df, df_target = df.iloc[mask], df_target.iloc[mask]

# Remove rows that still contain NaN from both frames so they stay aligned.
has_nan = df.isna().any(axis=1)
indexes = df[has_nan].index
df, df_target = df.drop(index=indexes), df_target.drop(index=indexes)

# Final clean-up by the project helper.
df = extract.remove_nans(df)

In [None]:
# Cache the filtered frames.
for frame, file_name in (
    (df, 'df_filtered.pkl'),
    (df_target, 'df_target_filtered.pkl'),
):
    frame.to_pickle(data_path + file_name)

#### Split Train-Test/Validation sets

In [None]:
# Resume from the cached filtered frames.
df, df_target = (
    pd.read_pickle(data_path + file_name)
    for file_name in ('df_filtered.pkl', 'df_target_filtered.pkl')
)

In [None]:
# Rows belonging to the cross-validation trials (features and targets).
df_train = df.loc[df['TrialID'].isin(test_trials)]
df_train_target = df_target.loc[df_target['TrialID'].isin(test_trials)]

# Rows belonging to the held-out trials used for feature selection.
df_hyp = df.loc[df['TrialID'].isin(hyp_trials)]
df_hyp_target = df_target.loc[df_target['TrialID'].isin(hyp_trials)]

#### Feature Selection

In [None]:
# The trial id is bookkeeping, not a feature.
df_hyp = df_hyp.drop(columns=['TrialID'])

# One feature frame per class, selected through the index labels of the
# target rows carrying that class.
df_0, df_1, df_2, df_3 = (
    df_hyp.loc[df_hyp_target[df_hyp_target['Target'] == class_label].index.values]
    for class_label in (0, 1, 2, 3)
)

In [None]:
def feature_selection(data0, data1):
    """Return the columns that differ significantly between two groups.

    A Mann-Whitney U test is run on the two frames (column-wise under SciPy's
    default ``axis=0``) and the resulting p-values are corrected with Holm's
    method at ``alpha=0.05``.

    Args:
        data0: DataFrame of features for the first group.
        data1: DataFrame of features for the second group, with the same
            columns in the same order as ``data0``.

    Returns:
        List of column names of ``data0`` whose null hypothesis is rejected.
    """
    _, p_val = mannwhitneyu(data0, data1)
    rej, _, _, _ = multipletests(p_val, alpha=0.05, method='holm')
    # Take the names from the input itself rather than from the global
    # df_hyp, so the result stays correct if df_hyp is changed afterwards.
    return list(data0.columns[rej])

# Union of the features that separate at least one pair of classes.
columns = []
df_all = list(itertools.combinations([df_0, df_1, df_2, df_3], 2))
for d1, d2 in df_all:
    columns += feature_selection(d1, d2)

columns = list(set(columns))

# drop=True discards the old index instead of turning it into an 'index'
# column; otherwise that column takes part in the correlation matrix (a
# feature could be dropped merely for tracking row order) and re-running the
# cell fails because df_hyp has gained a column.
df_hyp = df_hyp.reset_index(drop=True)
corr_matrix = df_hyp.corr(method='spearman').abs()
# Keep only the strict upper triangle so each pair is looked at once.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# NOTE(review): correlation is checked against every column of df_hyp, not
# only the selected ones -- confirm this is intended.
to_drop = [column for column in upper.columns if any(upper[column] > 0.85)]

cols = np.setdiff1d(columns, to_drop)

In [None]:
# Selected feature names, excluding anything that mentions the trial id.
features = [feat for feat in cols if 'TrialID' not in feat]

In [None]:
# Selected feature names, excluding anything that mentions the trial id.
features = [feat for feat in cols if 'TrialID' not in feat]

# One feature name per line; the context manager closes the file even if a
# write fails.
with open(data_path + 'features.txt', 'w+') as file:
    file.writelines(feat + '\n' for feat in features)

In [None]:
# Read the feature names back (one per line). Blank lines are skipped, so a
# missing trailing newline no longer raises, and the handle is always closed.
with open(data_path + 'features.txt', 'r') as features_file:
    features = [line for line in features_file.read().split('\n') if line]

In [None]:
# Cell output: number of feature names in the selected list.
len(features)

## Model

#### Create Folds

In [None]:
def create_folds():
    """Enumerate the training-trial combinations used for cross-validation.

    Each fold is a list of four trial ids ``[a, d, b, c]`` with ``a`` taken
    from ``[2, 3]``, ``d`` from ``[15, 16]``, ``b`` from ``[6, 7, 8]`` and
    ``c`` from ``[11, 12]``. All 24 combinations are returned, with the
    ``(a, d)`` pair varying slowest.
    """
    outer_pairs = itertools.product([2, 3], [15, 16])
    # Materialised because it is iterated once per outer pair.
    inner_pairs = list(itertools.product([6, 7, 8], [11, 12]))
    return [
        [*outer, *inner]
        for outer in outer_pairs
        for inner in inner_pairs
    ]


In [None]:
# Record the training trials of every fold, one list per line; the context
# manager closes the file even if a write fails.
a = create_folds()
with open(data_path+'folds.txt', 'w+') as file:
    file.writelines(str(f) + '\n' for f in a)

#### Model Building

In [None]:
def get_track_prediction(y_true, scores, preds, groups, classes):
    """Aggregate per-segment class scores into one prediction per group.

    For every distinct value in ``groups`` (in order of first appearance) the
    score rows of its segments are averaged column-wise; the column with the
    highest mean decides the predicted class.

    Args:
        y_true: pandas Series of per-segment true labels.
        scores: 2-D array of per-segment class scores, rows positionally
            aligned with ``y_true`` and ``groups``.
        preds: Per-segment predictions; accepted but not used.
        groups: pandas Series giving the group id of every segment.
        classes: Sequence mapping a score column index to its class label.

    Returns:
        Tuple ``(track_true, track_preds, avg_scores)`` of lists with one
        entry per group: the most common true label, the predicted class and
        the winning mean score.
    """
    true_values = y_true.values
    track_true, track_preds, avg_scores = [], [], []

    for group_id in groups.unique():
        rows = np.nonzero(np.asarray(groups == group_id))[0]
        mean_scores = np.mean(scores[rows], axis=0)
        best_column = np.argmax(mean_scores)

        track_true.append(mode(true_values[rows]))
        avg_scores.append(mean_scores[best_column])
        track_preds.append(classes[best_column])

    return track_true, track_preds, avg_scores


def produce_report(y_test, y_pred, scores):
    """Summarise predictions as accuracy, balanced accuracy and a full report.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
        scores: Accepted but not used.

    Returns:
        Dict with keys ``'accuracy'``, ``'balanced accuracy'`` and
        ``'report'`` (the classification report as a nested dict).
    """
    return {
        'accuracy': metrics.accuracy_score(y_test, y_pred),
        'balanced accuracy': metrics.balanced_accuracy_score(y_test, y_pred),
        'report': metrics.classification_report(y_test, y_pred, output_dict=True),
    }

In [None]:
def xgboost_model(x_train, y_train, x_test, y_test, params):
    """Fit an XGBoost classifier and evaluate it at track level.

    The model is trained on segments, then the per-segment class
    probabilities of ``x_test`` are averaged within each ``TrackGroup`` to
    obtain one prediction per track.

    Args:
        x_train: Training feature matrix.
        y_train: Frame with a ``Target`` column for the training rows.
        x_test: Evaluation feature matrix.
        y_test: Frame with ``Target`` and ``TrackGroup`` columns, rows
            aligned positionally with ``x_test``.
        params: Extra keyword arguments forwarded to ``XGBClassifier``.

    Returns:
        Tuple ``(report, track_labels, model)`` where ``report`` comes from
        ``produce_report``, ``track_labels`` is a dict with the per-track
        predictions under ``'track-preds'`` and the per-track true labels
        under ``'track-target'``, and ``model`` is the fitted classifier.
    """
    model = XGBClassifier(
        objective='multi:softmax',
        # XGBoost's parameter is `num_class`; the previous `num_classes` was
        # not a recognised parameter.
        num_class=4,
        **params
    )
    model.fit(x_train, y_train['Target'])
    y_pred = model.predict(x_test)
    scores = model.predict_proba(x_test)
    track_true, track_preds, avg_scores = get_track_prediction(y_test['Target'], scores, y_pred, y_test['TrackGroup'], model.classes_)
    # The two entries used to be swapped (true labels were stored under
    # 'track-preds'), which transposed every downstream confusion matrix.
    return produce_report(track_true, track_preds, avg_scores), {
        'track-preds': track_preds,
        'track-target': track_true
    }, model

In [None]:
# Per-fold reports and per-fold track-level labels, for train and test.
results = dict(
    xgboost_train = [],
    xgboost_test = []
)

train_track_preds = []
test_track_preds = []

folds = create_folds()

# df_train_target was produced by filtering another frame; work on an
# explicit copy so the column assignment below does not trigger pandas'
# SettingWithCopyWarning.
df_train_target = df_train_target.copy()
df_train_target['Target'] = df_train_target['Target'].astype(int)

# Single source of truth for the hyperparameters of both evaluations below.
xgb_params = dict(
    random_state=0,
    learning_rate=0.3,
    n_estimators=200,
    max_depth=5,
    subsample=0.5,
    colsample_bytree=0.5,
    reg_alpha=0.1,
    reg_lambda=0.1,
    min_child_weight=5
)

for index, fold in enumerate(folds):
    print(f' --- FOLD {index} ---')
    train_trials = fold
    mask = df_train_target['TrialID'].isin(train_trials)

    train = df_train[mask]
    train_targets = df_train_target[mask]

    test = df_train[~mask]
    test_targets = df_train_target[~mask]

    # Fit the scaler on the training rows only, then apply it to both parts.
    scaler = StandardScaler()
    train = scaler.fit_transform(train[features])
    test = scaler.transform(test[features])

    # Oversample the training part only; evaluation data stays untouched.
    sm = SMOTE(
        random_state=0
    )
    train_os, train_targets_os = sm.fit_resample(train, train_targets.drop(columns=['TrialID','TrackGroup']))
    train_targets_os = train_targets_os.astype(int)

    # Evaluate on the (non-oversampled) training rows.
    xg_scores, segment_scores, model = xgboost_model(
        x_train=train_os,
        y_train=train_targets_os,
        x_test=train,
        y_test=train_targets,
        params=dict(xgb_params))
    results['xgboost_train'].append(xg_scores)
    train_track_preds.append(segment_scores)

    # Evaluate on the held-out rows of this fold; this is the model that is
    # dumped below.
    xg_scores, segment_scores, model = xgboost_model(
        x_train=train_os,
        y_train=train_targets_os,
        x_test=test,
        y_test=test_targets,
        params=dict(xgb_params))
    results['xgboost_test'].append(xg_scores)
    test_track_preds.append(segment_scores)

    # Everything needed to analyse this fold's model later.
    joblib.dump(dict(
        model=model,
        df_train=df_train,
        df_train_target=df_train_target,
        features=features,
        test=test,
        mask=mask,
        train_os=train_os,
    ), data_path+f'shap/xgboost_shap_dump_{index}.dat')

In [None]:
# Pickle the cross-validation outputs, one file per object.
for file_name, payload in (
    ('train_track_preds.pkl', train_track_preds),
    ('results.pkl', results),
    ('test_track_preds.pkl', test_track_preds),
):
    with open(results_path+file_name, 'wb') as handle:
        pickle.dump(payload, handle)

In [None]:
# Per-class accuracy on the test part of every fold: among the tracks whose
# 'track-target' entry is the class, the fraction whose 'track-preds' entry
# is that class too.
class_accuracies = {class_label: [] for class_label in (0, 1, 2, 3)}

for s in test_track_preds:
    preds = s['track-preds']
    actual = s['track-target']#.values

    for class_label, fold_accuracies in class_accuracies.items():
        indices = [i for i, x in enumerate(actual) if x == class_label]
        if not indices:
            # No track of this class in the fold: record NaN instead of
            # dividing by zero; the NaN-aware summaries below skip it.
            fold_accuracies.append(float('nan'))
            continue
        n_correct = sum(preds[i] == class_label for i in indices)
        fold_accuracies.append(n_correct / len(indices))

# Keep the original per-class names available.
acc_0, acc_1, acc_2, acc_3 = (class_accuracies[class_label] for class_label in (0, 1, 2, 3))

# "mean (min - max)" over folds, one line per class.
for fold_accuracies in (acc_0, acc_1, acc_2, acc_3):
    print(f'{round(np.nanmean(fold_accuracies), 3)} ({round(np.nanmin(fold_accuracies), 3)} - {round(np.nanmax(fold_accuracies), 3)})')


In [None]:
# Persist the per-fold reports under the multiclass-specific file name.
results_file = results_path+'multiclass-results.pkl'
with open(results_file, 'wb') as handle:
    pickle.dump(results, handle)

In [None]:
# Reload the per-fold reports written by this notebook.
results_file = results_path+'multiclass-results.pkl'
with open(results_file, 'rb') as handle:
    results = pickle.load(handle)

In [None]:

# Summary spreadsheet: one row per model/split with "mean (min - max)" over
# the folds for every metric.
wb = openpyxl.Workbook()
sheet = wb.create_sheet()

row = 2
metrics_list = ['accuracy', 'balanced accuracy']
for i, column in enumerate(['model'] + metrics_list):
    sheet.cell(row=1, column=i+1).value = column

# Loop names are chosen not to overwrite the notebook-level `model` (fitted
# classifier) and `scores` (penalty scores) variables.
for model_name in ['xgboost']:
    for model_type in ['train', 'test']:
        results_key = model_name+'_'+model_type
        try:
            sheet.cell(row=row, column=1).value = f'{model_name.upper()} {model_type.upper()}'
            for j, metric in enumerate(metrics_list):
                fold_scores = [fold_result[metric] for fold_result in results[results_key]]
                sheet.cell(row=row, column=j+2).value = f'{round(np.mean(fold_scores), 3)} ({round(min(fold_scores), 3)} - {round(max(fold_scores), 3)})'
            row += 1
        except Exception as exc:
            # Still best-effort (a missing or empty entry must not stop the
            # export), but say what was skipped instead of hiding it.
            print(f'Skipping {results_key} in summary workbook: {exc!r}')

wb.save(results_path + 'multiclass-scores.xlsx')

#### Graphs

In [None]:
'''CONFUSION MATRIX'''

def plot_confusion_matrix(target, preds, classifier, labels, normalize=True):
    """Draw a confusion-matrix heatmap of true versus predicted labels.

    This replaces two same-named definitions, of which only the second (the
    row-normalised one) was ever reachable; the raw-count variant is kept
    behind ``normalize=False``.

    Args:
        target: True labels.
        preds: Predicted labels.
        classifier: Name of the classifier; currently unused because the
            plot title is disabled.
        labels: Tick labels for both axes, in confusion-matrix order.
        normalize: When True (default) every row is divided by its total and
            shown as a percentage; when False raw counts are shown.
    """
    cm = metrics.confusion_matrix(target, preds)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Rows with no true samples stay at 0 instead of producing 0/0 NaNs.
        values = np.divide(
            cm.astype('float'),
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )
        cell_format = '.2%'
    else:
        values = cm
        cell_format = 'g'

    plt.figure(dpi=300)
    sns.heatmap(values, annot=True, xticklabels=labels, yticklabels=labels, fmt=cell_format, cmap="flare")
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()

# Axis labels, in class order.
labels = ['Banfora', 'Kisumu', 'Ngoussu', 'VK7']

# Pool the track-level labels of all folds and draw one matrix per split.
for fold_outputs, classifier_name in (
    (train_track_preds, 'XGBoost (train)'),
    (test_track_preds, 'XGBoost (test)'),
):
    target = [value for fold_output in fold_outputs for value in fold_output['track-target']]
    preds = [value for fold_output in fold_outputs for value in fold_output['track-preds']]
    plot_confusion_matrix(target, preds, classifier_name, labels)


In [None]:
'''EXCEL FILE OF ALL FOLD SCORES'''

# One sheet per split, one row per fold: fold number, test trials, train
# trials and the two accuracy metrics.
wb = openpyxl.Workbook()

# Convert NumPy scalars to plain Python numbers once, up front, so the
# stringified lists read "[2, 3]" rather than "[np.int64(2), np.int64(3)]"
# under NumPy 2.
all_ids = [
    trial_id.item() if hasattr(trial_id, 'item') else trial_id
    for trial_id in df_train_target['TrialID'].unique()
]
sheet_columns = ['fold', 'test trials', 'train trials', 'accuracy', 'balanced accuracy']

for key in ['xgboost_train','xgboost_test']:
    sheet = wb.create_sheet(key.upper())
    for column_number, header in enumerate(sheet_columns, start=1):
        sheet.cell(row=1, column=column_number).value = header

    for fold_number, fold_result in enumerate(results[key]):
        fold_train_trials = folds[fold_number]
        for column_number, header in enumerate(sheet_columns, start=1):
            if header == 'fold':
                value = fold_number
            elif header == 'test trials':
                # Every cross-validation trial that is not trained on.
                value = str([x for x in all_ids if x not in fold_train_trials])
            elif header == 'train trials':
                value = str(fold_train_trials)
            else:
                value = fold_result[header]
            sheet.cell(row=fold_number+2, column=column_number).value = value

wb.save(results_path + "multiclass-all-folds.xlsx")
