# The trainable option
## 1. Dataset preparation

In [None]:
## import necessary packages and utility functions
import sys
sys.path.append('../src/')
from utils import *
import os
import pickle
from time import time
import scipy.stats as stats
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,accuracy_score,precision_score,classification_report
from sklearn.preprocessing import MinMaxScaler

# Utility function to report best scores
def report(results, n_top=3):
    """Print a summary of the best-ranked hyper-parameter candidates.

    Parameters
    ----------
    results : mapping
        Search results indexed by 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params' (e.g. the ``cv_results_`` of a
        scikit-learn search object).
    n_top : int, optional
        Highest rank to report; ranks 1..n_top are printed in order.
    """
    ranks = results['rank_test_score']
    for rank in range(1, n_top + 1):
        # Several candidates can share a rank (ties), so report each of them.
        for idx in np.flatnonzero(ranks == rank):
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Mean validation score: {0:.3f} (std: {1:.3f})"
                  .format(mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")

## step 1: Registration, tracking and pairing (Modules 1-3)


## step 2: Convert the track features to spots info

In [None]:
# Data folder and movie name used by the single-movie cells below; files are
# addressed as '<root>/<m_name>/...'.
root = '../data/train_example/'
m_name = 'Film2'

In [None]:
## features -> spots
# Track-pair feature table for the selected movie.
features = pd.read_csv('{}/{}/predictions.csv'.format(root, m_name))
# All three paths live in the movie folder and share the 'r_<movie name>' stem.
r_xml_path = '{}/{}/r_{}.xml'.format(root, m_name, m_name)
r_movie = '{}/{}/r_{}'.format(root,m_name,m_name)
output_csv_path = '{}/{}/r_{}.txt'.format(root, m_name, m_name)

# features2spots is not defined in this notebook; presumably it comes from the
# `utils` star-import and writes the spots info to output_csv_path — TODO confirm.
features2spots(features,r_xml_path,r_movie,output_csv_path)

## step 3: Label the putative track pairs - see README

## step 4: Convert the track pair features & labels to a sklearn-usable format

In [None]:
# Manually curated labels for the putative track pairs of this movie.
true = pd.read_csv('{}/{}/True.csv'.format(root,m_name), index_col=0)

data = pd.read_csv('{}/{}/predictions.csv'.format(root,m_name))
# Labels are attached by position (.values), so True.csv and predictions.csv
# must list the pairs in the same order.
data['True_Label'] = true['True_pairs'].values
# Rescale contrast and intensity to [0, 1]; the scaler is refit for each column.
scaler = MinMaxScaler()
data['contrast_normalized'] = scaler.fit_transform(data['contrast'].values.reshape(-1,1))
data['intensity_normalized'] = scaler.fit_transform(data['intensity'].values.reshape(-1,1))

# Keep every positive pair and down-sample the negatives.
true = data[data['True_Label']==1]
false = data[data['True_Label']==0]
# Cap the sample size: DataFrame.sample raises ValueError when asked for more
# rows than exist, which would happen for movies with fewer than 100 negatives.
false = false.sample(min(100, len(false)), random_state=3020)
data = pd.concat([false, true], axis=0)

y = data[['True_Label']]
X = data[['center_stdev','normal_stdev','sl_f', 'sl_i', 'sl_max', 'sl_min', 't_cong',
        't_overlap', 'intensity_normalized', 'diameter', 'contrast_normalized']]

# Persist features and labels next to the movie data for the classifier cells.
X.to_csv('{}/{}/X.csv'.format(root, m_name))
y.to_csv('{}/{}/y.csv'.format(root, m_name))

## Additional: batch mode

In [None]:
# Batch mode: the movie list is taken from the sub-folders of this directory.
root = '../data/train_example'

In [None]:
# get all movienames
# os.walk silently yields nothing when `root` does not exist, so turn the
# resulting StopIteration into an explicit, readable error.
try:
    (_,movie_names,_) = next(os.walk(root))
except StopIteration:
    raise FileNotFoundError('Data folder not found: {}'.format(root)) from None
# Directory listing order depends on the filesystem; sort so that batch runs
# process (and later concatenate) the movies in a reproducible order.
movie_names = sorted(movie_names)
print('Folders detected: ')
for m in movie_names:
    print(m)

In [None]:
# Run the features -> spots conversion once per detected movie folder.
for m_name in movie_names:
    ## features -> spots
    features = pd.read_csv('{}/{}/predictions.csv'.format(root, m_name))
    # All three paths live in the movie folder and share the 'r_<movie name>' stem.
    r_xml_path = '{}/{}/r_{}.xml'.format(root, m_name, m_name)
    r_movie = '{}/{}/r_{}'.format(root,m_name,m_name)
    output_csv_path = '{}/{}/r_{}.txt'.format(root, m_name, m_name)
    # features2spots is not defined in this notebook; presumably provided by
    # the `utils` star-import — TODO confirm.
    features2spots(features,r_xml_path,r_movie,output_csv_path)

In [None]:
# Build and save the feature matrix X and label vector y for every movie folder.
for m_name in movie_names:
    # Manually curated labels for the putative track pairs of this movie.
    true = pd.read_csv('{}/{}/True.csv'.format(root,m_name), index_col=0)

    data = pd.read_csv('{}/{}/predictions.csv'.format(root,m_name))
    # Labels are attached by position (.values), so True.csv and
    # predictions.csv must list the pairs in the same order.
    data['True_Label'] = true['True_pairs'].values
    # Rescale contrast and intensity to [0, 1] per movie; the scaler is refit
    # for each column.
    scaler = MinMaxScaler()
    data['contrast_normalized'] = scaler.fit_transform(data['contrast'].values.reshape(-1,1))
    data['intensity_normalized'] = scaler.fit_transform(data['intensity'].values.reshape(-1,1))

    # Keep every positive pair and down-sample the negatives.
    true = data[data['True_Label']==1]
    false = data[data['True_Label']==0]
    # Cap the sample size: DataFrame.sample raises ValueError when asked for
    # more rows than exist, and one small movie would abort the whole batch.
    false = false.sample(min(100, len(false)), random_state=3020)
    data = pd.concat([false, true], axis=0)

    y = data[['True_Label']]
    X = data[['center_stdev','normal_stdev','sl_f', 'sl_i', 'sl_max', 'sl_min', 't_cong',
            't_overlap', 'intensity_normalized', 'diameter', 'contrast_normalized']]

    # Persist features and labels next to the movie data.
    X.to_csv('{}/{}/X.csv'.format(root, m_name))
    y.to_csv('{}/{}/y.csv'.format(root, m_name))

# 2. Classifier

## a. Hyperparameter tuning

In [None]:
# get data, if single movie
root = '../data/train_example/'
m_name = 'Film2'

# Read the saved feature matrix and labels and convert them to numpy arrays
# in one step (the first CSV column is the row index).
X = pd.read_csv('{}/{}/X.csv'.format(root, m_name), index_col=0).to_numpy()
y = pd.read_csv('{}/{}/y.csv'.format(root, m_name), index_col=0).to_numpy()

# Hold out 33% of the samples for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=12,
)

If running in batch mode, uncomment the cell below (shortcut: Ctrl+'/' or Cmd+'/').

In [None]:
## get data
# root = '../data/train_example/'
# (_,movie_names,_) = next(os.walk(root))
# x_list = []
# y_list = []
# for m in movie_names:
#     x_list.append(pd.read_csv('{}/{}/X.csv'.format(root, m),index_col=0))
#     y_list.append(pd.read_csv('{}/{}/y.csv'.format(root, m),index_col=0))
# # concatenate row-wise so the samples of all movies are stacked
# X = pd.concat(x_list,axis=0)
# y = pd.concat(y_list,axis=0)
# X = X.to_numpy()
# y = y.to_numpy()
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=12)

In [None]:
# Base estimator whose hyper-parameters are tuned below.
clf = RandomForestClassifier(criterion='gini')

# Exhaustive grid: only the number of trees varies, the other two entries
# are pinned to a single value.
param_grid = {
    'min_impurity_decrease': [0.0],
    'n_estimators': list(range(10, 200, 5)),
    'warm_start': [False],
}

# 3-fold cross-validated grid search scored on accuracy.
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=3, scoring='accuracy')
start = time()
grid_search.fit(X_train, y_train.ravel())
elapsed = time() - start
n_candidates = len(grid_search.cv_results_['params'])

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (elapsed, n_candidates))
# Show the best-ranked candidates.
report(grid_search.cv_results_)

## b. Classifier training

In [None]:
# Train the final random forest with fixed hyper-parameters
# (n_estimators=145 is presumably the grid-search winner — confirm before reuse).
final_clf = RandomForestClassifier(min_impurity_decrease=0.0,criterion='gini',warm_start=False, n_estimators=145)
# Labels are stored as a column vector; flatten them for scikit-learn.
final_clf.fit(X_train, np.ravel(y_train,order='C'))
y_pred = final_clf.predict(X_test)
y_true = np.ravel(y_test,order='C')
# Hold-out metrics; uncomment the prints below to inspect them.
pre_score = precision_score(y_true, y_pred, average='weighted')
acc_score = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true,y_pred)
# print(classification_report(y_true, y_pred))
# print(pre_score)
# print(acc_score)
# print(f1)
filename = 'myModel.sav'
# Use a context manager so the file handle is flushed and closed even if
# pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(final_clf, model_file)