# 1. Dataset preparation

In [7]:
## import necessary packages and utility functions
from utils import *
import os
import pickle
from time import time
import scipy.stats as stats
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,accuracy_score,precision_score,classification_report
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle


# Utility function to report best scores
def report(results, n_top=3):
    """Print the best-ranked candidates of a hyperparameter search.

    Parameters
    ----------
    results : mapping
        Must provide the keys 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params', each indexable by candidate position
        (the notebook passes a fitted search's ``cv_results_``).
    n_top : int, default 3
        Ranks 1 through ``n_top`` are reported. Every candidate tied at a
        rank is printed, so more than ``n_top`` entries may appear.

    Returns
    -------
    None
        The summary is written to standard output only.
    """
    score_line = "Mean validation score: {0:.3f} (std: {1:.3f})"
    for rank in range(1, n_top + 1):
        # Positions of all candidates that share this rank (ties included).
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied:
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print(score_line.format(mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")



## a. Convert track features to spots info
This step assumes that the registration and pairing steps have already been completed.

In [2]:
# Working directory for this trial; every relative path below resolves against it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
WORK_DIR = '/Users/yifan/Dropbox/ZYF/dev/centracker_debug/train_trial/'
os.chdir(WORK_DIR)

## features -> spots
# Arguments for features2spots (not defined in this notebook; presumably
# provided by the `utils` star-import — TODO confirm).
movie = 'r_Film2'
r_xml_path = 'r_Film2.xml'
output_csv_path = 'spots_all.csv'
framerate = 29.98
# presumably per-axis scale factors for x / y / z — TODO confirm units
conversion = {'x': 0.3002646, 'y': 0.3002646, 'z':0.5}

# Track-pair features table, read from the working directory.
features = pd.read_csv('predictions.csv')

features2spots(features, r_xml_path, movie, framerate, conversion, output_csv_path)

Number of cells: 431


## b. Manually validate the track pairs
- via MATLAB GUI
- generate 1/0 (True/False) labels on the *predictions.csv* file

## c. Convert the track pair features & labels to a sklearn-usable format

In [4]:
from sklearn.preprocessing import MinMaxScaler

# Labelled track pairs; the 'True_Label' column holds the 1/0 label.
data = pd.read_csv('r_Film2_labels.csv')

# Min-max scale the two columns independently. fit_transform() re-fits the
# scaler on every call, so each column is scaled by its own min/max.
# NOTE(review): the scaler is fit on the whole labelled set (before the
# train/test split done in the classifier section) and is not persisted —
# confirm this is intended.
scaler = MinMaxScaler()
for raw_col in ('contrast', 'intensity'):
    scaled = scaler.fit_transform(data[raw_col].values.reshape(-1,1))
    # fit_transform returns an (n, 1) array; flatten it to a plain 1-D column.
    data[raw_col + '_normalized'] = scaled.ravel()

# Keep every positive pair and down-sample the negatives (seeded, so the same
# rows are drawn on every run).
N_FALSE = 100
true = data[data['True_Label']==1]
false = data[data['True_Label']==0]
# DataFrame.sample raises ValueError when asked for more rows than exist
# (without replacement), so cap the request at the number of negatives.
false = false.sample(min(N_FALSE, len(false)), random_state=3020)
data = pd.concat([false, true], axis=0)

y = data[['True_Label']]
X = data[['center_stdev','normal_stdev','sl_f', 'sl_i', 'sl_max', 'sl_min', 't_cong',
        't_overlap', 'intensity_normalized', 'diameter', 'contrast_normalized']]

# The DataFrame index is written as the first CSV column; the tuning cell
# reads it back with index_col=0.
X.to_csv('X.csv')
y.to_csv('y.csv')

# 2. Classifier

## a. Hyperparameter tuning

In [9]:
# get data
# Reload the features and labels written by the dataset-preparation step;
# the first CSV column is the saved DataFrame index.
X = pd.read_csv('X.csv',index_col=0)
y = pd.read_csv('y.csv',index_col=0)
X = X.to_numpy()
y = y.to_numpy()

# One seed for both the split and the forest, so a fresh run reproduces the
# same candidate ranking instead of a different "best" n_estimators each time.
SEED = 12
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=SEED)


# build a classifier
clf = RandomForestClassifier(criterion='gini', random_state=SEED)


# use a full grid over all parameters
# Only n_estimators actually varies; the other two entries hold a single value.
param_grid = {'min_impurity_decrease':[0.0],
              'n_estimators': list(range(10,200,5)),
              'warm_start':[False]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid,cv=3,scoring='accuracy')
start = time()
# y_train is a column array of shape (n, 1); ravel() gives the 1-D labels fit() expects.
grid_search.fit(X_train, y_train.ravel())

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)

GridSearchCV took 11.31 seconds for 38 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.894 (std: 0.037)
Parameters: {'min_impurity_decrease': 0.0, 'n_estimators': 145, 'warm_start': False}

Model with rank: 2
Mean validation score: 0.885 (std: 0.012)
Parameters: {'min_impurity_decrease': 0.0, 'n_estimators': 95, 'warm_start': False}

Model with rank: 3
Mean validation score: 0.876 (std: 0.032)
Parameters: {'min_impurity_decrease': 0.0, 'n_estimators': 25, 'warm_start': False}

Model with rank: 3
Mean validation score: 0.876 (std: 0.032)
Parameters: {'min_impurity_decrease': 0.0, 'n_estimators': 75, 'warm_start': False}



## b. Classifier training

In [10]:
# Train the final model. n_estimators=145 matches the rank-1 candidate in the
# recorded grid-search output; random_state makes the fitted (and pickled)
# model reproducible and reuses the seed of the train/test split.
final_clf = RandomForestClassifier(min_impurity_decrease=0.0, criterion='gini', warm_start=False,
                                   n_estimators=145, random_state=12)
final_clf.fit(X_train, np.ravel(y_train,order='C'))

# Evaluate on the held-out split.
y_pred = final_clf.predict(X_test)
y_true = np.ravel(y_test,order='C')
pre_score = precision_score(y_true, y_pred, average='weighted')
acc_score = accuracy_score(y_true, y_pred)
# Store the result under its own name: assigning it to `f1_score` would shadow
# the imported metric function, so re-running this cell would raise TypeError.
f1 = f1_score(y_true,y_pred)
print(classification_report(y_true, y_pred))
print(pre_score)
print(acc_score)
print(f1)

# Persist the fitted model; the context manager closes the file handle even
# if pickling fails.
filename = 'myModel.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(final_clf, model_file)

              precision    recall  f1-score   support

           0       0.78      0.91      0.84        34
           1       0.81      0.59      0.68        22

    accuracy                           0.79        56
   macro avg       0.79      0.75      0.76        56
weighted avg       0.79      0.79      0.78        56

0.7897321428571429
0.7857142857142857
0.6842105263157896
