In [2]:
# import beyourself
import pandas as pd
import numpy as np
import os
from sklearn.manifold import TSNE
import time
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import confusion_matrix, mean_squared_error

In [3]:
def standard_normalizer(x):
    x = x.T
    # compute the mean and standard deviation of the input
    x_means = np.mean(x,axis = 1)[:,np.newaxis]
    x_stds = np.std(x,axis = 1)[:,np.newaxis]   

    # create standard normalizer function based on input data statistics
    normalizer = lambda data: ((data.T - x_means)/x_stds).T
    
    # return normalizer and inverse_normalizer
    return normalizer

In [4]:
def calc_multi_cm(y_gt, y_pred):    
    # ct = pd.crosstab(y_gt, y_pred, rownames=['True'], colnames=['Predicted'], margins=True).apply(lambda r: r/r.sum(), axis=1)
    ct = pd.crosstab(y_gt, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)
    # print(ct)
    # ct.to_csv(cm_file)

    # Compute confusion matrix
    multi_cm = confusion_matrix(y_gt, y_pred)
    
    accuracy = sum(multi_cm[i,i] for i in range(len(set(y_gt))))/sum(sum(multi_cm[i] for i in range(len(set(y_gt)))))
    recall_all = sum(multi_cm[i,i]/sum(multi_cm[i,j] for j in range(len(set(y_gt)))) for i in range(len(set(y_gt))))/(len(set(y_gt)))
    precision_all = sum(multi_cm[i,i]/sum(multi_cm[j,i] for j in range(len(set(y_gt)))) for i in range(len(set(y_gt))))/(len(set(y_gt)))
    fscore_all = sum(2*(multi_cm[i,i]/sum(multi_cm[i,j] for j in range(len(set(y_gt)))))*(multi_cm[i,i]/sum(multi_cm[j,i] for j in range(len(set(y_gt)))))/(multi_cm[i,i]/sum(multi_cm[i,j] for j in range(len(set(y_gt))))+multi_cm[i,i]/sum(multi_cm[j,i] for j in range(len(set(y_gt))))) for i in range(len(set(y_gt))))/len(set(y_gt))
    
    result={}

    for i in np.unique(y_gt):

        i_gt = (y_gt==i).astype(int)
        i_pred = (y_pred==i).astype(int)

        cm = confusion_matrix(i_gt, i_pred)

        i_result = {}

        TP = cm[1,1]
        FP = cm[0,1]
        TN = cm[0,0]
        FN = cm[1,0]
        # Precision for Positive = TP/(TP + FP)
        prec_pos = TP/(TP + FP)
        # F1 score for positive = 2 * precision * recall / (precision + recall)….or it can be F1= 2*TP/(2*TP + FP+ FN)
        f1_pos = 2*TP/(TP*2 + FP+ FN)
        # TPR = TP/(TP+FN)
        TPR = cm[1,1]/sum(cm[1,j] for j in range(len(set(i_gt))))

        i_result = {'recall': TPR, 'precision': prec_pos, 'f1': f1_pos}

        result[str(int(i))] = i_result

    ave_result = {'accuracy':accuracy, 'recall_all':recall_all, 'precision_all':precision_all, 'fscore_all':fscore_all}
    result['average'] = ave_result

    return result, multi_cm


In [15]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold,KFold

#read data first
fd_folder='All/fd/fd_groundtruthsegmentation.csv'
dd_folder='All/dd/dd_groundtruthsegmentation.csv'
#sd_folder='Groundtruthsegmentation_all/sd/sd_groundtruthsegmentation.csv'
null_folder='All/null/null_groundtruthsegmentation.csv'
fd,dd=pd.read_csv(fd_folder),pd.read_csv(dd_folder)
null=pd.read_csv(null_folder)
#definle label AND normalize
positive_group=pd.concat([fd,dd])
positive_group['label']=1
null['label']=0
total_group=pd.concat([positive_group,null])

#cluster number
cluster_number=2

clf_list=[svm.SVC(),RandomForestClassifier(),LogisticRegression()]
clf_names=['svm','rf','lr']

skf = StratifiedKFold(n_splits=10)

for clf_name in clf_names:
    vars()['gt_all_folds_'+clf_name] = []
    vars()['pd_all_folds_'+clf_name] = []

iFold = 0
for XY_trn_idx, XY_test_idx in skf.split(total_group.values[:,:-1],
                                        total_group['label'].values):
    
    print('------iFold:'+str(iFold)+'---------------------')


    trn_group = total_group.iloc[XY_trn_idx]
    test_group = total_group.iloc[XY_test_idx]
    X_train, X_test = trn_group.values[:,:-1], test_group.values[:,:-1]
    y_train, y_test = trn_group.values[:,-1], test_group.values[:,-1]
    
    #fit the data
    kmeans = MiniBatchKMeans(n_clusters=cluster_number).fit(X_train)

    # train
    # ........assign each data a label
    trn_group.loc[:,'cluster_id']=kmeans.predict(X_train)

    for i in range(cluster_number):
        vars()['XY_trn_cluster_'+str(i)] = trn_group[trn_group['cluster_id']==i]

        for (clf_type, clf_name) in zip(clf_list, clf_names):
            vars()['cluster'+str(i)+'_'+clf_name] = clf_type.fit(vars()['XY_trn_cluster_'+str(i)].values[:,:-2],
                                                                 vars()['XY_trn_cluster_'+str(i)]['label'].values)
    
    # test
    # ........assign each data a label
    test_group.loc[:,'cluster_id']=kmeans.predict(X_test)
    
    for clf_name in clf_names:
        vars()['gt_all_clusters_'+clf_name] = []
        vars()['pd_all_clusters_'+clf_name] = []
    
        for i in range(cluster_number):
            vars()['XY_test_cluster_'+str(i)] = test_group[test_group['cluster_id']==i]
            vars()['gt_all_clusters_'+clf_name].append(vars()['XY_test_cluster_'+str(i)]['label'].values)
            vars()['pd_all_clusters_'+clf_name].append(
                vars()['cluster'+str(i)+'_'+clf_name].predict(vars()['XY_test_cluster_'+str(i)].values[:,:-2]))
        
        vars()['gt_all_clusters_'+clf_name]=np.concatenate(vars()['gt_all_clusters_'+clf_name])
        vars()['pd_all_clusters_'+clf_name]=np.concatenate(vars()['pd_all_clusters_'+clf_name])

        result,_cm = calc_multi_cm(vars()['gt_all_clusters_'+clf_name], vars()['pd_all_clusters_'+clf_name])
        print('---------------------'+clf_name+'----------------')
        print(result['1']['f1'])
#         print(result)
        print(_cm)

        vars()['pd_all_folds_'+clf_name].append(vars()['pd_all_clusters_'+clf_name])
        vars()['gt_all_folds_'+clf_name].append(vars()['gt_all_clusters_'+clf_name])

    iFold += 1

print('---------------------Final----------------')

for clf_name in clf_names:
    vars()['gt_all_folds_'+clf_name]=np.concatenate(vars()['gt_all_folds_'+clf_name])
    vars()['pd_all_folds_'+clf_name]=np.concatenate(vars()['pd_all_folds_'+clf_name])
    
    result,_cm = calc_multi_cm(vars()['gt_all_folds_'+clf_name], vars()['pd_all_folds_'+clf_name])

    print('---------------------'+clf_name+'----------------')
    print(result['1']['f1'])
    print(result)
    print(_cm)

# normalizer=standard_normalizer(total_group.values[:,:-1])
# total_group.iloc[:,:-1]=normalizer(total_group.values[:,:-1])
# total_group['cluster_id']=0

------iFold:0---------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the document

---------------------svm----------------
0.0444444444444
[[156   1]
 [ 42   1]]
---------------------rf----------------
0.4
[[156   1]
 [ 32  11]]
---------------------lr----------------
0.517647058824
[[137  20]
 [ 21  22]]
------iFold:1---------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


---------------------svm----------------
0.0
[[129  28]
 [ 43   0]]
---------------------rf----------------
0.282608695652
[[121  36]
 [ 30  13]]
---------------------lr----------------
0.404494382022
[[129  28]
 [ 25  18]]
------iFold:2---------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


---------------------svm----------------
0.0
[[141  16]
 [ 43   0]]
---------------------rf----------------
0.386363636364
[[129  28]
 [ 26  17]]
---------------------lr----------------
0.506024096386
[[138  19]
 [ 22  21]]
------iFold:3---------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


---------------------svm----------------
0.0
[[138  19]
 [ 42   0]]
---------------------rf----------------
0.25
[[129  28]
 [ 32  10]]
---------------------lr----------------
0.368932038835
[[115  42]
 [ 23  19]]
------iFold:4---------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


---------------------svm----------------
0.0
[[157   0]
 [ 42   0]]
---------------------rf----------------
0.483870967742
[[152   5]
 [ 27  15]]
---------------------lr----------------
0.632911392405
[[145  12]
 [ 17  25]]
------iFold:5---------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


---------------------svm----------------
0.0
[[149   8]
 [ 42   0]]
---------------------rf----------------
0.233333333333
[[146  11]
 [ 35   7]]
---------------------lr----------------
0.395833333333
[[122  35]
 [ 23  19]]
------iFold:6---------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


---------------------svm----------------
0.0
[[147  10]
 [ 42   0]]
---------------------rf----------------
0.266666666667
[[134  23]
 [ 32  10]]
---------------------lr----------------
0.330827067669
[[88 69]
 [20 22]]
------iFold:7---------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


---------------------svm----------------
0.0
[[156   1]
 [ 42   0]]
---------------------rf----------------
0.3125
[[145  12]
 [ 32  10]]
---------------------lr----------------
0.188235294118
[[122  35]
 [ 34   8]]
------iFold:8---------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


---------------------svm----------------
0.0
[[156   0]
 [ 42   0]]
---------------------rf----------------
0.387096774194
[[123  33]
 [ 24  18]]
---------------------lr----------------
0.254777070064
[[61 95]
 [22 20]]
------iFold:9---------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


---------------------svm----------------
0.125
[[153   3]
 [ 39   3]]
---------------------rf----------------
0.428571428571
[[110  46]
 [ 18  24]]
---------------------lr----------------
0.403100775194
[[95 61]
 [16 26]]
---------------------Final----------------
---------------------svm----------------
0.0155945419103
{'0': {'recall': 0.94515306122448983, 'precision': 0.77958968963703312, 'f1': 0.85442490631305856}, '1': {'recall': 0.0094562647754137114, 'precision': 0.044444444444444446, 'f1': 0.015594541910331383}, 'average': {'accuracy': 0.74635861376192869, 'recall_all': 0.47730466299995178, 'precision_all': 0.41201706704073876, 'fscore_all': 0.43500972411169497}}
[[1482   86]
 [ 419    4]]
---------------------rf----------------
0.345710627401
{'0': {'recall': 0.85778061224489799, 'precision': 0.82363747703612977, 'f1': 0.84036238675413932}, '1': {'recall': 0.31914893617021278, 'precision': 0.37709497206703912, 'f1': 0.34571062740076824}, 'average': {'accuracy': 0.74334505273731