In [10]:
# import beyourself
import pandas as pd
import numpy as np
import os
from sklearn.manifold import TSNE
import time
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import confusion_matrix, mean_squared_error
from utils import balanced_subsample
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler


## train: DBSACN cluster，
## test: kNN classification

In [3]:
def standard_normalizer(x):
    x = x.T
    # compute the mean and standard deviation of the input
    x_means = np.mean(x,axis = 1)[:,np.newaxis]
    x_stds = np.std(x,axis = 1)[:,np.newaxis]   

    # create standard normalizer function based on input data statistics
    normalizer = lambda data: ((data.T - x_means)/x_stds).T
    
    # return normalizer and inverse_normalizer
    return normalizer

In [4]:
def calc_multi_cm(y_gt, y_pred):    
    # ct = pd.crosstab(y_gt, y_pred, rownames=['True'], colnames=['Predicted'], margins=True).apply(lambda r: r/r.sum(), axis=1)
    ct = pd.crosstab(y_gt, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)
    # print(ct)
    # ct.to_csv(cm_file)

    # Compute confusion matrix
    multi_cm = confusion_matrix(y_gt, y_pred)
    
    accuracy = sum(multi_cm[i,i] for i in range(len(set(y_gt))))/sum(sum(multi_cm[i] for i in range(len(set(y_gt)))))
    recall_all = sum(multi_cm[i,i]/sum(multi_cm[i,j] for j in range(len(set(y_gt)))) for i in range(len(set(y_gt))))/(len(set(y_gt)))
    precision_all = sum(multi_cm[i,i]/sum(multi_cm[j,i] for j in range(len(set(y_gt)))) for i in range(len(set(y_gt))))/(len(set(y_gt)))
    fscore_all = sum(2*(multi_cm[i,i]/sum(multi_cm[i,j] for j in range(len(set(y_gt)))))*(multi_cm[i,i]/sum(multi_cm[j,i] for j in range(len(set(y_gt)))))/(multi_cm[i,i]/sum(multi_cm[i,j] for j in range(len(set(y_gt))))+multi_cm[i,i]/sum(multi_cm[j,i] for j in range(len(set(y_gt))))) for i in range(len(set(y_gt))))/len(set(y_gt))
    
    result={}

    for i in np.unique(y_gt):

        i_gt = (y_gt==i).astype(int)
        i_pred = (y_pred==i).astype(int)

        cm = confusion_matrix(i_gt, i_pred)

        i_result = {}

        TP = cm[1,1]
        FP = cm[0,1]
        TN = cm[0,0]
        FN = cm[1,0]
        # Precision for Positive = TP/(TP + FP)
        prec_pos = TP/(TP + FP)
        # F1 score for positive = 2 * precision * recall / (precision + recall)….or it can be F1= 2*TP/(2*TP + FP+ FN)
        f1_pos = 2*TP/(TP*2 + FP+ FN)
        # TPR = TP/(TP+FN)
        TPR = cm[1,1]/sum(cm[1,j] for j in range(len(set(i_gt))))

        i_result = {'recall': TPR, 'precision': prec_pos, 'f1': f1_pos}

        result[str(int(i))] = i_result

    ave_result = {'accuracy':accuracy, 'recall_all':recall_all, 'precision_all':precision_all, 'fscore_all':fscore_all}
    result['average'] = ave_result

    return result, multi_cm


In [22]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold,KFold

#read data first
fd_folder='All/fd/fd_groundtruthsegmentation.csv'
dd_folder='All/dd/dd_groundtruthsegmentation.csv'
sd_folder='All/sd/sd_groundtruthsegmentation.csv'
cd_folder='All/cd/cd_groundtruthsegmentation.csv'
null_folder='All/null/null_groundtruthsegmentation.csv'
fd,dd,sd,cd=pd.read_csv(fd_folder),pd.read_csv(dd_folder),pd.read_csv(sd_folder),pd.read_csv(cd_folder)
null=pd.read_csv(null_folder)

#definle label
positive_group=pd.concat([fd,dd])
positive_group['label']=1

negative_group=pd.concat([null])
negative_group['label']=0
total_group=pd.concat([positive_group,negative_group])

#cluster number
cluster_number=4

clf_list=[svm.SVC(),RandomForestClassifier(n_estimators=185),LogisticRegression()]
clf_names=['svm','rf','lr']

skf = StratifiedKFold(n_splits=10)

for clf_name in clf_names:
    vars()['gt_all_folds_'+clf_name] = []
    vars()['pd_all_folds_'+clf_name] = []

iFold = 0
for XY_trn_idx, XY_test_idx in skf.split(total_group.values[:,:-1],
                                        total_group['label'].values):
    
    print('------iFold:'+str(iFold)+'---------------------')


    trn_group = total_group.iloc[XY_trn_idx]
    names = trn_group.columns
    test_group = total_group.iloc[XY_test_idx]
    X_train, X_test = trn_group.values[:,:-1], test_group.values[:,:-1]
    y_train, y_test = trn_group.values[:,-1], test_group.values[:,-1]

    # subsampling balancing will harm clustering, so cannot be used
#     X_train, y_train = balanced_subsample(X_train,y_train,subsample_size=1.0)

    X_train, y_train = SMOTE().fit_sample(X_train, y_train)
    trn_group = pd.DataFrame(data=np.hstack((X_train,y_train[:,np.newaxis])),columns=names)


    
    # normalization
    normalizer = standard_normalizer(X_train)
    X_train = normalizer(X_train)
    X_test = normalizer(X_test)
    
    #fit the data
    kmeans = MiniBatchKMeans(n_clusters=cluster_number).fit(X_train)

    # train
    # ........assign each data a label
    trn_group.loc[:,'cluster_id']=kmeans.predict(X_train)

    for i in range(cluster_number):
        vars()['XY_trn_cluster_'+str(i)] = trn_group[trn_group['cluster_id']==i]

        for (clf_type, clf_name) in zip(clf_list, clf_names):
            vars()['cluster'+str(i)+'_'+clf_name] = clf_type.fit(vars()['XY_trn_cluster_'+str(i)].values[:,:-2],
                                                                 vars()['XY_trn_cluster_'+str(i)]['label'].values)
    
    # test
    # ........assign each data a label
    test_group.loc[:,'cluster_id']=kmeans.predict(X_test)
    
    for clf_name in clf_names:
        vars()['gt_all_clusters_'+clf_name] = []
        vars()['pd_all_clusters_'+clf_name] = []
    
        for i in range(cluster_number):
            vars()['XY_test_cluster_'+str(i)] = test_group[test_group['cluster_id']==i]
            vars()['gt_all_clusters_'+clf_name].append(vars()['XY_test_cluster_'+str(i)]['label'].values)
            vars()['pd_all_clusters_'+clf_name].append(
                vars()['cluster'+str(i)+'_'+clf_name].predict(vars()['XY_test_cluster_'+str(i)].values[:,:-2]))
        
        vars()['gt_all_clusters_'+clf_name]=np.concatenate(vars()['gt_all_clusters_'+clf_name])
        vars()['pd_all_clusters_'+clf_name]=np.concatenate(vars()['pd_all_clusters_'+clf_name])

        result,_cm = calc_multi_cm(vars()['gt_all_clusters_'+clf_name], vars()['pd_all_clusters_'+clf_name])
        print('---------------------'+clf_name+'----------------')
        print(result['1']['f1'])
#         print(result)
        print(_cm)

        vars()['pd_all_folds_'+clf_name].append(vars()['pd_all_clusters_'+clf_name])
        vars()['gt_all_folds_'+clf_name].append(vars()['gt_all_clusters_'+clf_name])

    iFold += 1

print('---------------------Final----------------')

for clf_name in clf_names:
    vars()['gt_all_folds_'+clf_name]=np.concatenate(vars()['gt_all_folds_'+clf_name])
    vars()['pd_all_folds_'+clf_name]=np.concatenate(vars()['pd_all_folds_'+clf_name])
    
    result,_cm = calc_multi_cm(vars()['gt_all_folds_'+clf_name], vars()['pd_all_folds_'+clf_name])

    print('---------------------'+clf_name+'----------------')
    print(result['1']['f1'])
    print(result)
    print(_cm)


------iFold:0---------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


---------------------svm----------------
0.0444444444444
[[156   1]
 [ 42   1]]
---------------------rf----------------
0.166666666667
[[156   1]
 [ 39   4]]
---------------------lr----------------
0.721649484536
[[138  19]
 [  8  35]]
------iFold:1---------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


---------------------svm----------------
0.0
[[154   3]
 [ 43   0]]
---------------------rf----------------
0.514285714286
[[148   9]
 [ 25  18]]
---------------------lr----------------
0.197183098592
[[136  21]
 [ 36   7]]
------iFold:2---------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


---------------------svm----------------
0.43
[[ 43 114]
 [  0  43]]
---------------------rf----------------
0.324324324324
[[138  19]
 [ 31  12]]
---------------------lr----------------
0.521008403361
[[112  45]
 [ 12  31]]
------iFold:3---------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


---------------------svm----------------
0.351464435146
[[  2 155]
 [  0  42]]
---------------------rf----------------
0.34693877551
[[118  39]
 [ 25  17]]
---------------------lr----------------
0.436363636364
[[70 87]
 [ 6 36]]
------iFold:4---------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


---------------------svm----------------
0.0
[[157   0]
 [ 42   0]]
---------------------rf----------------
0.0
[[157   0]
 [ 42   0]]
---------------------lr----------------
0.148148148148
[[149   8]
 [ 38   4]]
------iFold:5---------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


---------------------svm----------------
0.357446808511
[[  6 151]
 [  0  42]]
---------------------rf----------------
0.434782608696
[[127  30]
 [ 22  20]]
---------------------lr----------------
0.397260273973
[[82 75]
 [13 29]]
------iFold:6---------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


---------------------svm----------------
0.0444444444444
[[155   2]
 [ 41   1]]
---------------------rf----------------
0.465753424658
[[143  14]
 [ 25  17]]
---------------------lr----------------
0.420454545455
[[60 97]
 [ 5 37]]
------iFold:7---------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


---------------------svm----------------
0.0
[[157   0]
 [ 42   0]]
---------------------rf----------------
0.0
[[157   0]
 [ 42   0]]
---------------------lr----------------
0.0416666666667
[[152   5]
 [ 41   1]]
------iFold:8---------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


---------------------svm----------------
0.0
[[156   0]
 [ 42   0]]
---------------------rf----------------
0.0
[[156   0]
 [ 42   0]]
---------------------lr----------------
0.127659574468
[[154   2]
 [ 39   3]]
------iFold:9---------------------


ValueError: The number of classes has to be greater than one; got 1