In [1]:
#importing modules
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
import random, arff
import os, sys, subprocess

In [2]:
file_name = os.listdir(".")
#number of labels
col = list(range(0, 54))

In [3]:
#importing the files
test = pd.read_csv('Enron_a_test1.arff',sep=',',header=None).iloc[:,col]
BR   = pd.read_csv('BR-Enron_a_test1.arff',sep=',',header=None).iloc[:,col]
CC   = pd.read_csv('CC-Enron_a_test1.arff',sep=',',header=None).iloc[:,col]
CDN  = pd.read_csv('CDN-Enron_a_test1.arff',sep=',',header=None).iloc[:,col]
CDT  = pd.read_csv('CDT-Enron_a_test1.arff',sep=',',header=None).iloc[:,col]
CT   = pd.read_csv('CT-Enron_a_test1.arff',sep=',',header=None).iloc[:,col]

In [4]:
if True:
    clf_name = (BR,CC,CDN,CDT,CT,test)
    for name in clf_name:
        name.columns = col

In [5]:
def pre_cal(y_true, y_pred, print_all = False):
    if(y_true.shape != y_pred.shape):
        print("Wrong y_preds matrics!")

    real_pos = real_neg = pred_pos = pred_neg  = true_pos = true_neg = []

    for i in range(y_true.shape[0]):
        # real values - RP and RN
        real_pos = np.asarray(np.append(real_pos,np.logical_and(y_true[i], y_true[i]).sum()), dtype=np.int64).reshape(-1,1)
        real_neg = np.asarray(np.append(real_neg,np.logical_and(np.logical_not(y_true[i]),np.logical_not(y_true[i])).sum()), dtype=np.int64).reshape(-1,1)

        # y_pred values - PP and PN
        pred_pos = np.asarray(np.append(pred_pos,np.logical_and(y_pred[i], y_pred[i]).sum()),dtype=np.int64).reshape(-1,1)
        pred_neg = np.asarray(np.append(pred_neg,np.logical_and(np.logical_not(y_pred[i]), np.logical_not(y_pred[i])).sum()),dtype=np.int64).reshape(-1,1)

        # true labels - TP and TN
        true_pos = np.asarray(np.append(true_pos,np.logical_and(y_true[i], y_pred[i]).sum()),dtype=np.int64).reshape(-1,1)
        true_neg = np.asarray(np.append(true_neg,np.logical_and(np.logical_not(y_true[i]), np.logical_not(y_pred[i])).sum()),dtype=np.int64).reshape(-1,1)

    if print_all:
		# if print_all = True - it prints RP, RN, PP, PN, TP and TN
        result = np.concatenate((real_pos, real_neg, pred_pos, pred_neg, true_pos, true_neg), axis=1)
        print(result)

    return(real_pos, real_neg, pred_pos, pred_neg, true_pos, true_neg)

#function to resolve divide by zero error and accept the value 0 when divided by 0
def divideZero( value_a, value_b):
    with np.errstate(divide='ignore', invalid='ignore'):
        result = np.true_divide( value_a, value_b )
        result[ ~ np.isfinite( result )] = 0
    return result

def accuracy(y_true, y_pred):
    #return the accuracy - example based
    real_pos, real_neg, pred_pos, pred_neg, true_pos, true_neg = pre_cal(y_true,y_pred)
    score = (true_pos + true_neg)/(pred_pos + pred_neg)
    score = np.mean(score)
    return score


def precision(y_true, y_pred):
    #return precision - example based
    real_pos, real_neg, pred_pos, pred_neg, true_pos, true_neg = pre_cal(y_true,y_pred)
    score = divideZero(true_pos, pred_pos)
    score = np.mean(score)
    return score

def recall(y_true, y_pred):
    #return precision - example based
    real_pos, real_neg, pred_pos, pred_neg, true_pos, true_neg = pre_cal(y_true,y_pred)
    score = divideZero(true_pos, real_pos)
    score = np.mean(score)
    return score


def fscore(y_true, y_pred,beta = 1):
	#return f(beta)score - example based : default beta value is 1
    prec, rec = precision(y_true, y_pred), recall(y_true, y_pred)
    beta_val = beta*beta
    score = ((1+beta_val)*(prec*rec))/(beta_val*(prec+rec))
    return score


def hamloss(y_true, y_pred):
	#return hamming loss - example based
    hamloss = list()
    for i in range(y_true.shape[0]):
        hamloss = np.asarray(np.append(hamloss,np.logical_xor(y_true[i], y_pred[i]).sum()), dtype=np.int64).reshape(-1,1)
    score = (hamloss.sum())/((y_true.shape[0])*(y_true.shape[1]))
    return score


def subset(y_true, y_pred):
	#return subset accuracy - example based
    subset_matrix = list()
    for i in range(y_true.shape[0]):
        subset_matrix = np.asarray(np.append(subset_matrix, np.array_equal(y_true[i],y_pred[i])), dtype=np.int64).reshape(-1,1)
    score = (subset_matrix.sum())/((y_true.shape[0])*(y_true.shape[1]))
    return score

def zeroloss(y_true, y_pred):
    #return new array with removed element having all zero in y_true
    condition = list()
    index = list()
    for i in range(y_true.shape[0]):
        new_true = new_pred = list()
        condition = np.logical_and(y_true[i],y_true[i]).sum()
        if (condition==0):
            index = np.asarray(np.append(index,i), dtype = np.int64)

        new_true = np.delete(y_true,index, axis = 0)
        new_pred = np.delete(y_pred,index, axis = 0)
    return new_true, new_pred

def microprecision(y_true, y_pred):
    #return micro-precision
    real_pos, real_neg, pred_pos, pred_neg, true_pos, true_neg = pre_cal(y_true,y_pred)
    score = true_pos.sum()/pred_pos.sum()
    return score

def microrecall(y_true, y_pred):
    #return micro-recall
    real_pos, real_neg, pred_pos, pred_neg, true_pos, true_neg = pre_cal(y_true,y_pred)
    score = true_pos.sum()/real_pos.sum()
    return score

def microfscore(y_true, y_pred,beta = 1):
    #return micro-fscore
    prec, rec = microprecision(y_true, y_pred), microrecall(y_true, y_pred)
    beta_val = beta*beta
    score = ((1+beta_val)*(prec*rec))/(beta_val*(prec+rec))
    return score

def macroprecision(y_true, y_pred):
    #return macro-precision
    real_pos, real_neg, pred_pos, pred_neg, true_pos, true_neg = pre_cal(y_true,y_pred)
    score = divideZero(true_pos, pred_pos)
    return score

def macrorecall(y_true, y_pred):
    #return macro-recall
    real_pos, real_neg, pred_pos, pred_neg, true_pos, true_neg = pre_cal(y_true,y_pred)
    score = divideZero(true_pos, real_pos)
    return score

def macrofscore(y_true, y_pred,beta = 1):
    #return macro-fscore
    prec, rec = macroprecision(y_true, y_pred), macrorecall(y_true, y_pred)
    beta_val = beta*beta
    score = divideZero(((1+beta_val)*(prec*rec)),(beta_val*(prec+rec)))
    score = np.mean(score)
    return score

In [6]:
def calculate_all(np_test, np_pred, output):
    value = list()
    value.append(accuracy(np_test,np_pred))
    value.append(precision(np_test,np_pred))
    value.append(recall(np_test,np_pred))
    value.append(fscore(np_test,np_pred))
    value.append(hamloss(np_test,np_pred))
    value.append(subset(np_test,np_pred))
    value.append(microfscore(np_test,np_pred))
    value.append(macrofscore(np_test,np_pred))
    output.append(value)
    if False:
        print("Accuracy : {0:.4f}".format(value[2]))
        print("Precision: {0:.4f}".format(value[3]))
        print("Recall   : {0:.4f}".format(value[4]))
        print("F1-Score : {0:.4f}".format(value[5]))
        print("HammingL : {0:.4f}".format(value[6]))
        print("Subset   : {0:.4f}".format(value[7]))
        print("Micro - F1-Score : {0:.4f}".format(value[8]))
        print("Macro - F1-Score : {0:.4f}".format(value[9]))
        print("----------------------------------")
    del value
    return(output)

In [7]:
label_name = list()
for x in range(len(col)):
    label_name.append(pd.DataFrame(pd.concat([BR[x],CC[x],CDN[x],CDT[x],CT[x]], axis=1)))

In [8]:
test_name = list()
for x in range(len(col)):
    test_name.append(test[x])

In [9]:
kf = KFold(n_splits=5, random_state= 42, shuffle= True) 

def implement_kfold(label_,test_):
    output = list()
    clf = LogisticRegression(C=1e5)
    X = label_.values
    y = test_.values
    y.resize(len(y))
    kf.get_n_splits(X)
    for train_index, test_index in kf.split(X):
        #test-train split for K- Fold 
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
   
        #train the clasifier
        clf.fit(X_train,y_train)
        #predict the result
        y_pred = clf.predict(X_test)
        if False:
            print("-------------------------------")
            print("X_train: {}, y_train: {}".format(len(X_train),len(y_train)))
            print("X_test : {}, y_test : {}".format(len(X_test),len(y_test)))
            print("y_pred: {}".format(len(y_pred)))
            print("-------------------------------")
        y_test.resize(len(y_test),1)
        y_pred.resize(len(y_pred),1)
        #get the score
        calculate_all(y_test,y_pred,output)
    del clf
    data = pd.DataFrame(output)
    return(data.mean())

In [10]:
result = list()
for x in range(len(col)):
    list_ = [2,8,36,45,47,51,52]
    if x in list_:
        result.append([0,0,0,0,0,0,0,0])
        continue
    result.append(implement_kfold(label_name[x],pd.DataFrame(test_name[x])))



In [11]:
column = ['accuracy','precision','recall','f-score','hamloss','subset','microf','macrof']
df = pd.DataFrame(np.asarray(result), columns= column)
df.head(53)

Unnamed: 0,accuracy,precision,recall,f-score,hamloss,subset,microf,macrof
0,0.99469,0.0,0.0,,0.00531,0.99469,,0.0
1,0.970067,0.0,0.0,,0.029933,0.970067,,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.982379,0.0,0.0,,0.017621,0.982379,,0.0
4,0.943642,0.003524,0.003524,0.008811,0.056358,0.943642,0.253968,0.003524
5,0.947213,0.014097,0.014097,0.017621,0.052787,0.947213,0.404167,0.014097
6,0.765704,0.441764,0.441764,0.441764,0.234296,0.765704,0.789109,0.441764
7,0.977053,0.0,0.0,,0.022947,0.977053,,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.985903,0.0,0.0,,0.014097,0.985903,,0.0


In [12]:
df.to_csv('Stacking.csv')

In [21]:
result = pd.DataFrame(df.mean())
result.to_csv('Result.csv')
print(result)

                  0
accuracy   0.823215
precision  0.027187
recall     0.027187
f-score    0.063923
hamloss    0.047155
subset     0.823215
microf     0.343171
macrof     0.027187
