# 3. Classifier Performance on Gold-Standard

Daniel Ruiz, MSc in Data Science and Business Analytics (DSBA), Bocconi University

Reference codes (alphabetically):

## 3.1. Loading packages and creating folder

In [6]:
# general
import csv
import pandas as pd
import pickle as pkl
import time

# classification
from BOW import *
from sklearn.metrics import confusion_matrix

## 3.2. Gold-standard creator

In [4]:
#--------------------------------------------------------------
def NoisyLabelExtractor(company):
    """Append one company's noisy-label rows to its language-level pickle.

    Reads ``Dataset_Twitter_Clean_03/<company[0]>.pkl``, keeps only the
    ``snowball_stems``, ``has_happy`` and ``has_sad`` columns, and appends
    those rows to ``Dataset_NoisyLabels/<prefix>_NoisyLabels.pkl``, where
    ``<prefix>`` is the first two characters of ``company[0]``. The
    accumulator pickle is read, extended and written back in place, so it
    must already exist before this function is called.

    Parameters
    ----------
    company : sequence
        Company descriptor; only ``company[0]`` (the dataset file stem,
        e.g. ``'us_tesla'``) is used here.

    Returns
    -------
    None
        The result is persisted to disk; the company name and the elapsed
        time are printed.
    """
    # timing (start)
    start_time = time.time()

    label_columns = ['snowball_stems','has_happy','has_sad']

    # open and read file (company)
    filename_sup = 'Dataset_Twitter_Clean_03/'+company[0]+'.pkl'
    df_sup = pd.read_pickle(filename_sup)
    print(company[0])

    df_sup = df_sup[label_columns]

    # open and read file (language-level accumulator)
    filename = 'Dataset_NoisyLabels/'+company[0][:2]+'_NoisyLabels.pkl'
    df = pd.read_pickle(filename)

    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    # Empty frames are left out so the (initially empty) accumulator neither
    # triggers pandas' empty-entry concat deprecation nor forces the label
    # columns to object dtype.
    frames = [frame for frame in (df, df_sup) if not frame.empty]
    if frames:
        df = pd.concat(frames, ignore_index=True)

    # save
    df.to_pickle(filename)

    # timing (end)
    t_sec = round(time.time() - start_time)
    (t_min, t_sec) = divmod(t_sec,60)
    (t_hour,t_min) = divmod(t_min,60)
    print('Time passed: {}hour:{}min:{}sec'.format(t_hour,t_min,t_sec))
    print('------------------')

In [None]:
#--------------------------------------------------------------
# Each entry: [dataset file stem, Twitter handle, hashtag].
# The two-letter prefix of the file stem ('br' / 'us') identifies the language group.
my_companies = [
    ['br_embraer', '@EMBRAER', '#EMBRAER'],
    ['br_americanas', '@LOJASAMERICANAS', '#LOJASAMERICANAS'],
    ['br_pontofrio', '@PONTOFRIO', '#PONTOFRIO'],
    ['br_petrobras', '@PETROBRAS', '#PETROBRAS'],
    ['br_bradesco', '@BRADESCO', '#BRADESCO'],
    ['br_itau', '@ITAU', '#ITAU'],
    ['br_renner', '@LOJAS_RENNER', '#RENNER'],
    ['br_gol', '@VOEGOLOFICIAL', '#VOEGOL'],
    ['br_magazineluiza', '@MAGAZINELUIZA', '#MAGALU'],
    ['br_valor', '@VALORECONOMICO', '#VALORECONOMICO'],
    ['us_abercrombie', '@ABERCROMBIE', '#ABERCROMBIE'],
    ['us_boeing', '@BOEING', '#BOEING'],
    ['us_beyondmeat', '@BEYONDMEAT', '#BEYONDMEAT'],
    ['us_morganstanley', '@MORGANSTANLEY', '#MORGANSTANLEY'],
    ['us_jpmorgan', '@JPMORGAN', '#JPMORGAN'],
    ['us_exxonmobil', '@EXXONMOBIL', '#EXXON'],
    ['us_americanair', '@AMERICANAIR', '#AMERICANAIRLINES'],
    ['us_cocacola', '@COCACOLA', '#COCACOLA'],
    ['us_tesla', '@TESLA', '#TESLA'],
    ['us_wsj', '@WSJ', '#WSJ'],
]

# Reset both accumulator pickles to an empty frame with the expected columns
# (English first, then Portuguese) so that re-running this cell starts clean.
empty_labels = pd.DataFrame(columns=['snowball_stems', 'has_happy', 'has_sad'])
for accumulator_path in ('Dataset_NoisyLabels/us_NoisyLabels.pkl',
                         'Dataset_NoisyLabels/br_NoisyLabels.pkl'):
    empty_labels.to_pickle(accumulator_path)

# Fill the accumulators one language at a time: Portuguese first, then English.
for banner, prefix in (('Getting Noisy Labels - PT', 'br'),
                       ('Getting Noisy Labels - EN', 'us')):
    print(banner)
    for company in my_companies:
        if company[0].startswith(prefix):
            NoisyLabelExtractor(company)

## 3.3. Performance Evaluation

In [None]:
# Each entry pairs a folder of pickled classifiers with the noisy-label
# dataset (same language) on which they are evaluated.
files = [['Classifiers/English/','Dataset_NoisyLabels/us_NoisyLabels.pkl'],
         ['Classifiers/Portuguese/','Dataset_NoisyLabels/br_NoisyLabels.pkl']]

# Fixed seed so the down-sampling used to balance the classes is reproducible.
BALANCE_SEED = 42

# (display name, pickle file stem) for every evaluated classifier, in report order.
# The Gaussian-NB and Nu-SVC variants are not evaluated.
CLASSIFIER_SPECS = [('Logistic - All words','skc_logistic_all'),
                    ('NB-Multinomial - All words','skc_nb_mult_all'),
                    ('NB-Bernoulli - All words','skc_nb_bernoulli_all'),
                    ('SVC-Linear - All words','skc_svc_linear_all'),
                    ('Logistic - Info words','skc_logistic_inf'),
                    ('NB-Multinomial - Info words','skc_nb_mult_inf'),
                    ('NB-Bernoulli - Info words','skc_nb_bernoulli_inf'),
                    ('SVC-Linear - Info words','skc_svc_linear_inf'),
                    ('Max-Vote','mv_classifier')]


def _load_pickle(path):
    """Unpickle the object stored at ``path``, closing the file afterwards.

    NOTE(review): pickle.load can execute arbitrary code; only use this on
    classifier files produced by this project.
    """
    with open(path, 'rb') as handle:
        return pkl.load(handle)


def _binary_metrics(TN, FP, FN, TP):
    """Return the per-class and support-weighted metrics for a 2x2 confusion matrix.

    The returned list is ordered as
    [TN, FP, FN, TP, prec1, reca1, fone1, prec0, reca0, fone0,
     precw, recaw, fonew, accuw].
    Zero denominators are not special-cased: with the numpy integers that
    come out of the confusion matrix the affected metrics evaluate to nan.
    """
    total = TN+FP+FN+TP

    # class 1
    prec1 = TP / (TP+FP)
    reca1 = TP / (TP+FN)
    fone1 = 2*(prec1*reca1)/(prec1+reca1)
    # class 0
    prec0 = TN / (TN+FN)
    reca0 = TN / (TN+FP)
    fone0 = 2*(prec0*reca0)/(prec0+reca0)

    # global / weighted by class support (TN+FP true negatives, TP+FN true positives)
    accuw = TP/total + TN/total
    precw = prec0*(TN+FP)/(total) + prec1*(TP+FN)/(total)
    recaw = reca0*(TN+FP)/(total) + reca1*(TP+FN)/(total)
    fonew = fone0*(TN+FP)/(total) + fone1*(TP+FN)/(total)

    return [TN, FP, FN, TP, prec1, reca1, fone1, prec0, reca0, fone0, precw, recaw, fonew, accuw]


for file in files:

    # name
    folder, filename = file

    # load data
    df = pd.read_pickle(filename)
    df = df[['snowball_stems','has_happy','has_sad']]

    # Keep only tweets carrying exactly one of the two noisy labels:
    # counts are clipped to 0/1 flags, then rows where both flags agree
    # (neither or both) are dropped.
    print(df.shape)
    df = df.assign(has_happy=df.has_happy.clip(0,1),
                   has_sad=df.has_sad.clip(0,1))
    df = df[df.has_happy != df.has_sad]
    print(df.shape)

    # filter columns: after the filter above has_happy alone encodes the class
    df = df[['snowball_stems','has_happy']].rename(columns={'has_happy':'sentiment_pos'})

    # balanced ?
    n_pos = int((df.sentiment_pos==1).sum())
    n_neg = int((df.sentiment_pos==0).sum())
    print('balanced?')
    print('Sentiments:',df.sentiment_pos.unique())
    print('Positive (1):',n_pos)
    print('Negative (0):',n_neg)
    if n_pos != n_neg:
        # down-sample the majority class to the size of the minority class
        n_per_class = min(n_pos,n_neg)
        df_pos = df[df.sentiment_pos==1].sample(n_per_class, random_state=BALANCE_SEED)
        df_neg = df[df.sentiment_pos==0].sample(n_per_class, random_state=BALANCE_SEED)
        df = pd.concat([df_pos,df_neg]).reset_index(drop=True)
        n_pos = int((df.sentiment_pos==1).sum())
        n_neg = int((df.sentiment_pos==0).sum())
        print('balanced')
        print('Positive (1):',n_pos)
        print('Negative (0):',n_neg)

    # bag_of_words; split=0.0 is used so that the data ends up in the test
    # portion (the training portion is unused here)
    lfeats = label_feats_from_corpus(df.snowball_stems,df.sentiment_pos,bag_of_words)
    train_feats, test_feats = split_label_feats(lfeats, split=0.0)
    print('Test set size:',len(test_feats))
    X_test = [test_feat[0] for test_feat in test_feats]
    y_test = [test_feat[1] for test_feat in test_feats]

    # Load classifiers (all-words, info-words and max-vote variants)
    est_classifiers = [[label, _load_pickle(folder+stem+'.sav')]
                       for label, stem in CLASSIFIER_SPECS]

    # newline='' is required by the csv module to avoid blank rows on Windows
    report_path = folder+'performance_NL_'+time.strftime('%Y-%m-%d_%H-%M',time.gmtime())+'.csv'
    with open(report_path, 'w', encoding="utf-8", newline='') as csvFile:

        csvWriter = csv.writer(csvFile)
        # writerow expects a sequence of fields; passing a bare string would
        # write one character per column, so wrap each summary line in a list
        csvWriter.writerow(['Positive (1): '+str(n_pos)])
        csvWriter.writerow(['Negative (0): '+str(n_neg)])

        for est_classifier in est_classifiers:

            print(est_classifier[0])

            # calculate
            predicted_classes = est_classifier[1].classify_many(X_test)

            # confusion matrix, flattened row-major into TN, FP, FN, TP
            cm = confusion_matrix(y_test,predicted_classes)
            TN, FP, FN, TP = cm.ravel()

            # write in csv: classifier name followed by its metrics
            csvWriter.writerow([est_classifier[0]] + _binary_metrics(TN, FP, FN, TP))