# 3.1. Building the Classifiers (_EN = Base File)
 
Daniel Ruiz, MSc in Data Science and Business Analytics (DSBA), Bocconi University
 
Reference codes (alphabetically):
- Perkins, Jacob. Python 3 Text Processing with NLTK 3 Cookbook. Packt, 2014.
- https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-list-of-lists
- https://streamhacker.com/2010/06/16/text-classification-sentiment-analysis-eliminate-low-information-features/
 
## Loading packages and hard data
 
The major steps of Text Classification with Scikit-Learn are:
     1. Creating training features (covered in the previous recipes).
     2. Choosing and importing an sklearn algorithm.
     3. Constructing an SklearnClassifier instance with the chosen algorithm.
     4. Training the SklearnClassifier instance with your training features.

In [1]:
# general
import csv
import pandas as pd
import pickle as pkl
import time

# classification
from BOW import *
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.svm import LinearSVC #, NuSVC #, SVC

In [2]:
# choosing the language
# `folder` is the output prefix used later for the saved classifiers and the
# performance report; `filename` is the pickled DataFrame of tweets.
folder='Classifiers/English/'
filename='Dataset_Twitter_Clean_03/us_EnglishTweets.pkl'

# open and read file
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# come from a trusted source.
df = pd.read_pickle(filename)

# overall description of the dataset
print('Shape:',df.shape)
print('Columns:',df.columns)

# Count each class once with a vectorized reduction. The builtin sum() walks
# the boolean Series element by element in Python, and the original repeated
# that for every print/comparison below.
n_pos = int((df.sentiment_pos == 1).sum())
n_neg = int((df.sentiment_pos == 0).sum())

# balanced ?
print('balanced?')
print('Sentiments:',df.sentiment_pos.unique())
print('Positive (1):',n_pos)
print('Negative (0):',n_neg)

print()
if n_pos != n_neg:
    # Downsample both classes to the size of the smaller one, then stack
    # positives on top of negatives with a fresh 0..n-1 index.
    # NOTE(review): sample() is unseeded, so the selected rows differ between
    # runs -- pass random_state=... if reproducibility is required.
    sup = min(n_pos, n_neg)
    df_pos = df[df.sentiment_pos==1].sample(sup)
    df_neg = df[df.sentiment_pos==0].sample(sup)
    df = pd.concat([df_pos,df_neg]).reset_index(drop=True)
    print('balanced')
    # recount on the rebuilt frame so the printout reflects the actual data
    print('Positive (1):',int((df.sentiment_pos == 1).sum()))
    print('Negative (0):',int((df.sentiment_pos == 0).sum()))

FileNotFoundError: [Errno 2] No such file or directory: 'Dataset_Twitter_Clean_03/us_EnglishTweets.pkl'

## Building bags-of-words (all words vs most informative words)

- lfeats is a dict of lists of dicts
- lfeats = collections.defaultdict
- lfeats['neg'] = list
- lfeats['neg'][0] = dict: String: True

In [None]:
# Input tokens for every classifier: the `snowball_stems` column
# (presumably one list of stemmed tokens per tweet -- confirm upstream).
features = df.snowball_stems

# =================================================================
# Dataset A -- every word is kept as a feature

print('---------------------')
print('All words:')

# label -> list of feature dicts, built with the plain bag_of_words detector
lfeats_all = label_feats_from_corpus(features, df.sentiment_pos, bag_of_words)

# split=0.995: presumably the fraction of each label kept for training
train_feats_all, test_feats_all = split_label_feats(lfeats_all, split=0.995)

print('Training set size:', len(train_feats_all))
print('Test set size:', len(test_feats_all))
print('---------------------')

# =================================================================
# Dataset B -- only the high-information words are kept

# minimum score a word needs to count as "high information"
cutoff = 1.5

print('---------------------')
print('Informative words only:')

# words grouped by sentiment label, as expected by high_information_words()
labeled_words = separate_words(features, df.sentiment_pos)

# stored as a set so the filter below can do fast membership tests
high_info_words = set(
    high_information_words(labeled_words, min_score=cutoff)
)

def bag_of_words_in_set(words, goodwords=high_info_words):
    """Build bag-of-words features from the words that are also in `goodwords`.

    Args:
        words: iterable of (hashable) tokens for one document.
        goodwords: collection of tokens to keep. Defaults to the module-level
            `high_info_words` as it was bound when this function was defined.

    Returns:
        Whatever `bag_of_words` returns for the intersection of `words` and
        `goodwords`.
    """
    # The default is already a set; copying the whole vocabulary on every call
    # (once per tweet) is wasted work, so only convert non-set arguments.
    if not isinstance(goodwords, (set, frozenset)):
        goodwords = set(goodwords)
    return bag_of_words(set(words) & goodwords)

# Same corpus as the all-words dataset, but each document's features are
# filtered through bag_of_words_in_set before being stored.
lfeats_inf = label_feats_from_corpus(
    features,
    df.sentiment_pos,
    feature_detector=bag_of_words_in_set,
)

# same split ratio as the all-words dataset
train_feats_inf, test_feats_inf = split_label_feats(lfeats_inf, split=0.995)

print('Training set size:', len(train_feats_inf))
print('Test set size:', len(test_feats_inf))
print('---------------------')
# vocabulary size before filtering: distinct words across the two labels
print('Words (all):', len(set(labeled_words[0][1]).union(labeled_words[1][1])))
print('Words (high info):', len(high_info_words))
print('---------------------')


# ### optimal cutoff = 1.5
# 
# print('Optimizing the number of informative words')
# 
# labeled_words = separate_words(features,
#                                df.sentiment_pos)
# 
# acc = []
# 
# 
# for i in range(12,31):
# 
#     high_info_words = set(high_information_words(labeled_words,min_score=i/10))
# 
#     def bag_of_words_in_set(words, goodwords=high_info_words):
#         return bag_of_words(set(words) & set(goodwords))
# 
#     lfeats_inf = label_feats_from_corpus(features,
#                                          df.sentiment_pos,
#                                          feature_detector=bag_of_words_in_set)
# 
#     train_feats_inf, test_feats_inf = split_label_feats(lfeats_inf, split=0.99)
#         
#     skc_logistic_inf = SklearnClassifier(LogisticRegression(solver='lbfgs',max_iter=2000))
#     skc_logistic_inf.train(train_feats_inf)
#     
#     acc.append(accuracy(skc_logistic_inf, test_feats_inf))
#     
#     print(i/10,accuracy(skc_logistic_inf, test_feats_inf),len(high_info_words))
# 
# cutoff = [i for i in range(12,31)][acc.index(max(acc))]/10
# 
# print('optimal cutoff:',cutoff)

## Running classifiers
- We do not train the traditional SVC or the Decision Tree classifiers because, in the book examples, they performed substantially worse and took longer to converge. Instead of the traditional SVC, we train the linear SVC; the NuSVC, which also obtained superior performance in the 'cookbook' text classification tasks, is kept in the code below only as a commented-out alternative.
 
### With all words (i.e. informative and non-informative)
- The SklearnClassifier converts NLTK feature dictionaries into sklearn-compatible feature vectors.

In [None]:
# Every model below is an sklearn estimator wrapped in NLTK's
# SklearnClassifier and fitted on the all-words training set. train() hands
# back the classifier it was called on, so construction and fitting are chained.

# Logistic regression (lbfgs solver, at most 2000 iterations)
skc_logistic_all = SklearnClassifier(
    LogisticRegression(solver='lbfgs', max_iter=2000)
).train(train_feats_all)
print('Logistic: done!')

# Logistic regression with built-in cross-validation (disabled)
#skc_logistic_cv_all = SklearnClassifier(LogisticRegressionCV(solver='lbfgs',max_iter=2000))
#skc_logistic_cv_all.train(train_feats_all)
#print('Logistic-CV: done!')

# Multinomial Naive Bayes (occurrence-count model)
skc_nb_mult_all = SklearnClassifier(MultinomialNB()).train(train_feats_all)
print('NB-Mult: done!')

# Bernoulli Naive Bayes (binary present/absent model)
skc_nb_bernoulli_all = SklearnClassifier(BernoulliNB()).train(train_feats_all)
print('NB-Bernoulli: done!')

# Gaussian Naive Bayes (disabled)
#skc_nb_gaussian_all= SklearnClassifier(GaussianNB())
#skc_nb_gaussian_all.train(train_feats_all)
#print('NB-Gaussian: done!')

# Linear-kernel support vector classifier (at most 2000 iterations)
skc_svc_linear_all = SklearnClassifier(
    LinearSVC(max_iter=2000)
).train(train_feats_all)
print('Linear-SVC: done!')

# NuSVC: support vector classifier with a bounded number of support vectors (disabled)
#skc_svc_nu_all = SklearnClassifier(NuSVC())
#skc_svc_nu_all.train(train_feats_all)
#print('done!')

### With informative words only
- If there are some words that are biased towards the 'pos' label, but still occur every now and then at the 'neg' label, this could cause some confusion in your algorithm. To correct this behavior, we use only the most informative words.
- "Its accuracy before was 86.4%, so we actually got a very slight decrease. In general, support vector machine and logistic regression-based algorithms will benefit less, or perhaps even be harmed, by pre-filtering the training features. This is because these algorithms are able to learn feature weights that correspond to the significance of each feature, whereas Naive Bayes algorithms do not."
- "A high information word is a word that is strongly biased towards a single classification label. The low information words are words that are common to all labels. It may be counter-intuitive, but eliminating these words from the training data can actually improve accuracy, precision, and recall. The reason this works is that using only high information words reduces the noise and confusion of a classifier's internal model. If all the words/features are highly biased one way or the other, it's much easier for the classifier to make a correct guess."
- The labeled test features = gold standard
- "The default score_fn is nltk.metrics.BigramAssocMeasures.chi_sq(), [which is computed from the following counts:] n_ii: the frequency of the word for the label; n_ix: the total frequency of the word across all labels; n_xi: the total frequency of all words that occurred for the label; n_xx: the total frequency for all words in all labels."


In [None]:
# Same line-up of models as for the all-words dataset, this time fitted on the
# training set restricted to high-information words. train() hands back the
# classifier it was called on, so construction and fitting are chained.

# Logistic regression (lbfgs solver, at most 2000 iterations)
skc_logistic_inf = SklearnClassifier(
    LogisticRegression(solver='lbfgs', max_iter=2000)
).train(train_feats_inf)
print('Logistic: done!')

# Logistic regression with built-in cross-validation (disabled)
#skc_logistic_cv_inf = SklearnClassifier(LogisticRegressionCV(solver='lbfgs',max_iter=2000))
#skc_logistic_cv_inf.train(train_feats_inf)
#print('Logistic-CV: done!')

# Multinomial Naive Bayes (occurrence-count model)
skc_nb_mult_inf = SklearnClassifier(MultinomialNB()).train(train_feats_inf)
print('NB-Mult: done!')

# Bernoulli Naive Bayes (binary present/absent model)
skc_nb_bernoulli_inf = SklearnClassifier(BernoulliNB()).train(train_feats_inf)
print('NB-Bernoulli: done!')

# Gaussian Naive Bayes (disabled)
#skc_nb_gaussian_inf= SklearnClassifier(GaussianNB())
#skc_nb_gaussian_inf.train(train_feats_inf)
#print('NB-Gaussian: done!')

# Linear-kernel support vector classifier (at most 2000 iterations)
skc_svc_linear_inf = SklearnClassifier(
    LinearSVC(max_iter=2000)
).train(train_feats_inf)
print('Linear-SVC: done!')

# NuSVC: support vector classifier with a bounded number of support vectors (disabled)
#skc_svc_nu_inf = SklearnClassifier(NuSVC(max_iter=2000))
#skc_svc_nu_inf.train(train_feats_inf)
#print('done!')

## Combining Classifiers with Voting

In [None]:
# Ensemble members: the four all-words models followed by their four
# informative-words counterparts, in the order they are handed to the voter.
# NOTE(review): an even number of voters can tie -- confirm how
# MaxVoteClassifier resolves a 4-4 split.
mv_members = (
    skc_nb_mult_all,
    skc_nb_bernoulli_all,
    skc_logistic_all,
    skc_svc_linear_all,
    skc_nb_mult_inf,
    skc_nb_bernoulli_inf,
    skc_logistic_inf,
    skc_svc_linear_inf,
)
mv_classifier = MaxVoteClassifier(*mv_members)

## Computing performances

In [None]:
# =================================================================
# Evaluation table: one [report label, trained classifier, labelled test set]
# entry per model. Each model is paired with the test split that matches the
# features it was trained on; the voting ensemble is scored on the all-words
# test split.
est_classifiers = [
    ['Logistic - All words', skc_logistic_all, test_feats_all],
    ['NB-Multinomial - All words', skc_nb_mult_all, test_feats_all],
    ['NB-Bernoulli - All words', skc_nb_bernoulli_all, test_feats_all],
    ['SVC-Linear - All words', skc_svc_linear_all, test_feats_all],
    ['Logistic - Info words', skc_logistic_inf, test_feats_inf],
    ['NB-Multinomial - Info words', skc_nb_mult_inf, test_feats_inf],
    ['NB-Bernoulli - Info words', skc_nb_bernoulli_inf, test_feats_inf],
    ['SVC-Linear - Info words', skc_svc_linear_inf, test_feats_inf],
    ['Max-Vote', mv_classifier, test_feats_all],
]

# -----------------------------------------------------------------
# save
#
# Each trained classifier is pickled to `folder` as '<variable name>.sav'.
# The files are written in the order listed. Entries for models that are not
# trained in this notebook are kept as comments.
classifiers_to_save = [
    # all words
    ('skc_logistic_all', skc_logistic_all),
    # ('skc_logistic_cv_all', skc_logistic_cv_all),
    ('skc_nb_mult_all', skc_nb_mult_all),
    ('skc_nb_bernoulli_all', skc_nb_bernoulli_all),
    # ('skc_nb_gaussian_all', skc_nb_gaussian_all),
    ('skc_svc_linear_all', skc_svc_linear_all),
    # ('skc_svc_nu_all', skc_svc_nu_all),
    # informative words only
    ('skc_logistic_inf', skc_logistic_inf),
    # ('skc_logistic_cv_inf', skc_logistic_cv_inf),
    ('skc_nb_mult_inf', skc_nb_mult_inf),
    ('skc_nb_bernoulli_inf', skc_nb_bernoulli_inf),
    # ('skc_nb_gaussian_inf', skc_nb_gaussian_inf),
    ('skc_svc_linear_inf', skc_svc_linear_inf),
    # ('skc_svc_nu_inf', skc_svc_nu_inf),
    # voting ensemble
    ('mv_classifier', mv_classifier),
]

for sav_name, sav_obj in classifiers_to_save:
    # The context manager closes (and flushes) the file even if pickling
    # fails; the original passed a bare open() handle that was never closed.
    with open(folder + sav_name + '.sav', 'wb') as sav_file:
        pkl.dump(sav_obj, sav_file)

# -----------------------------------------------------------------
# write down performance
#
# Writes one CSV row per entry of `est_classifiers` (no header row) to a
# UTC-timestamped file under `folder`. Column order:
#   name, TN, FP, FN, TP,
#   precision/recall/F1 for class 1, precision/recall/F1 for class 0,
#   support-weighted precision/recall/F1, accuracy

# newline='' is required by the csv module; without it extra blank lines
# appear between rows on platforms that translate '\n' to '\r\n'.
with open(folder+'performance_in_similar_'+time.strftime('%Y-%m-%d_%H-%M',time.gmtime())+'.csv', 'w', encoding="utf-8", newline='') as csvFile:

    csvWriter = csv.writer(csvFile)

    for est_classifier in est_classifiers:

        # each test item is a (featureset, label) pair
        X_test = [f[0] for f in est_classifier[2]]
        y_test = [f[1] for f in est_classifier[2]]

        predicted_classes=est_classifier[1].classify_many(X_test)

        # confusion matrix
        # labels=[0, 1] pins the matrix to 2x2 in (negative, positive) order,
        # so the four-way unpack below cannot fail when only one class shows
        # up in the (small) test split.
        cm = confusion_matrix(y_test,predicted_classes,labels=[0, 1])
        TN, FP, FN, TP = cm.flatten()
        total = TN+FP+FN+TP

        # NOTE: the counts are NumPy integers, so a zero denominator below
        # yields nan (with a RuntimeWarning) instead of raising.

        # class 1
        prec1 = TP / (TP+FP)
        reca1 = TP / (TP+FN)
        fone1 = 2*(prec1*reca1)/(prec1+reca1)
        # class 0
        prec0 = TN / (TN+FN)
        reca0 = TN / (TN+FP)
        fone0 = 2*(prec0*reca0)/(prec0+reca0)

        # global / weighted: each class weighted by its share of true samples
        accuw = TP/total +TN/total
        precw = prec0*(TN+FP)/(total) + prec1*(TP+FN)/(total)
        recaw = reca0*(TN+FP)/(total) + reca1*(TP+FN)/(total)
        fonew = fone0*(TN+FP)/(total) + fone1*(TP+FN)/(total)

        # list
        sup = [est_classifier[0], TN, FP, FN, TP, prec1, reca1, fone1, prec0, reca0, fone0, precw, recaw, fonew, accuw]

        # write in csv
        csvWriter.writerow(sup)

## Basic Model Visualization
### Classifying a few tweets


In [None]:
# =================================================================
# Spot-check one classifier on two single-word inputs.
classifier = skc_logistic_inf

# NOTE(review): both sample words look Portuguese, while this notebook loads
# the English dataset -- confirm they are the intended examples.
for word in ('valoriza', 'chateada'):
    # hard label for a document made of just this word
    print(word, '->', classifier.classify(bag_of_words([word])))
    # full label distribution for the same document
    probs = classifier.prob_classify(bag_of_words([word]))
    print('Categories:', probs.samples())
    print('Classification given:', probs.max())
    print('Probability of being positive:', probs.prob(1))
    print('Probability of being negative:', probs.prob(0))
    print('Ratio:', probs.prob(1) / probs.prob(0))
    print('\n-------------------------\n')

# -----------------------------------------------------------------
# most informative words

# NLTK's own Naive Bayes classifier is trained here because it can report its
# most informative features. This rebinds `classifier`.
classifier = NaiveBayesClassifier.train(train_feats_all)
print("Classifier accuracy percent:",(accuracy(classifier, test_feats_all))*100)

# Show most informative features with ratios (ratio = p('pos')/p('neg') for a review of only one word)
# show_most_informative_features() prints the table itself and returns None,
# so wrapping it in print() only added a stray "None" line to the output.
classifier.show_most_informative_features(n=50)
print('\n-------------------------\n')