# Binary Classification of Comments   
## <div> Vassilis Panagakis </div>

In [25]:
import pandas as pd
import numpy as np
import nltk
import re
import warnings

from nltk.corpus import stopwords
from sklearn import metrics

warnings.filterwarnings('ignore')  

## Load Data

In [2]:
#mount Google Drive at /content/gdrive (google.colab is only importable inside a Colab runtime)
from google.colab import drive 
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
#load train and test sets from the mounted drive
train = pd.read_csv(
    "gdrive/My Drive/Colab Notebooks/data/train.csv"
)
#only the 'Date' and 'Comment' columns of the verification set are read
test = pd.read_csv(
    "gdrive/My Drive/Colab Notebooks/data/impermium_verification_set.csv",
    usecols=['Date', 'Comment'],
)

In [4]:
#replace train set's NaN values of column 'Date' with an all-zero timestamp placeholder
#(the result is assigned back: fillna(inplace=True) on a selected column is chained
# assignment, which is deprecated and leaves the frame unchanged under pandas Copy-on-Write)
train['Date'] = train['Date'].fillna('00000000000000Z')

##### Display some samples of our initial data

In [5]:
train.head()

Unnamed: 0,Insult,Date,Comment
0,1,20120618192155Z,"""You fuck your dad."""
1,0,20120528192215Z,"""i really don't understand your point.\xa0 It ..."
2,0,00000000000000Z,"""A\\xc2\\xa0majority of Canadians can and has ..."
3,0,00000000000000Z,"""listen if you dont wanna get married to a man..."
4,0,20120619094753Z,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd..."


In [6]:
test.head()

Unnamed: 0,Date,Comment
0,20120603163526Z,"""like this if you are a tribe fan"""
1,20120531215447Z,"""you're idiot......................."""
2,20120823164228Z,"""I am a woman Babs, and the only ""war on women..."
3,20120826010752Z,"""WOW & YOU BENEFITTED SO MANY WINS THIS YEAR F..."
4,20120602223825Z,"""haha green me red you now loser whos winning ..."


In [8]:
#get test set labels into a dataframe (only the 'Insult' column is read)
test_Y = pd.read_csv(
    "gdrive/My Drive/Colab Notebooks/data/impermium_verification_labels.csv",
    usecols=['Insult'],
)

In [9]:
#create numpy arrays for sets' labels (train labels come from 'train', test labels from 'test_Y')
train_y, test_y = (
    np.asarray(frame['Insult'].tolist()) for frame in (train, test_Y)
)

### Data Pre-processing

In [10]:
#function that removes all @mentions, links and non alphabetic strings 
def clean_content(text):
    """Normalise a raw comment into lowercase alphabetic words and spaces.

    Steps, in order:
      1. drop @mentions ('@' followed by letters, digits or underscores),
      2. drop every token starting with 'http' (links),
      3. drop a backslash together with the word characters that follow it
         (escape sequences such as '\\xa0' that are stored literally in the
         data; note that letters glued to the escape are removed as well),
      4. drop words of one or two characters,
      5. keep only alphabetic characters and spaces,
      6. lowercase the result.

    Spaces are never collapsed, so removed tokens leave runs of spaces behind.

    Parameters
    ----------
    text : object
        Raw comment. None and float NaN (missing cells) give ''; any other
        non-string value is converted with str() before cleaning.

    Returns
    -------
    str
        The cleaned comment.
    """
    #missing cells carry no text (NaN is the only float that differs from itself)
    if text is None or (isinstance(text, float) and text != text):
        return ''

    #coerce once, up front, so every substitution below receives a string
    text = str(text)

    text = re.sub(r'@[A-Za-z0-9_]+', '', text) #remove text with @ prefix
    text = re.sub(r'http\S+', '', text) #remove text with http prefix (links)
    text = re.sub(r'\\\w+', '', text) #remove text after backslash
    text = re.sub(r'\b\w{1,2}\b', '', text) #remove text containing 2 or less characters

    #keep letters and spaces only (digits and punctuation are dropped)
    text = ''.join(ch for ch in text if ch.isalpha() or ch == ' ')

    return text.lower() #convert text into lowercase

In [11]:
#create a column for each set containing the data of the first stage of processing
#(Series.apply cleans every comment in one pass; the previous iterrows loop did a
# label lookup and a single-cell write per row and never used the row it iterated)
train['ProcessedComment'] = train['Comment'].apply(clean_content)
test['ProcessedComment'] = test['Comment'].apply(clean_content)

##### Display some samples of our processed data

In [12]:
train.head()

Unnamed: 0,Insult,Date,Comment,ProcessedComment
0,1,20120618192155Z,"""You fuck your dad.""",you fuck your dad
1,0,20120528192215Z,"""i really don't understand your point.\xa0 It ...",really don understand your point seems that ...
2,0,00000000000000Z,"""A\\xc2\\xa0majority of Canadians can and has ...",canadians can and has been wrong before now ...
3,0,00000000000000Z,"""listen if you dont wanna get married to a man...",listen you dont wanna get married man wom...
4,0,20120619094753Z,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd...",chi giang ...


In [13]:
test.head()

Unnamed: 0,Date,Comment,ProcessedComment
0,20120603163526Z,"""like this if you are a tribe fan""",like this you are tribe fan
1,20120531215447Z,"""you're idiot.......................""",you idiot
2,20120823164228Z,"""I am a woman Babs, and the only ""war on women...",woman babs and the only war women see co...
3,20120826010752Z,"""WOW & YOU BENEFITTED SO MANY WINS THIS YEAR F...",wow you benefitted many wins this year from ...
4,20120602223825Z,"""haha green me red you now loser whos winning ...",haha green red you now loser whos winning now...


# <h1><center>CLASSIFICATION</center></h1>

## Word Counts (Basic)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

#transform comments to word vectors (not lemmatized, unigrams, stopwords not removed)
countVectorizer = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    max_features=400,
    max_df=0.99,
    min_df=1,
)

In [15]:
#fit the vocabulary on the train comments and count their words
bow = countVectorizer.fit_transform(train['ProcessedComment'])

#one dense count vector per comment
vectors = list(bow.toarray())

#create a column in train set with comments' words counts
train['WordVecs'] = pd.Series(vectors, index=train.index)

#apply transformation to test set (vocabulary learned on the train set is reused)
bow1 = countVectorizer.transform(test['ProcessedComment'])
vectors1 = list(bow1.toarray())

#create a column in test set with comments' words counts
test['WordVecs'] = pd.Series(vectors1, index=test.index)

In [16]:
#create numpy arrays containing word vectors (one row per comment)
train_bow, test_bow = (
    np.asarray(frame['WordVecs'].tolist()) for frame in (train, test)
)

## Naive Bayes

In [17]:
#create a table to display the accuracy and F1-score of the Naive Bayes Classifiers
#(one row per classifier variant, every cell starts as the placeholder '-')
nb_models = ['GNB', 'MNB', 'MNB-LEM', 'MNB-SW', 'MNB-BG', 'MNB-LS']
nb_dic = {model: ['-', '-', '-'] for model in nb_models}

nb_df = pd.DataFrame.from_dict(
    nb_dic,
    orient='index',
    columns=['Accuracy', 'F1_Macro', 'F1_Weighted'],
)

### Gaussian Naive Bayes

In [18]:
from sklearn.naive_bayes import GaussianNB

#Normal (Gaussian) Naive Bayes Classifier using word counts features without lemmatization, stopwords removal and bigrams
gnb_bow = GaussianNB()
gnb_bow.fit(train_bow, train_y)
y_pred_gnb_bow = gnb_bow.predict(test_bow) #prediction on test set

#score the predictions and store each metric as a percentage string in the 'GNB' row
scores = {
    'Accuracy': metrics.accuracy_score(test_y, y_pred_gnb_bow),
    'F1_Macro': metrics.f1_score(test_y, y_pred_gnb_bow, average='macro'),
    'F1_Weighted': metrics.f1_score(test_y, y_pred_gnb_bow, average='weighted'),
}
for column, score in scores.items():
    nb_df.loc['GNB', column] = "%.3f%%" % (score * 100)

### Multinomial Naive Bayes

In [19]:
from sklearn.naive_bayes import MultinomialNB

#Multinomial Naive Bayes Classifier using word counts features
#without lemmatization, stopwords removal, bigrams and Laplace Smoothing (Laplace smoothing parameter ~= 0)
mnb_bow = MultinomialNB(alpha=1e-10)
mnb_bow.fit(train_bow, train_y)
y_pred_mnb_bow = mnb_bow.predict(test_bow) #prediction on test set

#score the predictions and store each metric as a percentage string in the 'MNB' row
scores = {
    'Accuracy': metrics.accuracy_score(test_y, y_pred_mnb_bow),
    'F1_Macro': metrics.f1_score(test_y, y_pred_mnb_bow, average='macro'),
    'F1_Weighted': metrics.f1_score(test_y, y_pred_mnb_bow, average='weighted'),
}
for column, score in scores.items():
    nb_df.loc['MNB', column] = "%.3f%%" % (score * 100)

## Word Counts (Extended)

### Tokenization

In [20]:
from nltk.tokenize import TweetTokenizer

tknzr = TweetTokenizer()

#tokenize every processed train comment (one list of tokens per comment)
tokens = [tknzr.tokenize(comment) for comment in train['ProcessedComment']]

#create a column in train set with comments' tokens
train['Tokens'] = pd.Series(tokens, index=train.index)

In [21]:
#tokenize every processed test comment (one list of tokens per comment)
tokens = [tknzr.tokenize(comment) for comment in test['ProcessedComment']]

#create a column in test set with comments' tokens
test['Tokens'] = pd.Series(tokens, index=test.index)

## I. Lemmatization

In [26]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [27]:
from nltk.stem import WordNetLemmatizer

#lemmatization on train set
lemmatizer = WordNetLemmatizer()

#lemmatize each token of each comment (one list of lemmas per comment)
lemComment = [
    [lemmatizer.lemmatize(token) for token in comment_tokens]
    for comment_tokens in train['Tokens']
]

#replace train set's tokens with their lemmas
#(the columns= keyword is used because pandas >= 2.0 no longer accepts the axis
# as a positional argument, so drop(['Tokens'],1,...) raises TypeError there)
train.drop(columns=['Tokens'], inplace=True)
train['Tokens'] = pd.Series(lemComment, index=train.index)

In [28]:
#lemmatization on test set
lemmatizer = WordNetLemmatizer()

#lemmatize each token of each comment (one list of lemmas per comment)
lemComment = [
    [lemmatizer.lemmatize(token) for token in comment_tokens]
    for comment_tokens in test['Tokens']
]

#replace test set's tokens with their lemmas
#(the columns= keyword is used because pandas >= 2.0 no longer accepts the axis
# as a positional argument, so drop(['Tokens'],1,...) raises TypeError there)
test.drop(columns=['Tokens'], inplace=True)
test['Tokens'] = pd.Series(lemComment, index=test.index)

In [29]:
#create a new column in train and test sets with lemmatized comments
#(the lemmas of each comment are joined back into one space-separated string)
for frame in (train, test):
    frame['LemComment'] = frame['Tokens'].str.join(' ')

In [30]:
#transform comments to word vectors (lemmatized, unigrams, stopwords not removed)
countVectorizer = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    max_features=400,
    max_df=0.99,
    min_df=1,
)

In [31]:
#fit the vocabulary on the lemmatized train comments and count their words
bow = countVectorizer.fit_transform(train['LemComment'])

#one dense count vector per comment
vectors = list(bow.toarray())

#create a column in train set with lemmatized comments' words counts
train['LemWordVecs'] = pd.Series(vectors, index=train.index)

#apply transformation to test set (vocabulary learned on the train set is reused)
bow1 = countVectorizer.transform(test['LemComment'])
vectors1 = list(bow1.toarray())

#create a column in test set with lemmatized comments' words counts
test['LemWordVecs'] = pd.Series(vectors1, index=test.index)

In [32]:
#create numpy arrays containing word vectors after lemmatization (one row per comment)
train_bow_lem, test_bow_lem = (
    np.asarray(frame['LemWordVecs'].tolist()) for frame in (train, test)
)

### Multinomial Naive Bayes

In [33]:
#Multinomial Naive Bayes Classifier using word counts features after lemmatization
#without stopwords removal, bigrams and Laplace Smoothing (Laplace smoothing parameter ~= 0)
mnb_bow_lem = MultinomialNB(alpha=1e-10)
mnb_bow_lem.fit(train_bow_lem, train_y)
y_pred_mnb_bow_lem = mnb_bow_lem.predict(test_bow_lem) #prediction on test set

#score the predictions and store each metric as a percentage string in the 'MNB-LEM' row
scores = {
    'Accuracy': metrics.accuracy_score(test_y, y_pred_mnb_bow_lem),
    'F1_Macro': metrics.f1_score(test_y, y_pred_mnb_bow_lem, average='macro'),
    'F1_Weighted': metrics.f1_score(test_y, y_pred_mnb_bow_lem, average='weighted'),
}
for column, score in scores.items():
    nb_df.loc['MNB-LEM', column] = "%.3f%%" % (score * 100)

## II. Stopwords Removal

In [34]:
#transform comments to word vectors (not lemmatized, unigrams, stopwords removed)
countVectorizer = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    max_features=400,
    max_df=0.99,
    min_df=1,
)

In [35]:
#fit the stopword-free vocabulary on the train comments and count their words
bow = countVectorizer.fit_transform(train['ProcessedComment'])

#one dense count vector per comment
vectors = list(bow.toarray())

#create a column in train set with (stopwords removed) comments' words counts
train['SWordVecs'] = pd.Series(vectors, index=train.index)

#apply transformation to test set (vocabulary learned on the train set is reused)
bow1 = countVectorizer.transform(test['ProcessedComment'])
vectors1 = list(bow1.toarray())

#create a column in test set with (stopwords removed) comments' words counts
test['SWordVecs'] = pd.Series(vectors1, index=test.index)

In [36]:
#create numpy arrays containing word vectors with removed stopwords (one row per comment)
train_bow_sw, test_bow_sw = (
    np.asarray(frame['SWordVecs'].tolist()) for frame in (train, test)
)

### Multinomial Naive Bayes

In [37]:
#Multinomial Naive Bayes Classifier using word counts features with stopwords removal
#without lemmatization, bigrams and Laplace Smoothing (Laplace smoothing parameter ~= 0)
mnb_bow_sw = MultinomialNB(alpha=1e-10)
mnb_bow_sw.fit(train_bow_sw, train_y)
y_pred_mnb_bow_sw = mnb_bow_sw.predict(test_bow_sw) #prediction on test set

#score the predictions and store each metric as a percentage string in the 'MNB-SW' row
scores = {
    'Accuracy': metrics.accuracy_score(test_y, y_pred_mnb_bow_sw),
    'F1_Macro': metrics.f1_score(test_y, y_pred_mnb_bow_sw, average='macro'),
    'F1_Weighted': metrics.f1_score(test_y, y_pred_mnb_bow_sw, average='weighted'),
}
for column, score in scores.items():
    nb_df.loc['MNB-SW', column] = "%.3f%%" % (score * 100)

## III. Bigrams

In [38]:
#transform comments to word vectors (not lemmatized, bigrams, stopwords not removed)
countVectorizer = CountVectorizer(
    analyzer='word',
    ngram_range=(2, 2),
    max_features=400,
    max_df=0.99,
    min_df=1,
)

In [39]:
#fit the bigram vocabulary on the train comments and count their bigrams
bow = countVectorizer.fit_transform(train['ProcessedComment'])

#one dense count vector per comment
vectors = list(bow.toarray())

#create a column in train set with (bigram) comments' words counts
train['BGWordVecs'] = pd.Series(vectors, index=train.index)

#apply transformation to test set (vocabulary learned on the train set is reused)
bow1 = countVectorizer.transform(test['ProcessedComment'])
vectors1 = list(bow1.toarray())

#create a column in test set with (bigram) comments' words counts
test['BGWordVecs'] = pd.Series(vectors1, index=test.index)

In [40]:
#create numpy arrays containing word vectors with bigrams (one row per comment)
train_bow_bg, test_bow_bg = (
    np.asarray(frame['BGWordVecs'].tolist()) for frame in (train, test)
)

### Multinomial Naive Bayes

In [41]:
#Multinomial Naive Bayes Classifier using word counts features with bigrams
#without lemmatization, stopwords removal and Laplace Smoothing (Laplace smoothing parameter ~= 0)
mnb_bow_bg = MultinomialNB(alpha=1e-10)
mnb_bow_bg.fit(train_bow_bg, train_y)
y_pred_mnb_bow_bg = mnb_bow_bg.predict(test_bow_bg) #prediction on test set

#score the predictions and store each metric as a percentage string in the 'MNB-BG' row
scores = {
    'Accuracy': metrics.accuracy_score(test_y, y_pred_mnb_bow_bg),
    'F1_Macro': metrics.f1_score(test_y, y_pred_mnb_bow_bg, average='macro'),
    'F1_Weighted': metrics.f1_score(test_y, y_pred_mnb_bow_bg, average='weighted'),
}
for column, score in scores.items():
    nb_df.loc['MNB-BG', column] = "%.3f%%" % (score * 100)

## IV. Laplace Smoothing

### Multinomial Naive Bayes

In [42]:
#Multinomial Naive Bayes Classifier using word counts features with Laplace Smoothing
#without lemmatization, stopwords removal and bigrams
mnb_bow2 = MultinomialNB(alpha=1.0)
mnb_bow2.fit(train_bow, train_y)
y_pred_mnb_bow2 = mnb_bow2.predict(test_bow) #prediction on test set

#score the predictions and store each metric as a percentage string in the 'MNB-LS' row
scores = {
    'Accuracy': metrics.accuracy_score(test_y, y_pred_mnb_bow2),
    'F1_Macro': metrics.f1_score(test_y, y_pred_mnb_bow2, average='macro'),
    'F1_Weighted': metrics.f1_score(test_y, y_pred_mnb_bow2, average='weighted'),
}
for column, score in scores.items():
    nb_df.loc['MNB-LS', column] = "%.3f%%" % (score * 100)

##### Display some samples of our data with the new inserted word vectors columns

In [43]:
train.head()

Unnamed: 0,Insult,Date,Comment,ProcessedComment,WordVecs,Tokens,LemComment,LemWordVecs,SWordVecs,BGWordVecs
0,1,20120618192155Z,"""You fuck your dad.""",you fuck your dad,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[you, fuck, your, dad]",you fuck your dad,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,0,20120528192215Z,"""i really don't understand your point.\xa0 It ...",really don understand your point seems that ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[really, don, understand, your, point, seems, ...",really don understand your point seems that yo...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,0,00000000000000Z,"""A\\xc2\\xa0majority of Canadians can and has ...",canadians can and has been wrong before now ...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[canadian, can, and, ha, been, wrong, before, ...",canadian can and ha been wrong before now and ...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 5, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,0,00000000000000Z,"""listen if you dont wanna get married to a man...",listen you dont wanna get married man wom...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[listen, you, dont, wanna, get, married, man, ...",listen you dont wanna get married man woman do...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,0,20120619094753Z,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd...",chi giang ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[chi, giang, chu, khi, sau, chi, tranh, con]",chi giang chu khi sau chi tranh con,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [44]:
test.head()

Unnamed: 0,Date,Comment,ProcessedComment,WordVecs,Tokens,LemComment,LemWordVecs,SWordVecs,BGWordVecs
0,20120603163526Z,"""like this if you are a tribe fan""",like this you are tribe fan,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[like, this, you, are, tribe, fan]",like this you are tribe fan,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,20120531215447Z,"""you're idiot.......................""",you idiot,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[you, idiot]",you idiot,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,20120823164228Z,"""I am a woman Babs, and the only ""war on women...",woman babs and the only war women see co...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[woman, babs, and, the, only, war, woman, see,...",woman babs and the only war woman see coming f...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,20120826010752Z,"""WOW & YOU BENEFITTED SO MANY WINS THIS YEAR F...",wow you benefitted many wins this year from ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[wow, you, benefitted, many, win, this, year, ...",wow you benefitted many win this year from his...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,20120602223825Z,"""haha green me red you now loser whos winning ...",haha green red you now loser whos winning now...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[haha, green, red, you, now, loser, who, winni...",haha green red you now loser who winning now m...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## Classification Accuracy & F-Measure

In [45]:
nb_df

Unnamed: 0,Accuracy,F1_Macro,F1_Weighted
GNB,48.501%,41.808%,41.092%
MNB,62.192%,62.123%,62.064%
MNB-LEM,62.416%,62.309%,62.237%
MNB-SW,65.861%,64.253%,64.528%
MNB-BG,59.642%,57.750%,58.074%
MNB-LS,62.506%,62.446%,62.392%


## Part of Speech 

In [48]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [49]:
from nltk import pos_tag

#function that returns a list [fractionNouns, fractionVerbs, fractionAdverbs, fractionAdjectives] for every comment
def posFractions(df):
    """Compute, per comment, the share of tokens tagged as each part of speech.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Tokens' column holding one list of tokens per comment.

    Returns
    -------
    list of list of float
        One [nouns, verbs, adverbs, adjectives] list per row, in row order.
        Each value is count / number of tokens, rounded to 3 decimals.
        A comment without tokens gives [0.0, 0.0, 0.0, 0.0].
    """
    #tags counted for each part of speech
    #NOTE(review): NNS/NNPS and VBP/VBZ are not counted - confirm whether that is intended
    tag_groups = (
        frozenset(('NN', 'NNP')),                #nouns
        frozenset(('VB', 'VBD', 'VBG', 'VBN')),  #verbs
        frozenset(('RB', 'RBR', 'RBS')),         #adverbs
        frozenset(('JJ', 'JJR', 'JJS')),         #adjectives
    )

    pos_tags = []
    for tokens in df['Tokens']: #repeat for every row in dataframe
        #create zero list if there aren't any tokens (checked first, so the tagger is not run on nothing)
        if len(tokens) == 0:
            pos_tags.append([0.0, 0.0, 0.0, 0.0])
            continue

        #find what part of speech is every token; only the tag of each (token, tag) pair is needed
        tags = [item[1] for item in pos_tag(tokens)]

        #create the fraction for every part of speech
        total = len(tokens)
        pos_tags.append([
            round(sum(tag in group for tag in tags) / total, 3)
            for group in tag_groups
        ])

    return pos_tags

In [50]:
#create a column in train and test sets with comments' pos list
for frame in (train, test):
    frame['pos'] = posFractions(frame)

##### Display some samples of our data after 'pos' column insertion

In [51]:
train.head()

Unnamed: 0,Insult,Date,Comment,ProcessedComment,WordVecs,Tokens,LemComment,LemWordVecs,SWordVecs,BGWordVecs,pos
0,1,20120618192155Z,"""You fuck your dad.""",you fuck your dad,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[you, fuck, your, dad]",you fuck your dad,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.25, 0.0, 0.0, 0.0]"
1,0,20120528192215Z,"""i really don't understand your point.\xa0 It ...",really don understand your point seems that ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[really, don, understand, your, point, seems, ...",really don understand your point seems that yo...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.231, 0.154, 0.077, 0.077]"
2,0,00000000000000Z,"""A\\xc2\\xa0majority of Canadians can and has ...",canadians can and has been wrong before now ...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[canadian, can, and, ha, been, wrong, before, ...",canadian can and ha been wrong before now and ...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 5, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.26, 0.08, 0.12, 0.06]"
3,0,00000000000000Z,"""listen if you dont wanna get married to a man...",listen you dont wanna get married man wom...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[listen, you, dont, wanna, get, married, man, ...",listen you dont wanna get married man woman do...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.146, 0.146, 0.024, 0.171]"
4,0,20120619094753Z,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd...",chi giang ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[chi, giang, chu, khi, sau, chi, tranh, con]",chi giang chu khi sau chi tranh con,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.625, 0.0, 0.0, 0.125]"


In [52]:
test.head()

Unnamed: 0,Date,Comment,ProcessedComment,WordVecs,Tokens,LemComment,LemWordVecs,SWordVecs,BGWordVecs,pos
0,20120603163526Z,"""like this if you are a tribe fan""",like this you are tribe fan,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[like, this, you, are, tribe, fan]",like this you are tribe fan,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.167, 0.0, 0.0, 0.167]"
1,20120531215447Z,"""you're idiot.......................""",you idiot,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[you, idiot]",you idiot,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0]"
2,20120823164228Z,"""I am a woman Babs, and the only ""war on women...",woman babs and the only war women see co...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[woman, babs, and, the, only, war, woman, see,...",woman babs and the only war woman see coming f...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.292, 0.167, 0.0, 0.125]"
3,20120826010752Z,"""WOW & YOU BENEFITTED SO MANY WINS THIS YEAR F...",wow you benefitted many wins this year from ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[wow, you, benefitted, many, win, this, year, ...",wow you benefitted many win this year from his...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.417, 0.083, 0.0, 0.167]"
4,20120602223825Z,"""haha green me red you now loser whos winning ...",haha green red you now loser whos winning now...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[haha, green, red, you, now, loser, who, winni...",haha green red you now loser who winning now m...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.2, 0.1, 0.2, 0.2]"


In [53]:
#create numpy arrays containing part of speech lists (one row per comment)
train_pos, test_pos = (
    np.asarray(frame['pos'].tolist()) for frame in (train, test)
)

## Tf-Idf 

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer

#apply tfidf transformation to comments (unigrams & bigrams, lemmatized, stopwords removed)
tfidfVectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    stop_words='english',
    max_features=400,
    max_df=0.99,
    min_df=1,
)

In [55]:
#fit the tf-idf vocabulary and weights on the lemmatized train comments
tfidf = tfidfVectorizer.fit_transform(train['LemComment'])

#one dense tf-idf vector per comment
vectors = list(tfidf.toarray())

#create a column in train set with comments' words tf-idf counts transformation
train['tf-idf'] = pd.Series(vectors, index=train.index)

#apply transformation to test set (vocabulary and weights learned on the train set are reused)
tfidf1 = tfidfVectorizer.transform(test['LemComment'])
vectors1 = list(tfidf1.toarray())

#create a column in test set with comments' words tf-idf counts transformation
test['tf-idf'] = pd.Series(vectors1, index=test.index)

##### Display some samples of our data after 'tf-idf' column insertion

In [56]:
train.head()

Unnamed: 0,Insult,Date,Comment,ProcessedComment,WordVecs,Tokens,LemComment,LemWordVecs,SWordVecs,BGWordVecs,pos,tf-idf
0,1,20120618192155Z,"""You fuck your dad.""",you fuck your dad,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[you, fuck, your, dad]",you fuck your dad,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.25, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,0,20120528192215Z,"""i really don't understand your point.\xa0 It ...",really don understand your point seems that ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[really, don, understand, your, point, seems, ...",really don understand your point seems that yo...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.231, 0.154, 0.077, 0.077]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,0,00000000000000Z,"""A\\xc2\\xa0majority of Canadians can and has ...",canadians can and has been wrong before now ...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[canadian, can, and, ha, been, wrong, before, ...",canadian can and ha been wrong before now and ...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 5, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.26, 0.08, 0.12, 0.06]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,0,00000000000000Z,"""listen if you dont wanna get married to a man...",listen you dont wanna get married man wom...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[listen, you, dont, wanna, get, married, man, ...",listen you dont wanna get married man woman do...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.146, 0.146, 0.024, 0.171]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,0,20120619094753Z,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd...",chi giang ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[chi, giang, chu, khi, sau, chi, tranh, con]",chi giang chu khi sau chi tranh con,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.625, 0.0, 0.0, 0.125]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [57]:
test.head()

Unnamed: 0,Date,Comment,ProcessedComment,WordVecs,Tokens,LemComment,LemWordVecs,SWordVecs,BGWordVecs,pos,tf-idf
0,20120603163526Z,"""like this if you are a tribe fan""",like this you are tribe fan,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[like, this, you, are, tribe, fan]",like this you are tribe fan,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.167, 0.0, 0.0, 0.167]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,20120531215447Z,"""you're idiot.......................""",you idiot,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[you, idiot]",you idiot,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,20120823164228Z,"""I am a woman Babs, and the only ""war on women...",woman babs and the only war women see co...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[woman, babs, and, the, only, war, woman, see,...",woman babs and the only war woman see coming f...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.292, 0.167, 0.0, 0.125]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,20120826010752Z,"""WOW & YOU BENEFITTED SO MANY WINS THIS YEAR F...",wow you benefitted many wins this year from ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[wow, you, benefitted, many, win, this, year, ...",wow you benefitted many win this year from his...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.417, 0.083, 0.0, 0.167]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,20120602223825Z,"""haha green me red you now loser whos winning ...",haha green red you now loser whos winning now...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[haha, green, red, you, now, loser, who, winni...",haha green red you now loser who winning now m...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.2, 0.1, 0.2, 0.2]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [58]:
#create numpy arrays containing tf-idf vectors (one row per comment)
train_tfidf, test_tfidf = (
    np.asarray(frame['tf-idf'].tolist()) for frame in (train, test)
)

### Compound Input Features Vector

In [59]:
#function that unifies two arrays into one
def unify_features(arr1, arr2):
    """Concatenate two feature collections row by row.

    Row i of the result is the elements of arr1[i] followed by the elements
    of arr2[i], flattened to one dimension by np.append.

    Parameters
    ----------
    arr1, arr2 : sequence of array-like
        Feature vectors, one entry per sample; both must have the same length.

    Returns
    -------
    numpy.ndarray
        Array built from the combined rows, one per sample.

    Raises
    ------
    ValueError
        If arr1 and arr2 do not hold the same number of rows (previously the
        extra rows of a longer arr2 were silently dropped).
    """
    if len(arr1) != len(arr2):
        raise ValueError(
            "arr1 and arr2 must have the same number of rows (got %d and %d)"
            % (len(arr1), len(arr2))
        )

    #unify every row of arr1 with the corresponding row of arr2
    return np.asarray([np.append(first, second) for first, second in zip(arr1, arr2)])

In [60]:
#build the compound input vectors of both sets: part of speech features followed by word tfidf features
train_comb1, test_comb1 = map(
    unify_features,
    (train_pos, test_pos),
    (train_tfidf, test_tfidf),
)

##  Part of Speech & Tfidf Unification

In [61]:
#results table for the Support Vector Machines and Random Forests Classifiers:
#one row per Classifier, one column per metric, every cell starting as the '-' placeholder
comb_dic = {clf_label: ['-'] * 3 for clf_label in ('SVM', 'RF')}

comb_df = pd.DataFrame.from_dict(
    comb_dic,
    orient='index',
    columns=['Accuracy', 'F1_Macro', 'F1_Weighted'],
)

### Support Vector Machines

In [62]:
from sklearn.svm import SVC

#Support Vector Machines Linear Classifier using unified part of speech and word tfidf transformation features
#gamma is intentionally not passed: it only parametrises the 'rbf', 'poly' and 'sigmoid' kernels,
#so it has no effect on a linear-kernel model and would only suggest tuning that never happens
svc_comb1 = SVC(C=100, kernel='linear')
svc_comb1.fit(train_comb1, train_y)
y_pred_svc_comb1 = svc_comb1.predict(test_comb1) #prediction on test set

#store accuracy and both F1 averages in the results table, formatted as percentage strings
comb_df.loc['SVM','Accuracy'] = "%.3f%%" % (metrics.accuracy_score(test_y, y_pred_svc_comb1) * 100)
comb_df.loc['SVM','F1_Macro'] = "%.3f%%" % (metrics.f1_score(test_y, y_pred_svc_comb1, average='macro') * 100)
comb_df.loc['SVM','F1_Weighted'] = "%.3f%%" % (metrics.f1_score(test_y, y_pred_svc_comb1, average='weighted') * 100)

### Random Forests

In [63]:
from sklearn.ensemble import RandomForestClassifier

#Random Forests Classifier trained on the unified part of speech and word tfidf transformation features
#NOTE: no random_state is passed, so the fitted forest and the scores below can differ between runs
rf_comb1 = RandomForestClassifier(n_estimators=100, min_samples_split=8, min_samples_leaf=2)
y_pred_rf_comb1 = rf_comb1.fit(train_comb1, train_y).predict(test_comb1) #prediction on test set

#compute accuracy and both F1 averages, then store them in the results table as percentage strings
rf_scores = {
    'Accuracy': metrics.accuracy_score(test_y, y_pred_rf_comb1),
    'F1_Macro': metrics.f1_score(test_y, y_pred_rf_comb1, average='macro'),
    'F1_Weighted': metrics.f1_score(test_y, y_pred_rf_comb1, average='weighted'),
}
for metric_name, metric_value in rf_scores.items():
    comb_df.loc['RF', metric_name] = "%.3f%%" % (metric_value * 100)

## Classification Accuracy & F-Measure

In [64]:
#display the SVM and RF results table
comb_df

Unnamed: 0,Accuracy,F1_Macro,F1_Weighted
SVM,66.309%,63.635%,63.993%
RF,66.130%,62.446%,62.872%


##  Optimize Classification Results - Logistic Regression 

In [65]:
#results table for the two Logistic Regression Classifiers:
#one row per Classifier, one column per metric, every cell starting as the '-' placeholder
lr_dic = {clf_label: ['-'] * 3 for clf_label in ('LR(pos_tfidf)', 'LR(best_bow)')}

lr_df = pd.DataFrame.from_dict(
    lr_dic,
    orient='index',
    columns=['Accuracy', 'F1_Macro', 'F1_Weighted'],
)

### Part of Speech & Tfidf Unification (Same input vector used in SVM, RF Classifiers)

In [66]:
from sklearn.linear_model import LogisticRegression

#Logistic Regression Classifier trained on the unified part of speech and word tfidf transformation features
lr_comb1 = LogisticRegression(C=1, class_weight='balanced', solver='lbfgs')
y_pred_lr_comb1 = lr_comb1.fit(train_comb1, train_y).predict(test_comb1) #prediction on test set

#compute accuracy and both F1 averages, then store them in the results table as percentage strings
lr_comb1_scores = {
    'Accuracy': metrics.accuracy_score(test_y, y_pred_lr_comb1),
    'F1_Macro': metrics.f1_score(test_y, y_pred_lr_comb1, average='macro'),
    'F1_Weighted': metrics.f1_score(test_y, y_pred_lr_comb1, average='weighted'),
}
for metric_name, metric_value in lr_comb1_scores.items():
    lr_df.loc['LR(pos_tfidf)', metric_name] = "%.3f%%" % (metric_value * 100)

### Word Counts (Best Parameters Combination)

In [67]:
#vectorizer that transforms comments to word count vectors (unigrams & bigrams, english stopwords removed);
#it is fitted on the lemmatized comments in the next cell
countVectorizer = CountVectorizer(
    analyzer='word',
    ngram_range=(1,2),
    stop_words='english',
    max_features=400,
    max_df=0.99,
    min_df=1,
)

In [68]:
#fit the vectorizer on the train set's lemmatized comments and count the words of each comment
bow = countVectorizer.fit_transform(train['LemComment'])
vectors = list(bow.toarray()) #one dense word counts vector per comment

#create a column in train set with lemmatized comments' words counts
train['ExtWordVecs'] = pd.Series(vectors, index=train.index)

#apply the already fitted transformation to test set
bow1 = countVectorizer.transform(test['LemComment'])
vectors1 = list(bow1.toarray()) #one dense word counts vector per comment

#create a column in test set with lemmatized comments' words counts
test['ExtWordVecs'] = pd.Series(vectors1, index=test.index)

In [69]:
#stack the per-comment word counts vectors of each set into one numpy array per set
train_bow_ext, test_bow_ext = (
    np.asarray(frame['ExtWordVecs'].tolist()) for frame in (train, test)
)

In [70]:
#Logistic Regression Classifier trained on the word counts features
lr_bow_ext = LogisticRegression(C=1, class_weight='balanced', solver='lbfgs')
y_pred_lr_bow_ext = lr_bow_ext.fit(train_bow_ext, train_y).predict(test_bow_ext) #prediction on test set

#compute accuracy and both F1 averages, then store them in the results table as percentage strings
lr_bow_ext_scores = {
    'Accuracy': metrics.accuracy_score(test_y, y_pred_lr_bow_ext),
    'F1_Macro': metrics.f1_score(test_y, y_pred_lr_bow_ext, average='macro'),
    'F1_Weighted': metrics.f1_score(test_y, y_pred_lr_bow_ext, average='weighted'),
}
for metric_name, metric_value in lr_bow_ext_scores.items():
    lr_df.loc['LR(best_bow)', metric_name] = "%.3f%%" % (metric_value * 100)

### Classification Accuracy & F-Measure

In [71]:
#display the Logistic Regression results table
lr_df

Unnamed: 0,Accuracy,F1_Macro,F1_Weighted
LR(pos_tfidf),67.069%,66.852%,66.949%
LR(best_bow),68.635%,68.044%,68.202%


## Classifiers Final Accuracy & F-Measure Evaluation

In [72]:
#rename dataframe indices to distinguish the Classifiers in the final dataframe:
#every label gets the name of its input features appended in parentheses
nb_labels = {
    'GNB': 'GNB(bow)',
    'MNB': 'MNB(bow)',
    'MNB-LEM': 'MNB-LEM(bow)',
    'MNB-SW': 'MNB-SW(bow)',
    'MNB-BG': 'MNB-BG(bow)',
    'MNB-LS': 'MNB-LS(bow)',
}
comb_labels = {'SVM': 'SVM(pos_tfidf)', 'RF': 'RF(pos_tfidf)'}

new_nb_df = nb_df.rename(index=nb_labels)
new_comb_df = comb_df.rename(index=comb_labels)

In [73]:
#stack the rows of the Naive Bayes, SVM/RF and Logistic Regression tables to create the final dataframe
result_frames = (new_nb_df, new_comb_df, lr_df)
cl_df = pd.concat(result_frames)

cl_df

Unnamed: 0,Accuracy,F1_Macro,F1_Weighted
GNB(bow),48.501%,41.808%,41.092%
MNB(bow),62.192%,62.123%,62.064%
MNB-LEM(bow),62.416%,62.309%,62.237%
MNB-SW(bow),65.861%,64.253%,64.528%
MNB-BG(bow),59.642%,57.750%,58.074%
MNB-LS(bow),62.506%,62.446%,62.392%
SVM(pos_tfidf),66.309%,63.635%,63.993%
RF(pos_tfidf),66.130%,62.446%,62.872%
LR(pos_tfidf),67.069%,66.852%,66.949%
LR(best_bow),68.635%,68.044%,68.202%


### Sorted Classifiers from Best to Worst based on Classification Accuracy

In [74]:
#sort from best to worst on the numeric accuracy value: the 'Accuracy' column holds formatted
#strings such as "68.635%", and sorting those directly compares them character by character
#(e.g. "100.000%" would be placed below "68.635%")
acc_values = pd.to_numeric(cl_df['Accuracy'].str.rstrip('%'), errors='coerce')

#descending order; rows whose accuracy cannot be parsed (e.g. the '-' placeholder) are placed last
sort_cl_df = cl_df.iloc[np.argsort(-acc_values.values, kind='stable')]

sort_cl_df

Unnamed: 0,Accuracy,F1_Macro,F1_Weighted
LR(best_bow),68.635%,68.044%,68.202%
LR(pos_tfidf),67.069%,66.852%,66.949%
SVM(pos_tfidf),66.309%,63.635%,63.993%
RF(pos_tfidf),66.130%,62.446%,62.872%
MNB-SW(bow),65.861%,64.253%,64.528%
MNB-LS(bow),62.506%,62.446%,62.392%
MNB-LEM(bow),62.416%,62.309%,62.237%
MNB(bow),62.192%,62.123%,62.064%
MNB-BG(bow),59.642%,57.750%,58.074%
GNB(bow),48.501%,41.808%,41.092%


### Worst Classification Accuracy Classifier

In [75]:
#last row of the accuracy-sorted table: the Classifier with the lowest classification accuracy
sort_cl_df.tail(1)

Unnamed: 0,Accuracy,F1_Macro,F1_Weighted
GNB(bow),48.501%,41.808%,41.092%


### Multinomial Naive Bayes Classifiers

In [76]:
#keep only the rows whose label starts with 'M', i.e. the Multinomial Naive Bayes Classifiers
is_mnb = sort_cl_df.index.str.startswith('M')
sort_cl_df.loc[is_mnb]

Unnamed: 0,Accuracy,F1_Macro,F1_Weighted
MNB-SW(bow),65.861%,64.253%,64.528%
MNB-LS(bow),62.506%,62.446%,62.392%
MNB-LEM(bow),62.416%,62.309%,62.237%
MNB(bow),62.192%,62.123%,62.064%
MNB-BG(bow),59.642%,57.750%,58.074%


**The worst Classifier based on both classification accuracy and f1 score metrics on the test set is the first one we implemented, namely the Gaussian Naive Bayes Classifier, using word counts as features. That's expected, due to the deficient pre-processing of the data. Specifically, when GNB was executed we hadn't yet lemmatized our data or removed the stopwords. <br> Moreover, as it turned out, {GNB(bow)} isn't even the most effective among the Naive Bayes Classifiers: the Multinomial Naive Bayes Classifier {MNB(bow)} outperformed it, even before we tried any additional cleansing of our data. <br> Concerning the Multinomial Naive Bayes Classifiers, we can discern a roughly constant 3% difference in classification accuracy between the best MNB Classifier {MNB-SW(bow)} (the one where stopwords are removed) and the 3 intermediate MNB Classifiers, as well as between the intermediate MNB Classifiers and the worst MNB Classifier {MNB-BG(bow)} (the one where we use bigrams instead of unigrams). The same goes for both the weighted and unweighted f1 score metrics, with an even bigger difference between the intermediate and the worst MNB Classifiers. <br> Having said that, we can verify the notion that removing the stopwords is a necessary strategy in every dataset cleansing, and especially in our case it seems to lead to better classification results than lemmatization or data smoothing. <br> Finally, regarding the remaining MNB Classifiers, it's worth mentioning that the lemmatization and Laplace Smoothing techniques had almost zero effect on the classification accuracy and f1 score metrics compared with our initial {MNB(bow)} Classifier.**

### Logistic Regression Classifiers

In [77]:
#select the Logistic Regression rows by their 'LR' label prefix (same approach as for the MNB rows)
#instead of assuming that they always occupy the top two positions of the sorted table
sort_cl_df[sort_cl_df.index.str.startswith('LR')]

Unnamed: 0,Accuracy,F1_Macro,F1_Weighted
LR(best_bow),68.635%,68.044%,68.202%
LR(pos_tfidf),67.069%,66.852%,66.949%


### Best Classification Accuracy Classifier

In [78]:
#first row of the accuracy-sorted table: the Classifier with the highest classification accuracy
sort_cl_df.head(1)

Unnamed: 0,Accuracy,F1_Macro,F1_Weighted
LR(best_bow),68.635%,68.044%,68.202%


**The best Classifier based on both classification accuracy and f1 score metrics on the test set is the <br> Logistic Regression Classifier {LR(best_bow)}, using word counts as features. We produced the input vector by combining the best possible parameters (unigrams & bigrams, stopwords removal) of CountVectorizer and we used it to transform our (lemmatized) dataset. Logistic Regression is, generally, one of the most effective binary classifiers. That's expected, because the dependent variable in Logistic Regression analysis is exclusively binary. Furthermore, low correlation among the predictors is an important factor in the method's effectiveness. This held in our classification problem, since the {LR(best_bow)} Classifier predicted the correct label with the highest accuracy rate, close to 70%. <br> The only paradox we could point out concerning our two LR Classifiers is that {LR(pos_tfidf)}, which uses the words' tfidf transformation as part of its input feature vector, can't surpass the classification accuracy of the {LR(best_bow)} Classifier, which uses word counts as features. <br> The importance of this observation results from the fact that, most of the time, the tf-idf transformation method produces better classification results than the word counts method. <br> Yet, it seems that this doesn't apply to the Logistic Regression Classifiers of our example, and the same goes for the f1 score metrics.**

### Observations on SVM & RF Classifiers

In [79]:
#order the SVM and RF rows from best to worst on the numeric accuracy value: the 'Accuracy' column
#holds formatted strings such as "66.309%", which would otherwise be compared character by character
comb_acc_values = pd.to_numeric(new_comb_df['Accuracy'].str.rstrip('%'), errors='coerce')
new_comb_df.iloc[np.argsort(-comb_acc_values.values, kind='stable')]

Unnamed: 0,Accuracy,F1_Macro,F1_Weighted
SVM(pos_tfidf),66.309%,63.635%,63.993%
RF(pos_tfidf),66.130%,62.446%,62.872%


**Finally, it is important to present our observations about the classification results of our Support Vector Machines and Random Forests Classifiers, which use the unification of part of speech fractions and words' tf-idf transformation as features. <br> It's obvious that the compound input feature vector (pos-tfidf) has almost the same effect on both types of Classifiers with regard to classification accuracy. It's clear that both Classifiers produce very satisfying classification results, which place them 3rd and 4th best, respectively, in the classification accuracy standings, by a small margin compared to the results of the two LR Classifiers. In fact, we note that, even though the Random Forests Classifier is intrinsically suited for multiclass problems, it produces good results for our two-class problem. <br> The above observations apply in the exact same way to the f1 score rates of the Classifiers discussed.**