In [10]:
import pandas as pd
import numpy as np
import pickle as pkl
from collections import Counter
import os
import re
from os import listdir
from os.path import isfile, join
import sklearn
import nltk
from nltk.corpus import stopwords 

from sklearn.naive_bayes import GaussianNB

In [2]:
# Column names shared by the train and test CSV files.
text_col_header, label_col_header = 'text', 'label'

# Ground-truth folder. Inputs are read from, and derived artefacts are looked up
# under, the same location, so both constants currently hold the same path.
INPUT_DIR = '../data/input/groundtruth/'
OUTPUT_DIR = '../data/input/groundtruth/'

# Sub-folders of OUTPUT_DIR holding the pickled training matrices/labels and
# the pickled vectorizers, respectively.
VECTORIZED_DIR = os.path.join(OUTPUT_DIR, 'vectorized_trainset')
CORPUS_DIR = os.path.join(OUTPUT_DIR, 'corpus')

### Reading test and train data to keep them ready for testing

In [3]:
# Not referenced directly in this cell; import kept as in the original.
from sklearn.feature_extraction.text import CountVectorizer


def _read_labelled_csv(csv_path):
    """Load one CSV and cast the text column to str and the label column to int."""
    return pd.read_csv(csv_path).astype({text_col_header: str, label_col_header: int})


# Both splits live in INPUT_DIR.
ip_train_file = os.path.join(INPUT_DIR, 'speechact_train.csv')
ip_test_file = os.path.join(INPUT_DIR, 'speechact_test.csv')

# Train split
df_train = _read_labelled_csv(ip_train_file)

# Test split
df_test = _read_labelled_csv(ip_test_file)

# Naive Bayes

Requirements
<ul>
<li> Input - Vectorized train data </li>
<li> Input - Vectorized train labels </li>
</ul>

### Count Vector with stopwords

In [4]:
# Locations of the pickled training matrix and its labels for the
# 'countvector_stopword' variant, both stored under VECTORIZED_DIR.
vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'train_data_countvector_stopword.pkl')
train_label_file = os.path.join(VECTORIZED_DIR, 'train_label_countvector_stopword.pkl')

# NOTE: unpickling can execute arbitrary code -- only load files produced by this project.
with open(vectorized_train_data_file, mode='rb') as features_fh:
    data_train = pkl.load(features_fh)

with open(train_label_file, mode='rb') as labels_fh:
    label_train = pkl.load(labels_fh)

#### Train and Test the model

In [5]:
# Load the pickled vectorizer for the 'countvector_stopword' variant from CORPUS_DIR.
# It is used with transform() only, so it is presumably already fitted -- confirm upstream.
# NOTE: unpickling can execute arbitrary code -- only load files produced by this project.
vector_file = os.path.join(CORPUS_DIR, 'vector_countvector_stopword.pkl')
with open(vector_file, mode='rb') as vectorizer_fh:
    vectorizer = pkl.load(vectorizer_fh)

# Vectorize the raw test text with that same vectorizer.
test_texts = df_test[text_col_header]
data_test = vectorizer.transform(test_texts)

In [7]:
# Gaussian Naive Bayes: fit on the training vectors, then label the test vectors.
# Labels are flattened to 1-D for fit(); the matrices are densified with .toarray()
# inline so the dense copies are not kept alive in the notebook namespace.
train_targets = np.ravel(label_train, order='C')
estimator = GaussianNB().fit(data_train.toarray(), train_targets)
predictions_test = estimator.predict(data_test.toarray())

#### Performance metrics

In [9]:
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Ground truth for each split. The training labels are flattened exactly as they
# were when the estimator was fitted.
y_train_true = np.ravel(label_train, order='C')
y_test_true = df_test[label_col_header]

## Accuracy Measure
# Train accuracy is scored against label_train -- the labels stored alongside
# data_train and used for fitting -- so the figure does not silently depend on
# the CSV rows in df_train being in the same order as the pickled matrix.
print('Train Accuracy', estimator.score(data_train.toarray(), y_train_true))
# Test accuracy reuses predictions_test: estimator.score() would densify and
# predict the whole test set a second time to arrive at the same number.
print('Test Accuracy', accuracy_score(y_test_true, predictions_test))

# F1 (unweighted mean over classes)
f1_measure = f1_score(y_test_true, predictions_test, average='macro')
print('F1 macro Score: ', f1_measure)

# Classification report (per-class precision / recall / f1 / support)
print(classification_report(y_test_true, predictions_test))

Train Accuracy 0.8539583277177283
Test Accuracy 0.6958490566037736
F1 macro Score:  0.6789368997694899
              precision    recall  f1-score   support

           0       0.81      0.75      0.78      3911
           1       0.66      0.45      0.54      2908
           2       0.60      0.89      0.72      2456

    accuracy                           0.70      9275
   macro avg       0.69      0.70      0.68      9275
weighted avg       0.71      0.70      0.69      9275



### Count Vector with no stopwords

In [11]:
# Locations of the pickled training matrix and its labels for the
# 'countvector_nostopword' variant, both stored under VECTORIZED_DIR.
vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'train_data_countvector_nostopword.pkl')
train_label_file = os.path.join(VECTORIZED_DIR, 'train_label_countvector_nostopword.pkl')

# NOTE: unpickling can execute arbitrary code -- only load files produced by this project.
with open(vectorized_train_data_file, mode='rb') as features_fh:
    data_train = pkl.load(features_fh)

with open(train_label_file, mode='rb') as labels_fh:
    label_train = pkl.load(labels_fh)

#### Train and Test the model

In [12]:
# Load the pickled vectorizer for the 'countvector_nostopword' variant from CORPUS_DIR.
# It is used with transform() only, so it is presumably already fitted -- confirm upstream.
# NOTE: unpickling can execute arbitrary code -- only load files produced by this project.
vector_file = os.path.join(CORPUS_DIR, 'vector_countvector_nostopword.pkl')
with open(vector_file, mode='rb') as vectorizer_fh:
    vectorizer = pkl.load(vectorizer_fh)

# Vectorize the raw test text with that same vectorizer.
test_texts = df_test[text_col_header]
data_test = vectorizer.transform(test_texts)

In [13]:
# Gaussian Naive Bayes: fit on the training vectors, then label the test vectors.
# Labels are flattened to 1-D for fit(); the matrices are densified with .toarray()
# inline so the dense copies are not kept alive in the notebook namespace.
train_targets = np.ravel(label_train, order='C')
estimator = GaussianNB().fit(data_train.toarray(), train_targets)
predictions_test = estimator.predict(data_test.toarray())

#### Performance metrics

In [15]:
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Ground truth for each split. The training labels are flattened exactly as they
# were when the estimator was fitted.
y_train_true = np.ravel(label_train, order='C')
y_test_true = df_test[label_col_header]

## Accuracy Measure
# Train accuracy is scored against label_train -- the labels stored alongside
# data_train and used for fitting -- so the figure does not silently depend on
# the CSV rows in df_train being in the same order as the pickled matrix.
print('Train Accuracy', estimator.score(data_train.toarray(), y_train_true))
# Test accuracy reuses predictions_test: estimator.score() would densify and
# predict the whole test set a second time to arrive at the same number.
print('Test Accuracy', accuracy_score(y_test_true, predictions_test))

# F1 (unweighted mean over classes)
f1_measure = f1_score(y_test_true, predictions_test, average='macro')
print('F1 macro Score: ', f1_measure)

# Classification report (per-class precision / recall / f1 / support)
print(classification_report(y_test_true, predictions_test))

Train Accuracy 0.8534461845332758
Test Accuracy 0.6953099730458221
F1 macro Score:  0.6783367302233391
              precision    recall  f1-score   support

           0       0.81      0.75      0.78      3911
           1       0.66      0.45      0.54      2908
           2       0.60      0.89      0.72      2456

    accuracy                           0.70      9275
   macro avg       0.69      0.70      0.68      9275
weighted avg       0.71      0.70      0.69      9275



### Count Vector with minimum occurrences of words (min_occurences=2)

In [16]:
# Locations of the pickled training matrix and its labels for the
# 'countvector_minoccurences_2' variant, both stored under VECTORIZED_DIR.
vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'train_data_countvector_minoccurences_2.pkl')
train_label_file = os.path.join(VECTORIZED_DIR, 'train_label_countvector_minoccurences_2.pkl')

# NOTE: unpickling can execute arbitrary code -- only load files produced by this project.
with open(vectorized_train_data_file, mode='rb') as features_fh:
    data_train = pkl.load(features_fh)

with open(train_label_file, mode='rb') as labels_fh:
    label_train = pkl.load(labels_fh)

#### Train and Test the model

In [17]:
# Load the pickled vectorizer for the 'countvector_minoccurences_2' variant from CORPUS_DIR.
# It is used with transform() only, so it is presumably already fitted -- confirm upstream.
# NOTE: unpickling can execute arbitrary code -- only load files produced by this project.
vector_file = os.path.join(CORPUS_DIR, 'vector_countvector_minoccurences_2.pkl')
with open(vector_file, mode='rb') as vectorizer_fh:
    vectorizer = pkl.load(vectorizer_fh)

# Vectorize the raw test text with that same vectorizer.
test_texts = df_test[text_col_header]
data_test = vectorizer.transform(test_texts)

In [18]:
# Gaussian Naive Bayes: fit on the training vectors, then label the test vectors.
# Labels are flattened to 1-D for fit(); the matrices are densified with .toarray()
# inline so the dense copies are not kept alive in the notebook namespace.
train_targets = np.ravel(label_train, order='C')
estimator = GaussianNB().fit(data_train.toarray(), train_targets)
predictions_test = estimator.predict(data_test.toarray())

#### Performance metrics

In [19]:
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Ground truth for each split. The training labels are flattened exactly as they
# were when the estimator was fitted.
y_train_true = np.ravel(label_train, order='C')
y_test_true = df_test[label_col_header]

## Accuracy Measure
# Train accuracy is scored against label_train -- the labels stored alongside
# data_train and used for fitting -- so the figure does not silently depend on
# the CSV rows in df_train being in the same order as the pickled matrix.
print('Train Accuracy', estimator.score(data_train.toarray(), y_train_true))
# Test accuracy reuses predictions_test: estimator.score() would densify and
# predict the whole test set a second time to arrive at the same number.
print('Test Accuracy', accuracy_score(y_test_true, predictions_test))

# F1 (unweighted mean over classes)
f1_measure = f1_score(y_test_true, predictions_test, average='macro')
print('F1 macro Score: ', f1_measure)

# Classification report (per-class precision / recall / f1 / support)
print(classification_report(y_test_true, predictions_test))

Train Accuracy 0.7770020755276422
Test Accuracy 0.6666307277628032
F1 macro Score:  0.6510888209734773
              precision    recall  f1-score   support

           0       0.84      0.69      0.76      3911
           1       0.62      0.43      0.51      2908
           2       0.55      0.91      0.69      2456

    accuracy                           0.67      9275
   macro avg       0.67      0.68      0.65      9275
weighted avg       0.69      0.67      0.66      9275

