In [8]:
import pandas as pd
import numpy as np
import pickle as pkl
from collections import Counter
import os
import re
from os import listdir
from os.path import isfile, join
import sklearn
import nltk
from nltk.corpus import stopwords 

from sklearn.neighbors import KNeighborsClassifier

In [9]:
# Input and output currently resolve to the same ground-truth folder.
INPUT_DIR = OUTPUT_DIR = '../data/input/groundtruth/'

# Sub-folders of OUTPUT_DIR read by the cells below:
#   'corpus'              -> pickled vectorizers
#   'vectorized_trainset' -> pickled training features and labels
CORPUS_DIR, VECTORIZED_DIR = (
    os.path.join(OUTPUT_DIR, subdir) for subdir in ('corpus', 'vectorized_trainset')
)

# Column names used to address the text and label columns of the CSV files.
text_col_header, label_col_header = 'text', 'label'

### Reading test and train data to keep them ready for testing

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Locations of the raw train / test splits inside INPUT_DIR
ip_train_file = os.path.join(INPUT_DIR, 'speechact_train.csv')
ip_test_file = os.path.join(INPUT_DIR, 'speechact_test.csv')

# Load each split and, in the same expression, coerce the text column to str
# and the label column to int so later cells see consistent dtypes.
df_train = pd.read_csv(ip_train_file).astype({text_col_header: str, label_col_header: int})
df_test = pd.read_csv(ip_test_file).astype({text_col_header: str, label_col_header: int})

# K-Nearest Neighbors (KNN)

Requirements
<ul>
<li> Input - Vectorized train data </li>
<li> Input - Vectorized train labels </li>
</ul>

### Count Vector with stopwords

In [11]:
# Read the vectorized train data and train label
vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'train_data_countvector_stopword.pkl')
with open(vectorized_train_data_file, 'rb') as f_ip:
    data_train = pkl.load(f_ip)
    
train_label_file = os.path.join(VECTORIZED_DIR, 'train_label_countvector_stopword.pkl')
with open(train_label_file, 'rb') as f_ip:
    label_train = pkl.load(f_ip)

#### Train and Test the model

In [12]:
# Read the vectorizer to transform the test data
vector_file = os.path.join(CORPUS_DIR, 'vector_countvector_stopword.pkl')
with open(vector_file, 'rb') as f_ip:
    vectorizer = pkl.load(f_ip)

data_test = vectorizer.transform(df_test[text_col_header])

In [13]:
# Run KNN
estimator = KNeighborsClassifier()
estimator.fit(data_train, np.ravel(label_train, order='C'))
predictions_test = estimator.predict(data_test)

#### Performance metrics

In [14]:
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Ground-truth labels.
# Train accuracy is scored against label_train - the labels the estimator was
# actually fitted on alongside data_train - instead of the CSV column, so the
# score cannot silently drift if the CSV rows and the pickled features differ.
labels_train_true = np.ravel(label_train)
labels_test_true = df_test[label_col_header]

## Accuracy Measure
print('Train Accuracy', estimator.score(data_train, labels_train_true))
# Reuse predictions_test: estimator.score(data_test, ...) would repeat the whole
# (slow) KNN prediction only to compute this same mean accuracy.
print('Test Accuracy', accuracy_score(labels_test_true, predictions_test))

# F1 (unweighted mean of the per-class F1 scores)
f1_measure = f1_score(labels_test_true, predictions_test, average='macro')
print('F1 macro Score: ', f1_measure)

# Classification report (per-class precision / recall / F1 / support)
print(classification_report(labels_test_true, predictions_test))

Train Accuracy 0.8180813499016146
Test Accuracy 0.7375741239892183
F1 macro Score:  0.7374369422425437
              precision    recall  f1-score   support

           0       0.92      0.53      0.67      3911
           1       0.74      0.84      0.79      2908
           2       0.63      0.94      0.75      2456

    accuracy                           0.74      9275
   macro avg       0.76      0.77      0.74      9275
weighted avg       0.78      0.74      0.73      9275



### Count Vector with no stopwords

In [15]:
# Read the vectorized train data and train label
vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'train_data_countvector_nostopword.pkl')
with open(vectorized_train_data_file, 'rb') as f_ip:
    data_train = pkl.load(f_ip)
    
train_label_file = os.path.join(VECTORIZED_DIR, 'train_label_countvector_nostopword.pkl')
with open(train_label_file, 'rb') as f_ip:
    label_train = pkl.load(f_ip)

#### Train and Test the model

In [16]:
# Read the vectorizer to transform the test data
vector_file = os.path.join(CORPUS_DIR, 'vector_countvector_nostopword.pkl')
with open(vector_file, 'rb') as f_ip:
    vectorizer = pkl.load(f_ip)

data_test = vectorizer.transform(df_test[text_col_header])

In [17]:
# Run KNN
estimator = KNeighborsClassifier()
estimator.fit(data_train, np.ravel(label_train, order='C'))
predictions_test = estimator.predict(data_test)

#### Performance metrics

In [18]:
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Ground-truth labels.
# Train accuracy is scored against label_train - the labels the estimator was
# actually fitted on alongside data_train - instead of the CSV column, so the
# score cannot silently drift if the CSV rows and the pickled features differ.
labels_train_true = np.ravel(label_train)
labels_test_true = df_test[label_col_header]

## Accuracy Measure
print('Train Accuracy', estimator.score(data_train, labels_train_true))
# Reuse predictions_test: estimator.score(data_test, ...) would repeat the whole
# (slow) KNN prediction only to compute this same mean accuracy.
print('Test Accuracy', accuracy_score(labels_test_true, predictions_test))

# F1 (unweighted mean of the per-class F1 scores)
f1_measure = f1_score(labels_test_true, predictions_test, average='macro')
print('F1 macro Score: ', f1_measure)

# Classification report (per-class precision / recall / F1 / support)
print(classification_report(labels_test_true, predictions_test))

Train Accuracy 0.6491280088412087
Test Accuracy 0.4821563342318059
F1 macro Score:  0.4638170698812691
              precision    recall  f1-score   support

           0       0.68      0.19      0.30      3911
           1       0.42      0.52      0.46      2908
           2       0.49      0.90      0.63      2456

    accuracy                           0.48      9275
   macro avg       0.53      0.54      0.46      9275
weighted avg       0.54      0.48      0.44      9275



### Count Vector with minimum occurrences of words (min_occurences=2)

In [19]:
# Read the vectorized train data and train label
vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'train_data_countvector_minoccurences_2.pkl')
with open(vectorized_train_data_file, 'rb') as f_ip:
    data_train = pkl.load(f_ip)
    
train_label_file = os.path.join(VECTORIZED_DIR, 'train_label_countvector_minoccurences_2.pkl')
with open(train_label_file, 'rb') as f_ip:
    label_train = pkl.load(f_ip)

#### Train and Test the model

In [20]:
# Read the vectorizer to transform the test data
vector_file = os.path.join(CORPUS_DIR, 'vector_countvector_minoccurences_2.pkl')
with open(vector_file, 'rb') as f_ip:
    vectorizer = pkl.load(f_ip)

data_test = vectorizer.transform(df_test[text_col_header])

In [21]:
# Run KNN
estimator = KNeighborsClassifier()
estimator.fit(data_train, np.ravel(label_train, order='C'))
predictions_test = estimator.predict(data_test)

#### Performance metrics

In [22]:
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Ground-truth labels.
# Train accuracy is scored against label_train - the labels the estimator was
# actually fitted on alongside data_train - instead of the CSV column, so the
# score cannot silently drift if the CSV rows and the pickled features differ.
labels_train_true = np.ravel(label_train)
labels_test_true = df_test[label_col_header]

## Accuracy Measure
print('Train Accuracy', estimator.score(data_train, labels_train_true))
# Reuse predictions_test: estimator.score(data_test, ...) would repeat the whole
# (slow) KNN prediction only to compute this same mean accuracy.
print('Test Accuracy', accuracy_score(labels_test_true, predictions_test))

# F1 (unweighted mean of the per-class F1 scores)
f1_measure = f1_score(labels_test_true, predictions_test, average='macro')
print('F1 macro Score: ', f1_measure)

# Classification report (per-class precision / recall / F1 / support)
print(classification_report(labels_test_true, predictions_test))

Train Accuracy 0.685274535701771
Test Accuracy 0.554177897574124
F1 macro Score:  0.5619758483845737
              precision    recall  f1-score   support

           0       0.64      0.31      0.42      3911
           1       0.43      0.65      0.52      2908
           2       0.69      0.83      0.75      2456

    accuracy                           0.55      9275
   macro avg       0.59      0.60      0.56      9275
weighted avg       0.59      0.55      0.54      9275

