In [6]:
import pandas as pd
import numpy as np
import pickle as pkl
from collections import Counter
import os
import re
from os import listdir
from os.path import isfile, join
import sklearn
import nltk
from nltk.corpus import stopwords 

from sklearn.svm import SVC

In [2]:
# The ground-truth folder is both the place the CSVs are read from and the
# root under which the derived pickles are looked up.
INPUT_DIR = OUTPUT_DIR = '../data/input/groundtruth/'

# Sub-folders of OUTPUT_DIR: pickled vectorizers and vectorized train sets.
CORPUS_DIR, VECTORIZED_DIR = (
    os.path.join(OUTPUT_DIR, subdir) for subdir in ('corpus', 'vectorized_trainset')
)

# Column names shared by the train and test CSV files.
text_col_header, label_col_header = 'text', 'label'

### Reading test and train data to keep them ready for testing

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Both CSVs are cast to the same schema: text as str, label as int.
column_dtypes = {text_col_header: str, label_col_header: int}

# Read Train data
ip_train_file = os.path.join(INPUT_DIR, 'speechact_train.csv')
df_train = pd.read_csv(ip_train_file).astype(column_dtypes)

# Read Test data
ip_test_file = os.path.join(INPUT_DIR, 'speechact_test.csv')
df_test = pd.read_csv(ip_test_file).astype(column_dtypes)

# Support Vector Machines

**Requirements**
- Input: vectorized train data
- Input: vectorized train labels

### Counter Vector with stopwords

In [4]:
# Read the vectorized train data and train label
vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'train_data_countvector_stopword.pkl')
with open(vectorized_train_data_file, 'rb') as f_ip:
    data_train = pkl.load(f_ip)
    
train_label_file = os.path.join(VECTORIZED_DIR, 'train_label_countvector_stopword.pkl')
with open(train_label_file, 'rb') as f_ip:
    label_train = pkl.load(f_ip)

#### Testing

In [5]:
# Read the vectorizer to transform the test data
vector_file = os.path.join(CORPUS_DIR, 'vector_countvector_stopword.pkl')
with open(vector_file, 'rb') as f_ip:
    vectorizer = pkl.load(f_ip)

data_test = vectorizer.transform(df_test[text_col_header])

In [7]:
# Run SVM
estimator = SVC(kernel='poly', degree=3)
estimator.fit(data_train, np.ravel(label_train, order='C'))
predictions_test = estimator.predict(data_test)

#### Performance metrics

In [8]:
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Score the train set against the labels the estimator was actually fitted on
# (label_train) rather than assuming df_train is row-aligned with the pickle.
y_train_true = np.ravel(label_train)
y_test_true = df_test[label_col_header]

## Accuracy Measure
print('Train Accuracy', estimator.score(data_train, y_train_true))
# Reuse predictions_test instead of estimator.score(data_test, ...): score()
# would run the SVM prediction over the whole test set a second time.
print('Test Accuracy', accuracy_score(y_test_true, predictions_test))

# F1
f1_measure = f1_score(y_test_true, predictions_test, average='macro')
print('F1 macro Score: ', f1_measure)

# Classification report
print(classification_report(y_test_true, predictions_test))

Train Accuracy 0.879727216367018
Test Accuracy 0.812722371967655
F1 macro Score:  0.8149837538601369
              precision    recall  f1-score   support

           0       0.95      0.73      0.82      3911
           1       0.94      0.81      0.87      2908
           2       0.62      0.95      0.75      2456

    accuracy                           0.81      9275
   macro avg       0.84      0.83      0.81      9275
weighted avg       0.86      0.81      0.82      9275



### Counter Vector with no stopwords

In [9]:
# Read the vectorized train data and train label
vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'train_data_countvector_nostopword.pkl')
with open(vectorized_train_data_file, 'rb') as f_ip:
    data_train = pkl.load(f_ip)
    
train_label_file = os.path.join(VECTORIZED_DIR, 'train_label_countvector_nostopword.pkl')
with open(train_label_file, 'rb') as f_ip:
    label_train = pkl.load(f_ip)

#### Testing

In [10]:
# Read the vectorizer to transform the test data
vector_file = os.path.join(CORPUS_DIR, 'vector_countvector_nostopword.pkl')
with open(vector_file, 'rb') as f_ip:
    vectorizer = pkl.load(f_ip)

data_test = vectorizer.transform(df_test[text_col_header])

In [11]:
# Run SVM
estimator = SVC(kernel='poly', degree=3)
estimator.fit(data_train, np.ravel(label_train, order='C'))
predictions_test = estimator.predict(data_test)

#### Performance metrics

In [12]:
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Score the train set against the labels the estimator was actually fitted on
# (label_train) rather than assuming df_train is row-aligned with the pickle.
y_train_true = np.ravel(label_train)
y_test_true = df_test[label_col_header]

## Accuracy Measure
print('Train Accuracy', estimator.score(data_train, y_train_true))
# Reuse predictions_test instead of estimator.score(data_test, ...): score()
# would run the SVM prediction over the whole test set a second time.
print('Test Accuracy', accuracy_score(y_test_true, predictions_test))

# F1
f1_measure = f1_score(y_test_true, predictions_test, average='macro')
print('F1 macro Score: ', f1_measure)

# Classification report
print(classification_report(y_test_true, predictions_test))

Train Accuracy 0.793660206474568
Test Accuracy 0.5651752021563342
F1 macro Score:  0.5647069475440422
              precision    recall  f1-score   support

           0       0.98      0.31      0.47      3911
           1       0.42      0.99      0.59      2908
           2       0.97      0.47      0.64      2456

    accuracy                           0.57      9275
   macro avg       0.79      0.59      0.56      9275
weighted avg       0.80      0.57      0.55      9275



### Counter Vector with minimum occurrences of words (min_occurences=2)

In [13]:
# Read the vectorized train data and train label
vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'train_data_countvector_minoccurences_2.pkl')
with open(vectorized_train_data_file, 'rb') as f_ip:
    data_train = pkl.load(f_ip)
    
train_label_file = os.path.join(VECTORIZED_DIR, 'train_label_countvector_minoccurences_2.pkl')
with open(train_label_file, 'rb') as f_ip:
    label_train = pkl.load(f_ip)

#### Testing

In [14]:
# Read the vectorizer to transform the test data
vector_file = os.path.join(CORPUS_DIR, 'vector_countvector_minoccurences_2.pkl')
with open(vector_file, 'rb') as f_ip:
    vectorizer = pkl.load(f_ip)

data_test = vectorizer.transform(df_test[text_col_header])

In [15]:
# Run SVM
estimator = SVC(kernel='poly', degree=3)
estimator.fit(data_train, np.ravel(label_train, order='C'))
predictions_test = estimator.predict(data_test)

#### Performance metrics

In [16]:
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Score the train set against the labels the estimator was actually fitted on
# (label_train) rather than assuming df_train is row-aligned with the pickle.
y_train_true = np.ravel(label_train)
y_test_true = df_test[label_col_header]

## Accuracy Measure
print('Train Accuracy', estimator.score(data_train, y_train_true))
# Reuse predictions_test instead of estimator.score(data_test, ...): score()
# would run the SVM prediction over the whole test set a second time.
print('Test Accuracy', accuracy_score(y_test_true, predictions_test))

# F1
f1_measure = f1_score(y_test_true, predictions_test, average='macro')
print('F1 macro Score: ', f1_measure)

# Classification report
print(classification_report(y_test_true, predictions_test))

Train Accuracy 0.795061861505701
Test Accuracy 0.5818867924528301
F1 macro Score:  0.5838821222446648
              precision    recall  f1-score   support

           0       0.97      0.34      0.50      3911
           1       0.43      0.98      0.60      2908
           2       0.96      0.49      0.65      2456

    accuracy                           0.58      9275
   macro avg       0.79      0.60      0.58      9275
weighted avg       0.80      0.58      0.57      9275

