In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
from collections import Counter
import os
import re
from os import listdir
from os.path import isfile, join
import sklearn
import nltk
from nltk.corpus import stopwords 

from sklearn.linear_model import LogisticRegression


In [2]:
# Notebook configuration: data locations and CSV column names.

# The train/test CSVs are read from INPUT_DIR; the pickled artefacts are
# looked up under OUTPUT_DIR, which currently points at the same folder.
INPUT_DIR = '../data/input/groundtruth/'
OUTPUT_DIR = '../data/input/groundtruth/'

# Sub-folders of OUTPUT_DIR: pickled vectorizers live in CORPUS_DIR, the
# pickled vectorized training matrices/labels in VECTORIZED_DIR.
CORPUS_DIR, VECTORIZED_DIR = (
    os.path.join(OUTPUT_DIR, subdir) for subdir in ('corpus', 'vectorized_trainset')
)

# Column names used when reading the train/test CSV files.
text_col_header, label_col_header = 'text', 'label'

### Reading test and train data to keep them ready for testing

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Load the train and test CSVs from INPUT_DIR, casting the text column to
# str and the label column to int in both frames.
ip_train_file = os.path.join(INPUT_DIR, 'speechact_train.csv')
ip_test_file = os.path.join(INPUT_DIR, 'speechact_test.csv')

df_train = pd.read_csv(ip_train_file).astype({text_col_header: str, label_col_header: int})
df_test = pd.read_csv(ip_test_file).astype({text_col_header: str, label_col_header: int})

# Logistic Regression

Requirements
<ul>
<li> Input - vectorized train data </li>
<li> Input - vectorized train labels </li>
</ul>

### Counter Vector with stopwords

In [4]:
# Locate the pickled training matrix and its labels for the
# 'countvector_stopword' configuration.
vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'train_data_countvector_stopword.pkl')
train_label_file = os.path.join(VECTORIZED_DIR, 'train_label_countvector_stopword.pkl')

# NOTE: unpickling can execute arbitrary code; only load files produced by
# this project.
with open(vectorized_train_data_file, mode='rb') as f_ip:
    data_train = pkl.load(f_ip)

with open(train_label_file, mode='rb') as f_ip:
    label_train = pkl.load(f_ip)

#### Testing

In [5]:
# Load the pickled 'countvector_stopword' vectorizer and apply it to the
# text column of the test frame.
vector_file = os.path.join(CORPUS_DIR, 'vector_countvector_stopword.pkl')
with open(vector_file, mode='rb') as f_ip:
    vectorizer = pkl.load(f_ip)

test_sentences = df_test[text_col_header]
data_test = vectorizer.transform(test_sentences)

In [6]:
# Fit a logistic-regression classifier on the vectorized training set, then
# predict labels for the vectorized test set.
estimator = LogisticRegression(max_iter=1000)

# The pickled labels are flattened to 1-D before being handed to fit().
train_targets = np.ravel(label_train, order='C')
estimator.fit(data_train, train_targets)

predictions_test = estimator.predict(data_test)

#### Performance metrics

In [46]:
from sklearn.metrics import classification_report, f1_score

## Accuracy Measure
# Train accuracy is scored against `label_train`, i.e. exactly the labels the
# estimator was fitted on and that were pickled together with `data_train`.
# `df_train` is re-read from CSV and is not guaranteed to match the pickled
# matrix row-for-row, so it must not be used as ground truth here.
print('Train Accuracy', estimator.score(data_train, np.ravel(label_train, order='C')))
print('Test Accuracy', estimator.score(data_test, df_test[label_col_header]))

# F1 (macro-averaged over the classes)
f1_measure = f1_score(df_test[label_col_header], predictions_test, average='macro')
print('F1 macro Score: ', f1_measure)

# Classification report (per-class precision / recall / f1 / support)
print(classification_report(df_test[label_col_header], predictions_test))

Train Accuracy 0.9875737890509179
Test Accuracy 0.9392991913746631
F1 macro Score:  0.9383546832567413
              precision    recall  f1-score   support

           0       0.95      0.94      0.95      3911
           1       0.94      0.93      0.94      2908
           2       0.92      0.94      0.93      2456

    accuracy                           0.94      9275
   macro avg       0.94      0.94      0.94      9275
weighted avg       0.94      0.94      0.94      9275



### Store estimator and run sample sentence

In [10]:
# Persist the fitted estimator so it can be reloaded for single-sentence
# predictions.
ESTIMATOR_DIR = '../models/'
estimator_file = os.path.join(ESTIMATOR_DIR, 'estimator_speechact.pkl')

# Create the target folder when it is missing so this cell also succeeds on a
# fresh checkout instead of raising FileNotFoundError.
os.makedirs(ESTIMATOR_DIR, exist_ok=True)

with open(estimator_file, 'wb') as f_out:
    pkl.dump(estimator, f_out)

#### Sample sentence

In [7]:
import re

def preprocess_text(text):
    """Normalise a raw sentence before it is vectorized.

    The text is lower-cased, every non-word character is replaced by a
    space, digit runs that directly follow a space are dropped, whitespace
    runs are collapsed to a single space and the result is trimmed.

    Args:
        text: Sentence to clean. ``None`` is treated as an empty string.

    Returns:
        str: The cleaned sentence; ``''`` when nothing is left.
    """
    if text is None:
        return ''

    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    # Raw string on purpose: '\d' inside a plain literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    # Only digits preceded by a space are removed; digits at the very start
    # of the text or glued to a word (e.g. 'abc123') are kept.
    text = re.sub(r' \d+', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    # After the collapse above only single spaces remain, so trimming both
    # ends is all that is left to do.
    return text.strip()

def text_to_speechact(text, estimator_file, embedding_file, corpus_file=None):
    """Predict the speech act of a single sentence.

    The sentence is cleaned with ``preprocess_text``, vectorized with the
    pickled vectorizer and classified with the pickled estimator.

    Args:
        text: Raw input sentence.
        estimator_file: Path to the pickled, fitted estimator.
        embedding_file: Path to the pickled, fitted vectorizer.
        corpus_file: Unused; kept (now optional) for backward compatibility.

    Returns:
        str: ``'statement'``, ``'interrogative'`` or ``'imperative'`` for
        predicted labels 0, 1 and 2 respectively. The literal string
        ``'None'`` is returned when the cleaned text is empty or the
        predicted label is not one of those three.
    """
    # Predicted integer label -> speech-act name.
    speechact_by_label = {
        0: 'statement',
        1: 'interrogative',
        2: 'imperative',
    }

    text = preprocess_text(text).strip()
    if not text:
        return 'None'

    # The two loads below could be moved into the application's
    # initialization to avoid re-reading the files on every call.
    # NOTE(security): unpickling can execute arbitrary code; only pass paths
    # to files produced by this project.
    with open(embedding_file, 'rb') as f_ip:
        vectorizer = pkl.load(f_ip)

    with open(estimator_file, 'rb') as f_ip:
        estimator = pkl.load(f_ip)

    # The vectorizer expects an iterable of documents, hence the one-item list.
    text_vector = vectorizer.transform([text])
    pred_val = estimator.predict(text_vector)[0]

    # Efficiency: callers could work with the integer label and convert to a
    # string at the last responsible moment.
    return speechact_by_label.get(pred_val, 'None')

In [14]:
def speechact_wrapper(sample_sentence):
    """Classify ``sample_sentence`` with the stored speech-act model.

    Thin convenience wrapper around ``text_to_speechact`` that supplies the
    paths of the pickled estimator and of the pickled
    'countvector_stopword' vectorizer.

    Args:
        sample_sentence: Raw sentence to classify.

    Returns:
        Whatever ``text_to_speechact`` returns for the sentence.
    """
    return text_to_speechact(
        text=sample_sentence,
        estimator_file='../models/estimator_speechact.pkl',
        embedding_file='../data/input/groundtruth/corpus/vector_countvector_stopword.pkl',
        corpus_file=None,
    )

# Smoke-test the stored model on two example sentences.
for demo_sentence in ('What is the name of the person', 'Go away'):
    print(speechact_wrapper(demo_sentence))

interrogative
imperative


### Counter Vector with no stopwords

In [47]:
# Locate the pickled training matrix and its labels for the
# 'countvector_nostopword' configuration.
vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'train_data_countvector_nostopword.pkl')
train_label_file = os.path.join(VECTORIZED_DIR, 'train_label_countvector_nostopword.pkl')

# NOTE: unpickling can execute arbitrary code; only load files produced by
# this project.
with open(vectorized_train_data_file, mode='rb') as f_ip:
    data_train = pkl.load(f_ip)

with open(train_label_file, mode='rb') as f_ip:
    label_train = pkl.load(f_ip)

#### Testing

In [48]:
# Load the pickled 'countvector_nostopword' vectorizer and apply it to the
# text column of the test frame.
vector_file = os.path.join(CORPUS_DIR, 'vector_countvector_nostopword.pkl')
with open(vector_file, mode='rb') as f_ip:
    vectorizer = pkl.load(f_ip)

test_sentences = df_test[text_col_header]
data_test = vectorizer.transform(test_sentences)

In [49]:
# Fit a logistic-regression classifier on the vectorized training set, then
# predict labels for the vectorized test set.
estimator = LogisticRegression(max_iter=1000)

# The pickled labels are flattened to 1-D before being handed to fit().
train_targets = np.ravel(label_train, order='C')
estimator.fit(data_train, train_targets)

predictions_test = estimator.predict(data_test)

#### Performance metrics

In [50]:
from sklearn.metrics import classification_report, f1_score

## Accuracy Measure
# Train accuracy is scored against `label_train`, i.e. exactly the labels the
# estimator was fitted on and that were pickled together with `data_train`.
# `df_train` is re-read from CSV and is not guaranteed to match the pickled
# matrix row-for-row, so it must not be used as ground truth here.
print('Train Accuracy', estimator.score(data_train, np.ravel(label_train, order='C')))
print('Test Accuracy', estimator.score(data_test, df_test[label_col_header]))

# F1 (macro-averaged over the classes)
f1_measure = f1_score(df_test[label_col_header], predictions_test, average='macro')
print('F1 macro Score: ', f1_measure)

# Classification report (per-class precision / recall / f1 / support)
print(classification_report(df_test[label_col_header], predictions_test))

Train Accuracy 0.9605380198927195
Test Accuracy 0.8659838274932614
F1 macro Score:  0.8661387567725706
              precision    recall  f1-score   support

           0       0.91      0.87      0.89      3911
           1       0.79      0.85      0.82      2908
           2       0.91      0.88      0.89      2456

    accuracy                           0.87      9275
   macro avg       0.87      0.87      0.87      9275
weighted avg       0.87      0.87      0.87      9275



### Counter Vector with minimum occurrences of words (min_occurences=2)

In [51]:
# Locate the pickled training matrix and its labels for the
# 'countvector_minoccurences_2' configuration.
vectorized_train_data_file = os.path.join(VECTORIZED_DIR, 'train_data_countvector_minoccurences_2.pkl')
train_label_file = os.path.join(VECTORIZED_DIR, 'train_label_countvector_minoccurences_2.pkl')

# NOTE: unpickling can execute arbitrary code; only load files produced by
# this project.
with open(vectorized_train_data_file, mode='rb') as f_ip:
    data_train = pkl.load(f_ip)

with open(train_label_file, mode='rb') as f_ip:
    label_train = pkl.load(f_ip)

#### Testing

In [52]:
# Load the pickled 'countvector_minoccurences_2' vectorizer and apply it to
# the text column of the test frame.
vector_file = os.path.join(CORPUS_DIR, 'vector_countvector_minoccurences_2.pkl')
with open(vector_file, mode='rb') as f_ip:
    vectorizer = pkl.load(f_ip)

test_sentences = df_test[text_col_header]
data_test = vectorizer.transform(test_sentences)

In [53]:
# Fit a logistic-regression classifier on the vectorized training set, then
# predict labels for the vectorized test set.
estimator = LogisticRegression(max_iter=1000)

# The pickled labels are flattened to 1-D before being handed to fit().
train_targets = np.ravel(label_train, order='C')
estimator.fit(data_train, train_targets)

predictions_test = estimator.predict(data_test)

#### Performance metrics

In [54]:
from sklearn.metrics import classification_report, f1_score

## Accuracy Measure
# Train accuracy is scored against `label_train`, i.e. exactly the labels the
# estimator was fitted on and that were pickled together with `data_train`.
# `df_train` is re-read from CSV and is not guaranteed to match the pickled
# matrix row-for-row, so it must not be used as ground truth here.
print('Train Accuracy', estimator.score(data_train, np.ravel(label_train, order='C')))
print('Test Accuracy', estimator.score(data_test, df_test[label_col_header]))

# F1 (macro-averaged over the classes)
f1_measure = f1_score(df_test[label_col_header], predictions_test, average='macro')
print('F1 macro Score: ', f1_measure)

# Classification report (per-class precision / recall / f1 / support)
print(classification_report(df_test[label_col_header], predictions_test))

Train Accuracy 0.9453624086902612
Test Accuracy 0.8636118598382749
F1 macro Score:  0.863375437946586
              precision    recall  f1-score   support

           0       0.91      0.87      0.89      3911
           1       0.78      0.85      0.81      2908
           2       0.90      0.87      0.89      2456

    accuracy                           0.86      9275
   macro avg       0.86      0.86      0.86      9275
weighted avg       0.87      0.86      0.86      9275

