In [82]:
import numpy as np
import pandas as pd
import pickle as pkl
import sklearn
import nltk
import os

from nltk.corpus import stopwords 

In [83]:
INPUT_DIR = '../data/input/imperatives/ground_truth/'
OUTPUT_DIR = '../data/'
MODEL_DIR = '../models/'

In [84]:
ip_file = os.path.join(INPUT_DIR, 'imperatives_binary_data.csv')
df_data = pd.read_csv(ip_file)

In [85]:
df_data

Unnamed: 0,text,label
0,Find a sturdy piece of cardboard in the form o...,1
1,Stand up for yourself,1
2,Fix out priorities together in a meeting a co...,1
3,Make one last snowball for the penguin's head,1
4,Look for the internet venue you will use for y...,1
...,...,...
2319,it's a Finnish documentary but it has all thes...,0
2320,"""yeah because you took time when you had your...",0
2321,"""oh come on you're kidding right.""",0
2322,You see.,0


In [54]:
data = df_data['text']
label = df_data['label']

In [55]:
from sklearn.model_selection import train_test_split

data_train, data_test, label_train, label_test = train_test_split(data, label, test_size=0.20, 
                                                                  random_state=0, stratify=label)

In [56]:
from collections import Counter

print('Training Data split', Counter(label_train))
print('Testing Data split', Counter(label_test))

Training Data split Counter({1: 936, 0: 923})
Testing Data split Counter({1: 234, 0: 231})


In [57]:
df_data_train = pd.DataFrame(list(zip(data_train, label_train)), columns=['text', 'label'])
df_data_test = pd.DataFrame(list(zip(data_test, label_test)), columns=['text', 'label'])

In [58]:
df_data_test.head()

Unnamed: 0,text,label
0,and my first paper I got a C on and I'd knew t...,0
1,Shut the door,1
2,I took some summer courses.,0
3,You better be quit!,1
4,So.,0


## Preprocess data

In [59]:
import re

def preprocess_text(text):
    
    # print(text, end ='')
    
    text = text.lower()
    text = re.sub(r'\W',' ', text)
    text = re.sub(' \d+', ' ', text)
    text = re.sub(r'\s+',' ', text)
        
    words = text.split(' ')
    words = [w.strip() for w in words if w not in stopwords.words('english')]
    
    text = ' '.join(words)
    text = text.strip()
        
    # print(text)
    return text

In [64]:
def preprocess_df(df_data):
    df_data['processed_text'] = df_data['text'].apply(lambda x: preprocess_text(x))
    df_data.drop(df_data[df_data['processed_text'] == ''].index, inplace=True)
    
    return df_data

In [65]:
# Preprocess train & test data frame
print('Train before cleaning', df_data_train.shape)
df_data_train = preprocess_df(df_data_train)
print('Train after cleaning', df_data_train.shape)


print('Test before cleaning', df_data_test.shape)
df_data_test = preprocess_df(df_data_test)
print('Test after cleaning', df_data_test.shape)

Train before cleaning (1859, 3)
Train after cleaning (1830, 3)
Test before cleaning (465, 3)
Test after cleaning (456, 3)


In [66]:
df_data_test.head()

Unnamed: 0,text,label,processed_text
0,and my first paper I got a C on and I'd knew t...,0,first paper got c knew story really well
1,Shut the door,1,shut door
2,I took some summer courses.,0,took summer courses
3,You better be quit!,1,better quit
5,Do you also have a SSD we can use.,0,also ssd use


### Create Corpus using only train data

In [68]:
corpus = df_data_train['text'].values
print('Corpus Length ', len(corpus))

Corpus Length  1830


### Vectorization

In [69]:
from sklearn.feature_extraction.text import CountVectorizer

## Use Bag of Words Vectorizer for encoding`
vectorizer = CountVectorizer()
vectorizer.fit(corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

### Vectorization of Train

In [71]:
data_train = vectorizer.transform(df_data_train['text'])
print('Shape of the data train:',data_train.shape)

Shape of the data train: (1830, 3077)


### Train

In [72]:
label_train = np.array(df_data_train['label'])
label_train = label_train.reshape((len(label_train), 1))
label_train

array([[1],
       [0],
       [1],
       ...,
       [0],
       [0],
       [0]])

### Test


In [73]:
data_test = vectorizer.transform(df_data_test['text'])
print('Shape of the data train:',data_test.shape)

Shape of the data train: (456, 3077)


In [74]:
label_test = np.array(df_data_test['label'])
label_test = label_test.reshape((len(label_test), 1))

# Estimator

## SVM with 3-poly kernel

In [75]:
from sklearn.svm import SVC

# Run Logistic Regression
estimator = SVC(kernel='poly', degree=3)
estimator.fit(data_train, label_train)
predictions = estimator.predict(data_test)

predictions

  y = column_or_1d(y, warn=True)


array([0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,

In [76]:
from sklearn.metrics import f1_score


## Accuracy Measure
print('Train Accuracy', estimator.score(data_train, label_train))
print('Test Accuracy', estimator.score(data_test, label_test))

# F1
f1_measure = f1_score(label_test, predictions, average='macro')
print('F1 macro Score: ', f1_measure)

# Classification report
from sklearn.metrics import classification_report
print(classification_report(label_test, predictions))

Train Accuracy 0.7557377049180328
Test Accuracy 0.581140350877193
F1 macro Score:  0.5007308810125712
              precision    recall  f1-score   support

           0       0.82      0.18      0.30       223
           1       0.55      0.96      0.70       233

    accuracy                           0.58       456
   macro avg       0.69      0.57      0.50       456
weighted avg       0.68      0.58      0.51       456



In [81]:
model_file = MODEL_DIR + 'svm/' + 'model_svm_no_aug.pkl'
with open(model_file, 'wb') as f_op:
    pkl.dump(estimator, f_op)

## SVM with linear kernel

In [86]:
from sklearn.svm import SVC

# Run Logistic Regression
estimator = SVC(kernel='linear')
estimator.fit(data_train, label_train)
predictions = estimator.predict(data_test)

predictions

  y = column_or_1d(y, warn=True)


array([0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,

In [87]:
from sklearn.metrics import f1_score


## Accuracy Measure
print('Train Accuracy', estimator.score(data_train, label_train))
print('Test Accuracy', estimator.score(data_test, label_test))

# F1
f1_measure = f1_score(label_test, predictions, average='macro')
print('F1 macro Score: ', f1_measure)

# Classification report
from sklearn.metrics import classification_report
print(classification_report(label_test, predictions))

Train Accuracy 0.9885245901639345
Test Accuracy 0.8070175438596491
F1 macro Score:  0.8070175438596491
              precision    recall  f1-score   support

           0       0.79      0.83      0.81       223
           1       0.83      0.79      0.81       233

    accuracy                           0.81       456
   macro avg       0.81      0.81      0.81       456
weighted avg       0.81      0.81      0.81       456

