In [22]:
import numpy as np
import pandas as pd
import pickle as pkl
import sklearn
import nltk
import os

from nltk.corpus import stopwords 
from collections import Counter


In [2]:
INPUT_DIR = '../data/input/imperatives/ground_truth/'
OUTPUT_DIR = '../data/'
MODEL_DIR = '../models/'

In [3]:
ip_file = os.path.join(INPUT_DIR, 'imperatives_binary_data.csv')
df_data_raw = pd.read_csv(ip_file)

In [4]:
df_data_raw

Unnamed: 0,text,label
0,Find a sturdy piece of cardboard in the form o...,1
1,Stand up for yourself,1
2,Fix out priorities together in a meeting a co...,1
3,Make one last snowball for the penguin's head,1
4,Look for the internet venue you will use for y...,1
...,...,...
2319,it's a Finnish documentary but it has all thes...,0
2320,yeah because you took time when you had your ...,0
2321,oh come on you're kidding right.,0
2322,You see.,0


In [5]:
from nltk.corpus import wordnet

def get_synonyms(word):
    synonyms = []
    # antonyms = []

    for syn in wordnet.synsets(word):
        for l in syn.lemmas():
            synonyms.append(l.name())
                
    return list(set(synonyms))

In [6]:
import re

def replace_synonym(sentence, word):
    
    augmented_sentences = []
    augmented_sentences.append(sentence)
    
    sentence_words = sentence.split(' ')
    
    if word in sentence_words:
        
        synonyms = get_synonyms(word)
    
        for synonym in synonyms:
            reg_ex = r'\b'+word+r'\b'
            new_setence = re.sub(reg_ex, synonym, sentence)
        
            augmented_sentences.append(new_setence)
        
    return augmented_sentences

In [7]:
sentence = 'This is a good phone'
word = 'is'
print(replace_synonym(sentence, word))

['This is a good phone', 'This constitute a good phone', 'This exist a good phone', 'This embody a good phone', 'This be a good phone', 'This represent a good phone', 'This follow a good phone', 'This comprise a good phone', 'This equal a good phone', 'This personify a good phone', 'This cost a good phone', 'This live a good phone', 'This make_up a good phone']


In [8]:
def synonym_augmentation(sentence):
    
    sentences = []
    
    words = sentence.split(' ')
    
    for word in words:
        # print('---------------------\n', word)
        new_sentences = replace_synonym(sentence, word)
        # print(new_sentences)
        sentences.extend(new_sentences)
        
    return list(set(sentences))

In [9]:
def synonym_augmentation_withoutstopwords(sentence):
    
    sentences = []
    
    words = sentence.split(' ')
    
    for word in words:
        # print('---------------------\n', word)
        if word not in stopwords.words('english'):
            new_sentences = replace_synonym(sentence, word)
            # print(new_sentences)
            sentences.extend(new_sentences)
        
    return list(set(sentences))

In [10]:
sentence = 'This is a good phone'
print(synonym_augmentation_withoutstopwords(sentence))

['This is a in_effect phone', 'This is a good sound', 'This is a goodness phone', 'This is a skillful phone', 'This is a good earphone', 'This is a adept phone', 'This is a good phone', 'This is a honest phone', 'This is a good earpiece', 'This is a proficient phone', 'This is a dependable phone', 'This is a near phone', 'This is a honorable phone', 'This is a trade_good phone', 'This is a good telephone_set', 'This is a good speech_sound', 'This is a dear phone', 'This is a upright phone', 'This is a sound phone', 'This is a respectable phone', 'This is a soundly phone', 'This is a full phone', 'This is a safe phone', 'This is a secure phone', 'This is a salutary phone', 'This is a thoroughly phone', 'This is a good headphone', 'This is a estimable phone', 'This is a just phone', 'This is a serious phone', 'This is a good ring', 'This is a beneficial phone', 'This is a well phone', 'This is a good telephone', 'This is a expert phone', 'This is a ripe phone', 'This is a effective phone

In [11]:
df_imperative = df_data_raw.loc[df_data_raw['label'] == 1]
df_nonimperative = df_data_raw.loc[df_data_raw['label'] == 0]

In [16]:
def get_augmented_dataframe(sentence, label):
    
    augmented_sentences = synonym_augmentation_withoutstopwords(sentence)
    labels = [label] * len(augmented_sentences)
    
    df = pd.DataFrame(list(zip(augmented_sentences, labels)), columns=['text', 'label'])
    
    return df

In [17]:
df_imperative.head()

Unnamed: 0,text,label
0,Find a sturdy piece of cardboard in the form o...,1
1,Stand up for yourself,1
2,Fix out priorities together in a meeting a co...,1
3,Make one last snowball for the penguin's head,1
4,Look for the internet venue you will use for y...,1


In [18]:
df_aug_imperative = pd.DataFrame(columns=['text', 'label'])
for index, row in df_imperative.iterrows():
    
    sentence = row['text']
    label = row['label']
    
    df = get_augmented_dataframe(sentence, label)
    
    df_aug_imperative = pd.concat([df_aug_imperative, df], axis=0, sort=False)

In [19]:
df_aug_imperative.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66115 entries, 0 to 11
Data columns (total 2 columns):
text     66115 non-null object
label    66115 non-null object
dtypes: object(2)
memory usage: 1.5+ MB


In [23]:
Counter(df_aug_imperative['label'])

Counter({1: 66115})

In [24]:
df_aug_imperative.head()

Unnamed: 0,text,label
0,Find a sturdy piece of cardboard in the frame ...,1
1,Find a hardy piece of cardboard in the form of...,1
2,Find a sturdy piece of cardboard in the soma o...,1
3,Find a sturdy piece of cardboard in the form o...,1
4,Find a sturdy piece of cardboard in the class ...,1


In [25]:
df_aug_nonimperative = pd.DataFrame(columns=['text', 'label'])
for index, row in df_nonimperative.iterrows():
    
    sentence = row['text']
    label = row['label']
    
    df = get_augmented_dataframe(sentence, label)
    
    df_aug_nonimperative = pd.concat([df_aug_nonimperative, df], axis=0, sort=False)

In [26]:
Counter(df_aug_nonimperative['label'])

Counter({0: 58764})

In [27]:
df_aug_nonimperative.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58764 entries, 0 to 63
Data columns (total 2 columns):
text     58764 non-null object
label    58764 non-null object
dtypes: object(2)
memory usage: 1.3+ MB


In [28]:
df_aug_nonimperative.head()

Unnamed: 0,text,label
0,A cockroach will endure nine days without it’s...,0
1,A cockroach will live nine sidereal_day withou...,0
2,A cockroach will lively nine days without it’s...,0
3,A cockroach will live ix days without it’s hea...,0
4,A cockroach will springy nine days without it’...,0


In [29]:
df_data = pd.concat([df_aug_nonimperative, df_aug_imperative], axis=0, sort=False)

In [32]:
ip_file = os.path.join(INPUT_DIR, 'imperatives_binary_synonym_aug.csv')
df_data.to_csv(ip_file, index=False)

In [33]:
Counter(df_data['label'])

Counter({0: 58764, 1: 66115})

In [34]:
data = df_data['text']
label = df_data['label']

In [59]:
from sklearn.model_selection import train_test_split

data_train_text, data_test_text, label_train, label_test = train_test_split(data, label, test_size=0.20, 
                                                                  random_state=0, stratify=label)

In [60]:
print('Training Data split', Counter(label_train))
print('Testing Data split', Counter(label_test))

Training Data split Counter({1: 52892, 0: 47011})
Testing Data split Counter({1: 13223, 0: 11753})


In [61]:
df_data_train = pd.DataFrame(list(zip(data_train_text, label_train)), columns=['text', 'label'])
df_data_test = pd.DataFrame(list(zip(data_test_text, label_test)), columns=['text', 'label'])

In [62]:
df_data_test.head()

Unnamed: 0,text,label
0,Fix out priorities together in a meeting a co...,1
1,and smell at one of those you get more than o...,0
2,Hmmm she says Then why are you calling here ...,0
3,It's like hold_out backwards.,0
4,Differentiate age genders and skill levels t...,1


## Preprocess data

In [63]:
import re

def preprocess_text(text):
    
    # print(text, end ='')
    
    text = text.lower()
    text = re.sub(r'\W',' ', text)
    text = re.sub(' \d+', ' ', text)
    text = re.sub(r'\s+',' ', text)
        
    words = text.split(' ')
    words = [w.strip() for w in words if w not in stopwords.words('english')]
    
    text = ' '.join(words)
    text = text.strip()
        
    # print(text)
    return text

In [64]:
def preprocess_df(df_data):
    df_data['processed_text'] = df_data['text'].apply(lambda x: preprocess_text(x))
    df_data.drop(df_data[df_data['processed_text'] == ''].index, inplace=True)
    
    return df_data

In [41]:
# Preprocess train & test data frame
print('Train before cleaning', df_data_train.shape)
df_data_train = preprocess_df(df_data_train)
print('Train after cleaning', df_data_train.shape)


print('Test before cleaning', df_data_test.shape)
df_data_test = preprocess_df(df_data_test)
print('Test after cleaning', df_data_test.shape)

Train before cleaning (99903, 2)
Train after cleaning (99853, 3)
Test before cleaning (24976, 2)
Test after cleaning (24957, 3)


In [42]:
df_data_test.head()

Unnamed: 0,text,label,processed_text
0,Fix out priorities together in a meeting a co...,1,fix priorities together meeting couple weeks s...
1,and smell at one of those you get more than o...,0,smell one get one frequency
2,Hmmm she says Then why are you calling here ...,0,hmmm says calling travel past broad
3,It's like hold_out backwards.,0,like hold_out backwards
4,Differentiate age genders and skill levels t...,1,differentiate age genders skill levels determi...


### Create Corpus using only train data

In [43]:
corpus = df_data_train['text'].values
print('Corpus Length ', len(corpus))

Corpus Length  99853


### Vectorization

In [44]:
from sklearn.feature_extraction.text import CountVectorizer

## Use Bag of Words Vectorizer for encoding`
vectorizer = CountVectorizer()
vectorizer.fit(corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

### Vectorization of Train

In [45]:
data_train = vectorizer.transform(df_data_train['text'])
print('Shape of the data train:',data_train.shape)

Shape of the data train: (99853, 14490)


### Train

In [46]:
label_train = np.array(df_data_train['label'])
label_train = label_train.reshape((len(label_train), 1))
label_train

array([[0],
       [1],
       [0],
       ...,
       [0],
       [1],
       [0]])

### Test


In [47]:
data_test = vectorizer.transform(df_data_test['text'])
print('Shape of the data train:',data_test.shape)

Shape of the data train: (24957, 14490)


In [48]:
label_test = np.array(df_data_test['label'])
label_test = label_test.reshape((len(label_test), 1))

# Estimator

## SVM with poly (3) kernel - no stopwords

In [130]:
from sklearn.svm import SVC

# Run Logistic Regression
estimator = SVC(kernel='poly', degree=3)
estimator.fit(data_train, label_train)
predictions = estimator.predict(data_test)

predictions

  y = column_or_1d(y, warn=True)


array([1, 1, 1, ..., 1, 1, 1])

In [131]:
from sklearn.metrics import f1_score


## Accuracy Measure
print('Train Accuracy', estimator.score(data_train, label_train))
print('Test Accuracy', estimator.score(data_test, label_test))

# F1
f1_measure = f1_score(label_test, predictions, average='macro')
print('F1 macro Score: ', f1_measure)

# Classification report
from sklearn.metrics import classification_report
print(classification_report(label_test, predictions))

Train Accuracy 0.9488123543772315
Test Accuracy 0.9449833041819049
F1 macro Score:  0.9449259221238242
              precision    recall  f1-score   support

           0       1.00      0.89      0.94     16073
           1       0.90      1.00      0.95     15372

    accuracy                           0.94     31445
   macro avg       0.95      0.95      0.94     31445
weighted avg       0.95      0.94      0.94     31445



In [133]:
model_file = MODEL_DIR + 'svm/' + 'model_svm_poly3_synonym_aug.pkl'
with open(model_file, 'wb') as f_op:
    pkl.dump(estimator, f_op)

## SVM with linear kernel - no stopwords

In [49]:
from sklearn.svm import SVC

# Run Logistic Regression
estimator = SVC(kernel='linear')
estimator.fit(data_train, label_train)
predictions = estimator.predict(data_test)

predictions

  y = column_or_1d(y, warn=True)


array([1, 0, 0, ..., 1, 0, 1])

In [50]:
from sklearn.metrics import f1_score


## Accuracy Measure
print('Train Accuracy', estimator.score(data_train, label_train))
print('Test Accuracy', estimator.score(data_test, label_test))

# F1
f1_measure = f1_score(label_test, predictions, average='macro')
print('F1 macro Score: ', f1_measure)

# Classification report
from sklearn.metrics import classification_report
print(classification_report(label_test, predictions))

Train Accuracy 0.9956536108078876
Test Accuracy 0.9912249068397644
F1 macro Score:  0.9911904864036325
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     11739
           1       0.99      0.99      0.99     13218

    accuracy                           0.99     24957
   macro avg       0.99      0.99      0.99     24957
weighted avg       0.99      0.99      0.99     24957



In [51]:
model_file = MODEL_DIR + 'svm/' + 'model_svm_linear_synonym_aug.pkl'
with open(model_file, 'wb') as f_op:
    pkl.dump(estimator, f_op)

## LSTM - Vanilla - no stopwords

In [52]:
import numpy

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
numpy.random.seed(7)

In [53]:
from keras.datasets import imdb


In [54]:
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

In [57]:
embed_dim = 128
lstm_out = 196
batch_size = 32
N = len(dg.columns[1:])

model = Sequential()
model.add(Embedding(2000, embed_dim,input_length = X.shape[1]))
model.add(LSTM(lstm_out))
model.add(Dense(N,activation='softmax'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 2, 19, 178, 32]),
       list([1, 194, 1153, 194, 2, 78, 228, 5, 6, 1463, 4369,

In [65]:
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

In [68]:
max_fatures = 30000
tokenizer = Tokenizer(nb_words=max_fatures, split=' ')
tokenizer.fit_on_texts(result['Reviews'].values)
X1 = tokenizer.texts_to_sequences(result['Reviews'].values)
X1 = pad_sequences(X1)
Y1 = pd.get_dummies(result['Positivity']).values
X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1,Y1, random_state = 42)
print(X1_train.shape,Y1_train.shape)
print(X1_test.shape,Y1_test.shape)



(93659, 91) (93659, 2)
(31220, 91) (31220, 2)


In [69]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import text_to_word_sequence
# define the document
text = 'The quick brown fox jumped over the lazy dog.'
# estimate the size of the vocabulary
words = set(text_to_word_sequence(text))
vocab_size = len(words)
print(vocab_size)
# integer encode the document
result = one_hot(text, round(vocab_size*1.3))
print(result)

8
[7, 3, 3, 4, 4, 9, 7, 1, 7]


In [70]:
from keras.preprocessing.text import hashing_trick
from keras.preprocessing.text import text_to_word_sequence
# define the document
text = 'The quick brown fox jumped over the lazy dog.'
# estimate the size of the vocabulary
words = set(text_to_word_sequence(text))
vocab_size = len(words)
print(vocab_size)
# integer encode the document
result = hashing_trick(text, round(vocab_size*1.3), hash_function='md5')
print(result)

8
[6, 4, 1, 2, 7, 5, 6, 2, 6]
