In [0]:
# load packages
import nltk
import os
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import metrics
from spacy.lang.en import English
from sklearn.ensemble import RandomForestClassifier
import re 
import spacy
import en_core_web_sm
from  spacy.lang.en.stop_words import STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import RandomOverSampler,SMOTE
import random

In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,SpatialDropout1D,LSTM,Dense,Dropout,Conv1D,MaxPooling1D,Flatten
from tensorflow.keras.losses import binary_crossentropy
from keras.utils.np_utils import to_categorical
from sklearn.metrics import classification_report

In [0]:
# Original train/test splits, followed by the two simulated training sets.
df_train, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_original.csv', 'test_original.csv')
)
df_simulated_spacy, df_simulated_newSpacy = (
    pd.read_csv(csv_name)
    for csv_name in ('train_spacy_simulated.csv', 'train_newSpacy_simulated.csv')
)

In [0]:
# preprocessing method 1
# Any parenthesised span is treated as a citation and collapsed to a placeholder.
citation_pattern = r'\((.*?)\)'
nlp = en_core_web_sm.load()
# NOTE: this rebinds `stopwords`, shadowing the nltk.corpus import of the same name.
stopwords = list(STOP_WORDS)
punctuations = string.punctuation
def spacy_token(text):
    """Normalise one sentence into a space-joined string of lemmas.

    Steps, in order:
      1. replace every parenthesised span with the placeholder ``CIT``;
      2. replace hyphens with spaces and delete digits;
      3. run the spaCy pipeline and take each token's lower-cased, stripped
         lemma (tokens whose lemma is ``"-PRON-"`` keep their lower-cased
         surface form instead);
      4. drop stop words and punctuation.

    Args:
        text: raw sentence string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    text = re.sub(citation_pattern, 'CIT', text)
    # Raw strings: '\-' and '\d' are invalid escapes in a normal string literal.
    text = re.sub(r'\-', ' ', text)
    text = re.sub(r'\d', '', text)
    mytokens = nlp(text)
    mytokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens]
    # `punctuations` is a str, so `in` is a substring test: this also drops
    # empty strings (e.g. whitespace tokens after .strip()).
    mytokens = [word for word in mytokens if word not in stopwords and word not in punctuations]
    return ' '.join(mytokens)

In [0]:
# preprocessing method 2
def new_spacy_token(text):
    """Reduce one sentence to the lemmas of its content words.

    Parenthesised spans are replaced with the placeholder ``cit`` and hyphens
    with spaces before the text is run through the spaCy pipeline. A token's
    lemma is kept when it is the ``cit`` placeholder or when the token is
    tagged VERB, ADV, ADJ or NOUN; everything else is discarded.

    Args:
        text: raw sentence string.

    Returns:
        The kept lemmas joined by single spaces.
    """
    content_pos = ('VERB', 'ADV', 'ADJ', 'NOUN')
    text = re.sub(citation_pattern, 'cit', text)
    # Raw string: '\-' is an invalid escape in a normal string literal.
    text = re.sub(r'\-', ' ', text)
    kept_lemmas = [
        token.lemma_
        for token in nlp(text)
        if token.lemma_ == 'cit' or token.pos_ in content_pos
    ]
    # `punctuations` is a str, so this is a substring test on each lemma.
    return ' '.join(lemma for lemma in kept_lemmas if lemma not in punctuations)

In [0]:
# Add both preprocessed variants of the raw `text` column to each split.
for frame in (df_train, df_test):
    frame['spacy_text'] = frame['text'].apply(spacy_token)
    frame['new_spacy_text'] = frame['text'].apply(new_spacy_token)

In [0]:
# Text lists and label arrays for each preprocessing variant (p1 = spacy_text,
# p2 = new_spacy_text); both variants use the same label column.
train_X_p1 = df_train['spacy_text'].tolist()
test_X_p1 = df_test['spacy_text'].tolist()
train_y_p1 = df_train['labels'].values
test_y_p1 = df_test['labels'].values

train_X_p2 = df_train['new_spacy_text'].tolist()
test_X_p2 = df_test['new_spacy_text'].tolist()
train_y_p2 = df_train['labels'].values
test_y_p2 = df_test['labels'].values

In [0]:
# Start the tokenizer-fitting text from a shallow copy of each training list,
# so extending it later leaves train_X_p1 / train_X_p2 untouched.
train_whole_text_p1 = list(train_X_p1)
train_whole_text_p2 = list(train_X_p2)

In [0]:
# Append the test sentences in place so the tokenizer is fitted on train + test text.
train_whole_text_p1 += test_X_p1

In [0]:
# Append the test sentences in place so the tokenizer is fitted on train + test text.
train_whole_text_p2 += test_X_p2

In [0]:
# Index the vocabulary over train + test text, then encode each split and pad
# every sequence to max_len.
# num_words keeps emitted indices below 20000 so they always fit the
# Embedding(20000, ...) layer used by the models in this notebook.
tk = Tokenizer(num_words=20000, lower=True, filters='')
tk.fit_on_texts(train_whole_text_p1)
max_len = 50
train_tokenized = tk.texts_to_sequences(train_X_p1)
test_tokenized = tk.texts_to_sequences(test_X_p1)
train_X_p1 = pad_sequences(train_tokenized, maxlen=max_len)
test_X_p1 = pad_sequences(test_tokenized, maxlen=max_len)

In [0]:
# Same encoding as for variant p1, applied to the new_spacy_text variant.
# num_words keeps emitted indices below 20000 so they always fit the
# Embedding(20000, ...) layer used by the models in this notebook.
tk = Tokenizer(num_words=20000, lower=True, filters='')
tk.fit_on_texts(train_whole_text_p2)
max_len = 50
train_tokenized = tk.texts_to_sequences(train_X_p2)
test_tokenized = tk.texts_to_sequences(test_X_p2)
train_X_p2 = pad_sequences(train_tokenized, maxlen=max_len)
test_X_p2 = pad_sequences(test_tokenized, maxlen=max_len)

In [0]:
# Sanity check: padded training matrices for both variants, one shape per line.
print(train_X_p1.shape, train_X_p2.shape, sep='\n')

(7003, 50)
(7003, 50)


In [0]:
# labels to categorical: one-hot encode every label array over 3 classes
train_y_p1, train_y_p2, test_y_p1, test_y_p2 = (
    to_categorical(label_ids, num_classes=3)
    for label_ids in (train_y_p1, train_y_p2, test_y_p1, test_y_p2)
)

## CNN-LSTM model

In [0]:
# Embedding -> dropout -> two Conv1D/MaxPooling1D stages -> LSTM -> 3-way softmax.
model = Sequential([
    Embedding(20000, 100, input_length=50),
    Dropout(0.2),
    Conv1D(64, 3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(32, 2, activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(100),
    Dense(3, activation='softmax'),
])

In [0]:
# Categorical cross-entropy matches the one-hot labels and the softmax output.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [0]:
# Train on variant p1; the test split is passed as validation data, so the
# per-epoch validation metrics are computed on it.
model.fit(
    train_X_p1,
    train_y_p1,
    validation_data=(test_X_p1, test_y_p1),
    epochs=20,
    batch_size=128,
    shuffle=True,
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f4b4640ca20>

In [0]:
# Predict class probabilities on the test split and report per-class metrics;
# argmax turns both the one-hot labels and the predictions into class ids.
preds = model.predict(test_X_p1)
print(classification_report(test_y_p1.argmax(axis=1), preds.argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.15      0.10      0.12        52
           1       0.47      0.36      0.40       152
           2       0.92      0.95      0.93      1529

    accuracy                           0.87      1733
   macro avg       0.51      0.47      0.48      1733
weighted avg       0.85      0.87      0.86      1733



In [0]:
# Fresh, untrained copy of the same architecture for variant p2:
# Embedding -> dropout -> two Conv1D/MaxPooling1D stages -> LSTM -> 3-way softmax.
model = Sequential([
    Embedding(20000, 100, input_length=50),
    Dropout(0.2),
    Conv1D(64, 3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(32, 2, activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(100),
    Dense(3, activation='softmax'),
])

In [0]:
# Categorical cross-entropy matches the one-hot labels and the softmax output.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [0]:
# Train on variant p2; the test split is passed as validation data, so the
# per-epoch validation metrics are computed on it.
model.fit(
    train_X_p2,
    train_y_p2,
    validation_data=(test_X_p2, test_y_p2),
    epochs=20,
    batch_size=128,
    shuffle=True,
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f4b40412e48>

In [0]:
# Predict class probabilities on the test split and report per-class metrics;
# argmax turns both the one-hot labels and the predictions into class ids.
preds = model.predict(test_X_p2)
print(classification_report(test_y_p2.argmax(axis=1), preds.argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.19      0.15      0.17        52
           1       0.52      0.38      0.44       152
           2       0.92      0.95      0.93      1529

    accuracy                           0.88      1733
   macro avg       0.54      0.49      0.51      1733
weighted avg       0.86      0.88      0.87      1733



## Using Oversampling

In [0]:
# Oversample both training sets with a seeded RandomOverSampler.
# fit_resample replaces the deprecated fit_sample alias, which newer
# imbalanced-learn releases no longer provide.
# NOTE: this overwrites train_X_p* / train_y_p* in place, so later cells see
# the resampled data.
ros = RandomOverSampler(random_state=0)
train_X_p1, train_y_p1 = ros.fit_resample(train_X_p1, train_y_p1)
ros = RandomOverSampler(random_state=0)
train_X_p2, train_y_p2 = ros.fit_resample(train_X_p2, train_y_p2)



In [0]:
# Fresh, untrained copy of the same architecture for the oversampled p1 data:
# Embedding -> dropout -> two Conv1D/MaxPooling1D stages -> LSTM -> 3-way softmax.
model = Sequential([
    Embedding(20000, 100, input_length=50),
    Dropout(0.2),
    Conv1D(64, 3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(32, 2, activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(100),
    Dense(3, activation='softmax'),
])

In [0]:
# Categorical cross-entropy matches the one-hot labels and the softmax output.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [0]:
# Train on the oversampled p1 data; validation metrics still use the
# untouched test split.
model.fit(
    train_X_p1,
    train_y_p1,
    validation_data=(test_X_p1, test_y_p1),
    epochs=20,
    batch_size=128,
    shuffle=True,
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f4abe5aecf8>

In [0]:
# Predict class probabilities on the test split and report per-class metrics;
# argmax turns both the one-hot labels and the predictions into class ids.
preds = model.predict(test_X_p1)
print(classification_report(test_y_p1.argmax(axis=1), preds.argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.16      0.12      0.13        52
           1       0.35      0.38      0.37       152
           2       0.92      0.92      0.92      1529

    accuracy                           0.85      1733
   macro avg       0.48      0.47      0.47      1733
weighted avg       0.84      0.85      0.85      1733



In [0]:
# Fresh, untrained copy of the same architecture for the oversampled p2 data:
# Embedding -> dropout -> two Conv1D/MaxPooling1D stages -> LSTM -> 3-way softmax.
model = Sequential([
    Embedding(20000, 100, input_length=50),
    Dropout(0.2),
    Conv1D(64, 3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(32, 2, activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(100),
    Dense(3, activation='softmax'),
])

In [0]:
# Categorical cross-entropy matches the one-hot labels and the softmax output.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [0]:
# Train on the oversampled p2 data; validation metrics still use the
# untouched test split.
model.fit(
    train_X_p2,
    train_y_p2,
    validation_data=(test_X_p2, test_y_p2),
    epochs=20,
    batch_size=128,
    shuffle=True,
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f4abc4c1390>

In [0]:
# Predict class probabilities on the test split and report per-class metrics;
# argmax turns both the one-hot labels and the predictions into class ids.
preds = model.predict(test_X_p2)
print(classification_report(test_y_p2.argmax(axis=1), preds.argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.10      0.08      0.09        52
           1       0.47      0.45      0.46       152
           2       0.92      0.93      0.93      1529

    accuracy                           0.86      1733
   macro avg       0.50      0.49      0.49      1733
weighted avg       0.86      0.86      0.86      1733



## Markov Chain

In [0]:
# Split the simulated samples by label: 0 -> `_n` (the negative class that
# 300 samples are later drawn from), 1 -> `_p`.
is_label_0 = df_simulated_spacy['labels'] == 0
is_label_1 = df_simulated_spacy['labels'] == 1
df_simulated_spacy_n = df_simulated_spacy[is_label_0]
df_simulated_spacy_p = df_simulated_spacy[is_label_1]

In [0]:
# Rebuild the raw p1 text lists and integer label arrays from the dataframes,
# discarding the padded / one-hot / oversampled versions created earlier.
train_X_p1 = df_train['spacy_text'].tolist()
test_X_p1 = df_test['spacy_text'].tolist()
train_y_p1 = df_train['labels'].values
test_y_p1 = df_test['labels'].values

In [0]:
# Draw 300 simulated label-0 samples. The seed makes the draw repeatable
# across kernel restarts (same seed value as the RandomOverSampler cells).
temp_df = df_simulated_spacy_n.sample(n=300, random_state=0)

In [0]:
# Append the sampled simulated sentences to the training text, in place.
train_X_p1 += temp_df['spacy_text'].tolist()

In [0]:
# Append the matching labels so train_y_p1 stays aligned with train_X_p1.
train_y_p1 = np.concatenate((train_y_p1, temp_df['labels'].values))

In [0]:
# Shallow copy, so extending the tokenizer-fitting text leaves train_X_p1 untouched.
train_whole_text_p1 = list(train_X_p1)

In [0]:
# Append the test sentences in place so the tokenizer is fitted on train + test text.
train_whole_text_p1 += test_X_p1

In [0]:
# Refit the tokenizer on the augmented train + test text, then encode each
# split and pad every sequence to max_len.
# num_words keeps emitted indices below 20000 so they always fit the
# Embedding(20000, ...) layer used by the models in this notebook.
tk = Tokenizer(num_words=20000, lower=True, filters='')
tk.fit_on_texts(train_whole_text_p1)
max_len = 50
train_tokenized = tk.texts_to_sequences(train_X_p1)
test_tokenized = tk.texts_to_sequences(test_X_p1)
train_X_p1 = pad_sequences(train_tokenized, maxlen=max_len)
test_X_p1 = pad_sequences(test_tokenized, maxlen=max_len)

In [0]:
# Sanity check: padded train and test matrices, one shape per line.
print(train_X_p1.shape, test_X_p1.shape, sep='\n')

(7303, 50)
(1733, 50)


In [0]:
# One-hot encode the train and test labels over 3 classes.
train_y_p1, test_y_p1 = (
    to_categorical(label_ids, num_classes=3)
    for label_ids in (train_y_p1, test_y_p1)
)

In [0]:
# Fresh, untrained copy of the same architecture, built and compiled in one cell:
# Embedding -> dropout -> two Conv1D/MaxPooling1D stages -> LSTM -> 3-way softmax.
model = Sequential([
    Embedding(20000, 100, input_length=50),
    Dropout(0.2),
    Conv1D(64, 3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(32, 2, activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(100),
    Dense(3, activation='softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [0]:
# Train for 50 epochs on the training set augmented with 300 simulated samples;
# validation metrics use the test split.
model.fit(
    train_X_p1,
    train_y_p1,
    validation_data=(test_X_p1, test_y_p1),
    epochs=50,
    batch_size=128,
    shuffle=True,
    verbose=1,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f4aadb3de10>

In [0]:
# Predict class probabilities on the test split and report per-class metrics;
# argmax turns both the one-hot labels and the predictions into class ids.
preds = model.predict(test_X_p1)
print(classification_report(test_y_p1.argmax(axis=1), preds.argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.17      0.10      0.12        52
           1       0.28      0.34      0.31       152
           2       0.92      0.91      0.91      1529

    accuracy                           0.83      1733
   macro avg       0.46      0.45      0.45      1733
weighted avg       0.84      0.83      0.84      1733



In [0]:
# Try adding every simulated sample (both label groups) to the original training set.
train_X_p1 = df_train['spacy_text'].tolist()
test_X_p1 = df_test['spacy_text'].tolist()
test_y_p1 = df_test['labels'].values
train_X_p1 += df_simulated_spacy['spacy_text'].tolist()
train_y_p1 = np.concatenate(
    (df_train['labels'].values, df_simulated_spacy['labels'].values)
)

In [0]:
# Refit the tokenizer on the augmented train + test text, then encode each
# split and pad every sequence to max_len.
train_whole_text_p1 = train_X_p1.copy()
train_whole_text_p1.extend(test_X_p1)
# num_words keeps emitted indices below 20000 so they always fit the
# Embedding(20000, ...) layer used by the models in this notebook.
tk = Tokenizer(num_words=20000, lower=True, filters='')
tk.fit_on_texts(train_whole_text_p1)
max_len = 50
train_tokenized = tk.texts_to_sequences(train_X_p1)
test_tokenized = tk.texts_to_sequences(test_X_p1)
train_X_p1 = pad_sequences(train_tokenized, maxlen=max_len)
test_X_p1 = pad_sequences(test_tokenized, maxlen=max_len)

In [0]:
# Sanity check: padded train and test matrices, one shape per line.
print(train_X_p1.shape, test_X_p1.shape, sep='\n')

(8483, 50)
(1733, 50)


In [0]:
# One-hot encode the train and test labels over 3 classes.
train_y_p1, test_y_p1 = (
    to_categorical(label_ids, num_classes=3)
    for label_ids in (train_y_p1, test_y_p1)
)

In [0]:
# Fresh, untrained copy of the same architecture, built and compiled in one cell:
# Embedding -> dropout -> two Conv1D/MaxPooling1D stages -> LSTM -> 3-way softmax.
model = Sequential([
    Embedding(20000, 100, input_length=50),
    Dropout(0.2),
    Conv1D(64, 3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(32, 2, activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(100),
    Dense(3, activation='softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [0]:
# Train for 50 epochs on the training set augmented with all simulated samples;
# validation metrics use the test split.
model.fit(
    train_X_p1,
    train_y_p1,
    validation_data=(test_X_p1, test_y_p1),
    epochs=50,
    batch_size=128,
    shuffle=True,
    verbose=1,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f4b54c11710>

In [0]:
# Predict class probabilities on the test split and report per-class metrics;
# argmax turns both the one-hot labels and the predictions into class ids.
preds = model.predict(test_X_p1)
print(classification_report(test_y_p1.argmax(axis=1), preds.argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.12      0.12      0.12        52
           1       0.32      0.43      0.37       152
           2       0.92      0.89      0.91      1529

    accuracy                           0.83      1733
   macro avg       0.45      0.48      0.46      1733
weighted avg       0.85      0.83      0.83      1733

