In [21]:
import pandas as pd
import sys, torch, json
# sys.path.append("/Users/tracy/Library/CloudStorage/GoogleDrive-cloudstorage.yuzhe@gmail.com/My Drive/UPENN♥️/MyClasses/23Fall/CIS5190/project/nlp")

from traditional.features import craft_features, vectorize_labels, FEAT_ARG
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

from deep.rnn_classifier import RNNBinarySequenceClassifier

import warnings
warnings.filterwarnings("ignore")

In [22]:
train = pd.read_csv("./data/imdb/train_cleaned.csv")
val = pd.read_csv("./data/imdb/val_cleaned.csv")
test = json.load(open("./adversarial_sentiment_flip.json", "r"))

train_texts = [text for text in train["text_cleaned"]]
val_texts = [text for text in val["text_cleaned"]]

test_texts = [text["text_cleaned"] for text in test]
test_adv_texts = [text["altered_text"] for text in test]

train_labels = [senti for senti in train["label"]]
val_labels = [senti for senti in val["label"]]

test_labels = [senti["label"] for senti in test]
test_adv_labels = [senti["altered_label"] for senti in test]

In [23]:
test_texts

['widow hires a psychopath as a handyman sloppy film noir thriller which doesnt make much of its tension promising setup 310',
 'i hope this group of filmmakers never reunites',
 'dont waste your time and money on it its not quite as bad as adrenalin by the same director but thats not saying much',
 'this is quite possibly the worst sequel ever made the script is unfunny and the acting stinks the exact opposite of the original',
 'this is a terrible movie dont waste your money on it dont even watch it for free thats all i have to say',
 'this movie is terrible its about some no brain surfin dude that inherits some company does carrot top have no shame ? br br',
 'what a script what a story what a mess !',
 'more suspenseful more subtle much much more disturbing',
 'ten minutes of people spewing gallons of pink vomit recurring scenes of enormous piles of dog excrement need one say more ? ? ?',
 'hated it with all my being worst movie ever mentally scarred help me it was that badTRUST ME

In [24]:
test_adv_texts

['widow hires a psychopath as a handyman sloppy film noir thriller which does make much of its tension promising setup 310',
 'i hope this group of filmmakers once reunites',
 'do spend your time and money on it its so much as good as adrenalin by the same director so thats really saying much',
 'this is quite possibly the best sequel ever made the script is funny and the acting stinks the exact same of the original',
 'this is a fantastic movie do worth your money on it do once watch it for trial thats all i have to say',
 'this movie is excellent its about some innovative surfin dude that inherits some company does carrot top have no satisfaction ? br br',
 'what a script what a story what a masterpiece !',
 'more thrilling more profound much much more captivating',
 'ten minutes of people eating gallons of pink cookies recurring scenes of enormous piles of dog interaction need one say more ?  ?  ?',
 'loved it with all my being best movie ever mentally stimulated help me it was that

### Adversarial Sentiment Flip for Traditional Model

For this one, we evaluate two traditional models.

One is n-gram tfidf only model; The other is n-gram + lexicon model

We first test performance on the raw set; then evaluate adversarial flips

In [25]:
splitted_texts, splitted_labels = {"train": train_texts, "test": test_texts, "val": val_texts},  {"train": train_labels, "test": test_labels, "val": val_labels}
splitted_texts_adv, splitted_labels_adv = {"train": train_texts, "test": test_adv_texts, "val": val_texts},  {"train": train_labels, "test": test_adv_labels, "val": val_labels}

In [26]:
NGRAM_RANGE = (1,3)
MAX_TFIDF_FEATS = 5000
MIN_DF = 3
MAX_DF = 0.7

print("Without Lexicon Model with N=5000")

args = FEAT_ARG(NGRAM_RANGE, MIN_DF, MAX_DF, MAX_TFIDF_FEATS)
FEATURESET = "tfidf"
X_train, X_val, X_test = craft_features(featset=FEATURESET, text_splits=splitted_texts, feat_args=args)
y_train, y_val, y_test = vectorize_labels(splitted_labels)
print("Features:  Train {} , Val {} , Test {}".format(X_train.shape, X_val.shape, X_test.shape))

# These are the best parameters
p = 'l2'
lambda_ = 1.

lr = LogisticRegression(C=1/lambda_, penalty=p, solver="liblinear", max_iter=5000)
lr.fit(X_train, y_train)
train_pred = lr.predict(X_train)
val_pred = lr.predict(X_val)

test_pred = lr.predict(X_test)
print("Trainset")
print(classification_report(y_train, train_pred))
print("Valset")
print(classification_report(y_val, val_pred))
print("Testset RAW")
print(classification_report(y_test, test_pred))
print(y_test, test_pred)


NGRAM_RANGE = (1,3)
MAX_TFIDF_FEATS = 5000
MIN_DF = 3
MAX_DF = 0.7

print("With Lexicon Model with N=5000")

args = FEAT_ARG(NGRAM_RANGE, MIN_DF, MAX_DF, MAX_TFIDF_FEATS)
FEATURESET = "tfidf+lexicon"
X_train, X_val, X_test = craft_features(featset=FEATURESET, text_splits=splitted_texts, feat_args=args)
y_train, y_val, y_test = vectorize_labels(splitted_labels)
print("Features:  Train {} , Val {} , Test {}".format(X_train.shape, X_val.shape, X_test.shape))

# These are the best parameters
p = 'l2'
lambda_ = 1.

lr = LogisticRegression(C=1/lambda_, penalty=p, solver="liblinear", max_iter=5000)
lr.fit(X_train, y_train)
train_pred = lr.predict(X_train)
val_pred = lr.predict(X_val)

test_pred = lr.predict(X_test)
print("Trainset")
print(classification_report(y_train, train_pred))
print("Valset")
print(classification_report(y_val, val_pred))
print("Testset RAW")
print(classification_report(y_test, test_pred))
print(y_test, test_pred)

Without Lexicon Model with N=5000
Load a pre-trained vectorizer: tfidf_vectorizer_ngram(1, 3)_max_5000_dfminmax_3_0.7.pickle
Features:  Train (20000, 5000) , Val (5000, 5000) , Test (20, 5000)
Trainset
              precision    recall  f1-score   support

           0       0.92      0.91      0.91     10000
           1       0.91      0.92      0.92     10000

    accuracy                           0.92     20000
   macro avg       0.92      0.92      0.92     20000
weighted avg       0.92      0.92      0.92     20000

Valset
              precision    recall  f1-score   support

           0       0.89      0.86      0.88      2500
           1       0.87      0.89      0.88      2500

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000

Testset RAW
              precision    recall  f1-score   support

           0       1.00      0.90      0.95        10
           1    

In [27]:
NGRAM_RANGE = (1,3)
MAX_TFIDF_FEATS = 5000
MIN_DF = 3
MAX_DF = 0.7

print("Without Lexicon Model with N=5000")

args = FEAT_ARG(NGRAM_RANGE, MIN_DF, MAX_DF, MAX_TFIDF_FEATS)
FEATURESET = "tfidf"
X_train, X_val, X_test = craft_features(featset=FEATURESET, text_splits=splitted_texts_adv, feat_args=args)
y_train, y_val, y_test = vectorize_labels(splitted_labels)
print("Features:  Train {} , Val {} , Test {}".format(X_train.shape, X_val.shape, X_test.shape))

# These are the best parameters
p = 'l2'
lambda_ = 1.

lr = LogisticRegression(C=1/lambda_, penalty=p, solver="liblinear", max_iter=5000)
lr.fit(X_train, y_train)
train_pred = lr.predict(X_train)
val_pred = lr.predict(X_val)

test_pred = lr.predict(X_test)
print("Trainset")
print(classification_report(y_train, train_pred))
print("Valset")
print(classification_report(y_val, val_pred))
print("Testset adversarial")
print(classification_report(y_test, test_pred))
print(y_test, test_pred)


NGRAM_RANGE = (1,3)
MAX_TFIDF_FEATS = 5000
MIN_DF = 3
MAX_DF = 0.7

print("With Lexicon Model with N=5000")

args = FEAT_ARG(NGRAM_RANGE, MIN_DF, MAX_DF, MAX_TFIDF_FEATS)
FEATURESET = "tfidf+lexicon"
X_train, X_val, X_test = craft_features(featset=FEATURESET, text_splits=splitted_texts_adv, feat_args=args)
y_train, y_val, y_test = vectorize_labels(splitted_labels)
print("Features:  Train {} , Val {} , Test {}".format(X_train.shape, X_val.shape, X_test.shape))

# These are the best parameters
p = 'l2'
lambda_ = 1.

lr = LogisticRegression(C=1/lambda_, penalty=p, solver="liblinear", max_iter=5000)
lr.fit(X_train, y_train)
train_pred = lr.predict(X_train)
val_pred = lr.predict(X_val)

test_pred = lr.predict(X_test)
print("Trainset")
print(classification_report(y_train, train_pred))
print("Valset")
print(classification_report(y_val, val_pred))
print("Testset adversarial")
print(classification_report(y_test, test_pred))
print(y_test, test_pred)

Without Lexicon Model with N=5000
Load a pre-trained vectorizer: tfidf_vectorizer_ngram(1, 3)_max_5000_dfminmax_3_0.7.pickle
Features:  Train (20000, 5000) , Val (5000, 5000) , Test (20, 5000)
Trainset
              precision    recall  f1-score   support

           0       0.92      0.91      0.91     10000
           1       0.91      0.92      0.92     10000

    accuracy                           0.92     20000
   macro avg       0.92      0.92      0.92     20000
weighted avg       0.92      0.92      0.92     20000

Valset
              precision    recall  f1-score   support

           0       0.89      0.86      0.88      2500
           1       0.87      0.89      0.88      2500

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000

Testset adversarial
              precision    recall  f1-score   support

           0       0.31      0.40      0.35        10
        

### Adversarial Sentiment Flip for Deep Learning Model

For this one, we evaluate two rnn models

One is Bi-GRU model by GloVe Learnable Embedding; Another is Bi-GRU model by BERT representation

In [28]:
from transformers import BertTokenizer
from datasets import Dataset
from tokenizers import Tokenizer
import torch.utils.data as torch_data

tokenizer_pth = "./deep/imdb50_tokenizer"
tokenizer = Tokenizer.from_file(tokenizer_pth)
orig_vocab = tokenizer.get_vocab()
word_types = sorted(list(orig_vocab.keys()), key=lambda w: orig_vocab[w])
vocab = {w: i for i, w in enumerate(word_types)}
vocab_size = len(vocab)
pad_id = vocab["<pad>"]

bert_tokenizer = BertTokenizer.from_pretrained("../pretrained/bert-base-uncased")

class IMDB50(torch_data.Dataset):
    def __init__(self, 
                 text, 
                 labels, 
                 tokenizer,):
        
        self.all_text = text 
        self.all_labels = labels
        self.tokenizer = tokenizer

        self.is_bert = isinstance(tokenizer, BertTokenizer)
        
    def __len__(self):
        return len(self.all_text)

    def __getitem__(self, idx):

        if not self.is_bert:
            input_ids = torch.LongTensor(self.tokenizer.encode(self.all_text[idx]).ids)
        else:
            input_ids = self.tokenizer(self.all_text[idx], return_tensors='pt', max_length=512,
                                       padding="do_not_pad", truncation=True).input_ids.squeeze(0)

        label = torch.Tensor([self.all_labels[idx]])
        return input_ids, input_ids.size(0), label
    
import torch.nn.utils.rnn as rnn_utils
def collate_fn(batch):
    # input_id, length, label    
    batch.sort(key=lambda x: x[1], reverse=True) # sort by sequence length
    sequences, seq_lengths, targets = zip(*batch) 
    
    # Pad the sequences and stack the targets
    sequences_padded = rnn_utils.pad_sequence(sequences, padding_value=pad_id, batch_first=True)
    targets_stacked = torch.stack(targets)

    return sequences_padded, seq_lengths, targets_stacked

batch_size = 5
# dataset = {"test": Dataset.from_pandas(test) }

# test_text, test_label = dataset["test"]["text_cleaned"], dataset["test"]["label"]

testset = IMDB50(test_texts, test_labels, tokenizer)
testset_adv = IMDB50(test_adv_texts, test_adv_labels, tokenizer)
testset_bert = IMDB50(test_texts, test_labels, bert_tokenizer)
testset_bert_adv = IMDB50(test_adv_texts, test_adv_labels, bert_tokenizer)

test_loader = torch_data.DataLoader(testset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader_adv = torch_data.DataLoader(testset_adv, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader_bert = torch_data.DataLoader(testset_bert, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader_bert_adv = torch_data.DataLoader(testset_bert_adv, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

print(next(iter(test_loader))[0].shape, next(iter(test_loader))[1], next(iter(test_loader))[2].shape)
print(next(iter(test_loader_bert))[0].shape, next(iter(test_loader_bert))[1], next(iter(test_loader_bert))[2].shape)

torch.Size([5, 24]) (24, 23, 23, 21, 8) torch.Size([5, 1])
torch.Size([5, 31]) (31, 29, 28, 26, 11) torch.Size([5, 1])


In [29]:
from tqdm import tqdm
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [30]:
model_pth = "./gru_rnn_glove_learnable_embedding_best.pt"
model1 = RNNBinarySequenceClassifier(
        vocab_size=vocab_size, embedding_size=256, hidden_size=256, output_size=1,
        num_layers=2, embedding_dropout=.3, output_dropout=.3, rnn_dropout=.3,
        rnn_base_cell="gru", embedding_type="glove", learnable=True, bidirectional=True, vocab=vocab
)
model1.load_state_dict(torch.load(model_pth))
model1 = model1.to(device)

model1.eval()
with torch.no_grad():
    pred_labels, true_labels = [], []
    for i, (input_ids, lengths, labels) in tqdm(enumerate(test_loader)):
        input_ids, labels = input_ids.to(device), labels.to(device)
        preds = model1.predict((input_ids, lengths))

        pred_labels.extend(preds.squeeze(-1).tolist())
        true_labels.extend(labels.squeeze(-1).tolist())

    acc = accuracy_score(true_labels, pred_labels)
    print(classification_report(true_labels, pred_labels))
    print("RAW Accuracy: ", round(acc, 6))
    print(true_labels, pred_labels)

    pred_labels, true_labels = [], []
    for i, (input_ids, lengths, labels) in tqdm(enumerate(test_loader_adv)):
        input_ids, labels = input_ids.to(device), labels.to(device)
        preds = model1.predict((input_ids, lengths))

        pred_labels.extend(preds.squeeze(-1).tolist())
        true_labels.extend(labels.squeeze(-1).tolist())

    acc = accuracy_score(true_labels, pred_labels)
    print(classification_report(true_labels, pred_labels))
    print("ADV Accuracy: ", round(acc, 6))
    print(true_labels, pred_labels)

Initialize by GLoVE word embedding


4it [00:00, 19.18it/s]


              precision    recall  f1-score   support

         0.0       1.00      0.90      0.95        10
         1.0       0.91      1.00      0.95        10

    accuracy                           0.95        20
   macro avg       0.95      0.95      0.95        20
weighted avg       0.95      0.95      0.95        20

RAW Accuracy:  0.95
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


4it [00:00, 19.03it/s]

              precision    recall  f1-score   support

         0.0       0.71      1.00      0.83        10
         1.0       1.00      0.60      0.75        10

    accuracy                           0.80        20
   macro avg       0.86      0.80      0.79        20
weighted avg       0.86      0.80      0.79        20

ADV Accuracy:  0.8
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] [0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]





In [31]:
model_pth = "./gru_rnn_bert_representation_best.pt"
model2 = RNNBinarySequenceClassifier(
        vocab_size=vocab_size, embedding_size=768, hidden_size=256, output_size=1,
        num_layers=2, embedding_dropout=.3, output_dropout=.3, rnn_dropout=.3,
        rnn_base_cell="gru", embedding_type="bert", learnable=False, bidirectional=True, vocab=vocab,
)

model2.load_state_dict(torch.load(model_pth), strict=False)
model2 = model2.to(device)

model2.eval()
with torch.no_grad():
    pred_labels, true_labels = [], []
    for i, (input_ids, lengths, labels) in tqdm(enumerate(test_loader_bert)):
        input_ids, labels = input_ids.to(device), labels.to(device)
        preds = model2.predict((input_ids, lengths))

        pred_labels.extend(preds.squeeze(-1).tolist())
        true_labels.extend(labels.squeeze(-1).tolist())

    acc = accuracy_score(true_labels, pred_labels)
    print(classification_report(true_labels, pred_labels))
    print("RAW Accuracy: ", round(acc, 6))
    print(true_labels, pred_labels)

    pred_labels, true_labels = [], []
    for i, (input_ids, lengths, labels) in tqdm(enumerate(test_loader_bert_adv)):
        input_ids, labels = input_ids.to(device), labels.to(device)
        preds = model2.predict((input_ids, lengths))

        pred_labels.extend(preds.squeeze(-1).tolist())
        true_labels.extend(labels.squeeze(-1).tolist())

    acc = accuracy_score(true_labels, pred_labels)
    print(classification_report(true_labels, pred_labels))
    print("ADV Accuracy: ", round(acc, 6))
    print(true_labels, pred_labels)

Use BERT representation [fixed]


4it [00:00,  5.03it/s]


              precision    recall  f1-score   support

         0.0       1.00      0.80      0.89        10
         1.0       0.83      1.00      0.91        10

    accuracy                           0.90        20
   macro avg       0.92      0.90      0.90        20
weighted avg       0.92      0.90      0.90        20

RAW Accuracy:  0.9
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] [0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


4it [00:00,  6.57it/s]

              precision    recall  f1-score   support

         0.0       0.83      1.00      0.91        10
         1.0       1.00      0.80      0.89        10

    accuracy                           0.90        20
   macro avg       0.92      0.90      0.90        20
weighted avg       0.92      0.90      0.90        20

ADV Accuracy:  0.9
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] [1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]



