# Creating the sarcasm dataset

In [3]:
# Sarcasm corpus: https://github.com/ef2020/SarcasmAmazonReviewsCorpus
import patoolib
# Unpack both halves of the corpus next to the downloaded archives.
ARCHIVE_DIR = "./sarcasm-detection"
patoolib.extract_archive(ARCHIVE_DIR + "/Regular.rar", outdir=ARCHIVE_DIR)
# Kept as the last expression so the cell still echoes the output directory.
patoolib.extract_archive(ARCHIVE_DIR + "/Ironic.rar", outdir=ARCHIVE_DIR)

patool: Extracting ./sarcasm-detection/Regular.rar ...
patool: running "C:\Program Files\7-Zip\7z.EXE" x -o./sarcasm-detection -- ./sarcasm-detection/Regular.rar
patool: ... ./sarcasm-detection/Regular.rar extracted to `./sarcasm-detection'.


'./sarcasm-detection'

In [32]:
import bs4 as bs
import os 
import pandas as pd

# Extract sarcastic reviews
# Every .txt file in the Ironic folder is parsed as markup and the text of its
# `review` element becomes one row labelled 1 (sarcastic).
ironic_dir = "./sarcasm-detection/Ironic"

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and therefore any seeded split made later) identical between runs.
file_name = sorted(entry for entry in os.listdir(ironic_dir) if entry.endswith(".txt"))

review = []
for name in file_name:
    # `with` closes each handle promptly instead of leaking one per file.
    # NOTE(review): files are still read with the platform default encoding,
    # as before — pass encoding= explicitly once the corpus encoding is confirmed.
    with open(ironic_dir + "/" + name) as f:
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        b = bs.BeautifulSoup(f.read(), "html.parser")
    review.append(b.find('review').text)

lb = [1] * len(review)
# Collapse line breaks (and other whitespace runs) into single spaces; simply
# deleting "\n" would glue together words that were separated only by a newline.
filtered_review = [" ".join(x.split()) for x in review]
df = pd.DataFrame(filtered_review, columns=['review'])
df['sarcasm_lb'] = lb
print(df)

                                                review  sarcasm_lb
0    I can not believe my eyes, or my ears.... The ...           1
1    Journalist Weisberg here cashes in on the curr...           1
2    When my friend purchased and forced me to watc...           1
3    Does the publisher seriously think anyone is g...           1
4    This toy would be a lot more realistic with ab...           1
..                                                 ...         ...
432  Ever notice in some of the reviews of this and...           1
433  This is is some of the best dick I've ever had...           1
434  if your not to old to lose most of your olfact...           1
435  I mean, I always wanted my crotch and my hands...           1
436  Seems like a good quality product, except that...           1

[437 rows x 2 columns]


In [33]:
# Extract normal reviews
# Every .txt file in the Regular folder is parsed as markup and the text of its
# `review` element becomes one row labelled 0 (not sarcastic).
regular_dir = "./sarcasm-detection/Regular"

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and therefore any seeded split made later) identical between runs.
file_name = sorted(entry for entry in os.listdir(regular_dir) if entry.endswith(".txt"))

review = []
for name in file_name:
    # `with` closes each handle promptly instead of leaking one per file.
    # NOTE(review): files are still read with the platform default encoding,
    # as before — pass encoding= explicitly once the corpus encoding is confirmed.
    with open(regular_dir + "/" + name) as f:
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        b = bs.BeautifulSoup(f.read(), "html.parser")
    review.append(b.find('review').text)

lb = [0] * len(review)
# Collapse line breaks (and other whitespace runs) into single spaces; simply
# deleting "\n" would glue together words that were separated only by a newline.
filtered_review = [" ".join(x.split()) for x in review]
ndf = pd.DataFrame(filtered_review, columns=['review'])
ndf['sarcasm_lb'] = lb
print(ndf)

                                                review  sarcasm_lb
0    They are amazingly thin, which is quite impres...           0
1    First, I am not in the expected fan base for J...           0
2    I shot this using the Kodak PlaySport while sn...           0
3    The keyboard on this notebook is the highlight...           0
4    With a little peanut butter and jelly, these t...           0
..                                                 ...         ...
812  This product is fantastic!  If you're looking ...           0
813  The Scent of Rain and Lightning by Nancy Picka...           0
814  Ok, I am an old time monkey Island game fan.  ...           0
815  I got this as a Christmas present for my broth...           0
816  this stand does not work with either of my bik...           0

[817 rows x 2 columns]


In [44]:
# Combine both frames and write them out as a single CSV dataset.
# Each input frame carries its own 0-based index, so without ignore_index=True
# the combined frame would contain every low label twice and a label lookup
# such as save_df.loc[0] would return two rows.
save_df = pd.concat([df, ndf], ignore_index=True)
save_df.to_csv("./sarcasm-detection/sarcasm-dataset.csv", index=False)

In [45]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the reviews for testing.
# random_state pins the shuffle so re-running the notebook regenerates the same
# split; stratify keeps the sarcastic/regular ratio equal in both parts, which
# matters because the two classes are not the same size.
train, test = train_test_split(
    save_df,
    test_size=0.2,
    random_state=42,
    stratify=save_df['sarcasm_lb'],
)
train.to_csv("./sarcasm-detection/train_data.csv", index=False)
test.to_csv("./sarcasm-detection/test_data.csv", index=False)

# RoBERTa for sarcasm detection

In [36]:
import pandas as pd
import numpy as np
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
from tqdm import tqdm

In [37]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Fit on the training split only. sarcasm-dataset.csv also contains the rows
# that were written to test_data.csv, so training on it would leak the
# held-out reviews into the model and make any later test score meaningless.
train_df = pd.read_csv("./sarcasm-detection/train_data.csv", header=0)
new_df = train_df[['review', 'sarcasm_lb']]

MAX_LEN = 512            # handed to the tokenizer as max_length
TRAIN_BATCH_SIZE = 8     # reviews per optimisation step
LEARNING_RATE = 1e-05
# NOTE(review): truncation/do_lower_case passed to from_pretrained look like
# they have no effect on RoBERTa's tokenizer — confirm before relying on them.
tokenizer = RobertaTokenizer.from_pretrained('roberta-large', truncation=True, do_lower_case=True)

In [40]:
class Tokenize(Dataset):
    """Torch dataset that tokenizes one review per item.

    Args:
        dataframe: frame with a 'review' (text) and a 'sarcasm_lb' (label) column.
        tokenizer: object exposing ``encode_plus`` that returns 'input_ids',
            'attention_mask' and 'token_type_ids'.
        max_len: length every encoded review is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = dataframe['review']
        self.targets = dataframe['sarcasm_lb']
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Encode the review at position ``index``.

        Returns:
            dict with long tensors 'ids', 'mask', 'token_type_ids' and a float
            tensor 'targets' holding the label.
        """
        # Positional lookup: a DataLoader hands out positions 0..len-1, while
        # the frame's own labels may be anything (e.g. after a shuffle/split).
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit replacements for the deprecated pad_to_max_length flag
            # and for truncation that was previously only implied by max_length.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [41]:
# Wrap the training frame in the tokenizing dataset, then batch it.
training_set = Tokenize(dataframe=train_df, tokenizer=tokenizer, max_len=MAX_LEN)
trainloader = DataLoader(
    training_set,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,   # reshuffled at the start of every pass over the data
    num_workers=0,  # load batches in the main process
)

In [43]:
class Roberta(torch.nn.Module):
    """Pretrained "roberta-large" encoder with a two-layer classification head.

    The head takes the encoder's vector at the first token position, passes it
    through Linear(1024, 1024) -> ReLU -> Dropout(0.3) and projects it to two
    output scores per input sequence.
    """

    def __init__(self):
        super().__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-large")
        self.pre_classifier = torch.nn.Linear(1024, 1024)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(1024, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the two raw (unnormalised) class scores for each sequence."""
        encoded = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # First element of the encoder output, sliced at token position 0.
        first_token = encoded[0][:, 0]
        hidden = torch.relu(self.pre_classifier(first_token))
        return self.classifier(self.dropout(hidden))
    
def calcuate_accuracy(preds, targets):
    """Count the positions where ``preds`` equals ``targets``.

    Despite the name this returns a raw count of matches, not a ratio.
    """
    matches = preds == targets
    return matches.sum().item()

In [None]:
# Fine-tune the classifier for a fixed number of epochs, saving a checkpoint
# after each one.
EPOCHS = 3
PATH = './sarcasm-detection/roberta-large.pt'
model = Roberta()
# Uncomment to resume from a previously saved checkpoint:
# model.load_state_dict(torch.load(PATH))
model.to(device)
# The model emits two raw scores per review and accuracy is taken from their
# argmax, so this is a two-class problem: CrossEntropyLoss consumes those
# scores together with integer class labels directly. The earlier
# Sigmoid + BCELoss pairing could not run: `nn` was never imported, and BCELoss
# needs float targets shaped like its input, not (batch,) integer labels
# against (batch, 2) outputs.
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
for epoch in range(EPOCHS):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    # Wrap the loader itself (not enumerate) so tqdm can see its length.
    for step, data in enumerate(tqdm(trainloader)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        # Class indices (0 or 1), as CrossEntropyLoss expects.
        targets = data['targets'].to(device, dtype=torch.long)

        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()

        # Running statistics for the current epoch.
        tr_loss += loss.item()
        big_idx = outputs.argmax(dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)
        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if step % 5000 == 0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

    # Checkpoint after every epoch so an interrupted run loses at most one.
    torch.save(model.state_dict(), PATH)
    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")
 