In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/toddgavin/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Add a flag for sarcasm by comparing the post text to https://nlds.soe.ucsc.edu/sarcasm1. Compare the text of the post with the sarcasm corpus and add a flag for detected or not.

# Sarcasm Text

In [2]:
sarcasmTextList = []

for i in range(1, 1000):
    
    try: 
        # Open file in read mode
        with open('data-sarc-sample/sarc/sarcastic_' + str(i) + '.txt', 'r') as file:
            # Read the entire contents of the file into a string variable
            sarcasm_text = file.read()
    
        sarcasmTextList.append(sarcasm_text)
        
    finally:
        continue

In [3]:
# Create a pandas DataFrame with the list of strings
df_sarcasm = pd.DataFrame({'Sarcasm Text': sarcasmTextList})

In [4]:
df_sarcasm

Unnamed: 0,Sarcasm Text
0,"Actually, they didn't. The whole tragedy was c..."
1,At your service: Comparison I could've jus...
2,"So which is it: the action is moral, the actio..."
3,Interesting how the study was set in Pittsburg...
4,"Ah, I see. Your reasons are secret reasons. ..."
...,...
992,and
993,"Ha, that is just an idiotic perspective. We'd ..."
994,So you are saying that despite the majority of...
995,"depends on your definition of ""human being."""


# Non-Sarcasm Text

In [5]:
notSarcasmTextList = []

for i in range(1, 1000):
    
    try: 
        # Open file in read mode
        with open('data-sarc-sample/notsarc/not_sarcastic_' + str(i) + '.txt', 'r') as file:
            # Read the entire contents of the file into a string variable
            not_sarcasm_text = file.read()
    
        notSarcasmTextList.append(not_sarcasm_text)
        
    finally:
        continue

In [6]:
# Create a pandas DataFrame with the list of strings
df_not_sarcasm = pd.DataFrame({'Not Sarcasm Text': notSarcasmTextList})

In [7]:
df_not_sarcasm

Unnamed: 0,Not Sarcasm Text
0,"This is a pretty touchy issue, and I agree wit..."
1,See above
2,"In other words, you think a Supreme Court deci..."
3,"um, yeah, you could say that... (won't argue....."
4,"Well, Google is your friend here - try fossil ..."
...,...
991,"Thanks, my friend, as you can see, Jito comple..."
992,What do you mean by this? Could we not have th...
993,And the answer is: we don't know. Maybe it cam...
994,And what would make them separate species? How...


# Implementing Flag Detection

In [8]:
df_master_dataset = pd.read_csv("../Master_Dataset_Raw.csv")

In [12]:
# create the target variable
df_sarcasm['label'] = 1
df_not_sarcasm['label'] = 0

# combine the two dataframes into one
df_combined = pd.concat([df_sarcasm, df_not_sarcasm], ignore_index=True)

df_combined

# split the combined dataframe into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(df_combined['text'], df_combined['label'], test_size=0.2, random_state=42)

Unnamed: 0,Sarcasm Text,label,Not Sarcasm Text
0,"Actually, they didn't. The whole tragedy was c...",1,
1,At your service: Comparison I could've jus...,1,
2,"So which is it: the action is moral, the actio...",1,
3,Interesting how the study was set in Pittsburg...,1,
4,"Ah, I see. Your reasons are secret reasons. ...",1,
...,...,...,...
1988,,0,"Thanks, my friend, as you can see, Jito comple..."
1989,,0,What do you mean by this? Could we not have th...
1990,,0,And the answer is: we don't know. Maybe it cam...
1991,,0,And what would make them separate species? How...


In [None]:
stop_words = set(stopwords.words('english'))

def preprocess(text):
    # remove punctuation and convert to lowercase
    text = ''.join([word.lower() for word in text if word.isalnum() or word.isspace()])
    # remove stop words
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

In [None]:
# apply the preprocessing function to the train and test sets
X_train = X_train.apply(preprocess)
X_test = X_test.apply(preprocess)

In [None]:
# initialize the vectorizer
vectorizer = TfidfVectorizer()

# fit and transform the vectorizer on the train set
X_train_vectorized = vectorizer.fit_transform(X_train)

# transform the test set
X_test_vectorized = vectorizer.transform(X_test)

In [None]:
# initialize the model
model = MultinomialNB()

# fit the model on the vectorized train set
model.fit(X_train_vectorized, y_train)

In [None]:
# preprocess the text in the 'Narrative' column
df_master_dataset['Narrative'] = df_master_dataset['Narrative'].apply(preprocess)

# vectorize the text in the 'Narrative' column
narrative_vectorized = vectorizer.transform(df_master_dataset['Narrative'])

# make predictions on the vectorized 'Narrative' column
predictions = model.predict(narrative_vectorized)

# add the predictions as a new column to the 'df_master_dataset'
df_master_dataset['Sarcasm'] = predictions