In [1]:
import warnings
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

## iSarcasmEval

In [2]:
# Load the iSarcasmEval test CSV and preview the first rows.
# The path is assembled with pathlib so it resolves on Windows and POSIX alike;
# a backslash-separated literal is only a valid path on Windows.
isarc_test_path = Path("isarcasm-data") / "task_A_En_test.csv"
df_isarc_test = pd.read_csv(isarc_test_path)
df_isarc_test.head()

Unnamed: 0,text,sarcastic
0,"Size on the the Toulouse team, That pack is mo...",0
1,Pinball!,0
2,So the Scottish Government want people to get ...,1
3,villainous pro tip : change the device name on...,0
4,I would date any of these men 🥺,0


In [3]:
# Force the label column to an integer dtype; the text column is left untouched.
df_isarc_test = df_isarc_test.astype({'sarcastic': 'int'})

In [4]:
# Persist the test split without the index column.
# DataFrame.to_csv does not create missing folders, so make sure the output
# directory exists first; pathlib keeps the path portable across platforms.
isarc_out_dir = Path("iSarcasmEval")
isarc_out_dir.mkdir(parents=True, exist_ok=True)
df_isarc_test.to_csv(isarc_out_dir / "test.csv", index=False)

In [5]:
# Load the iSarcasmEval training CSV and preview the first rows.
# The path is assembled with pathlib so it resolves on Windows and POSIX alike;
# a backslash-separated literal is only a valid path on Windows.
isarc_train_path = Path("isarcasm-data") / "train.EN.csv"
df_isarc_train = pd.read_csv(isarc_train_path)
df_isarc_train.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",0.0,1.0,0.0,0.0,0.0,0.0
1,1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,1.0,0.0,0.0,0.0,0.0,0.0
2,2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",0.0,1.0,0.0,0.0,0.0,0.0
3,3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",1.0,0.0,0.0,0.0,0.0,0.0
4,4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,1.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Keep only the tweet text and its binary label.
# The explicit .copy() makes this an independent frame, so the rename/astype
# cells that follow modify it unambiguously instead of writing to something
# pandas may treat as a slice of df_isarc_train (the resulting
# SettingWithCopyWarning would be hidden by the global warnings filter).
df_isarc_new_train = df_isarc_train.loc[:, ['tweet', 'sarcastic']].copy()
df_isarc_new_train.head()

Unnamed: 0,tweet,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1


In [7]:
# Rename 'tweet' to 'text' so the training frame uses the same column name as
# the test frame. Reassigning the result (rather than inplace=True) yields a
# fresh frame and keeps the cell safe to re-run.
df_isarc_new_train = df_isarc_new_train.rename(columns={'tweet': 'text'})
df_isarc_new_train.head()

Unnamed: 0,text,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1


In [8]:
# Force the label column to an integer dtype; the text column is left untouched.
df_isarc_new_train = df_isarc_new_train.astype({'sarcastic': 'int'})

In [9]:
# Carve a validation set out of the iSarcasmEval training data: 85% stays in
# train, the remainder becomes validation. Stratifying on the label keeps the
# class ratio the same in both parts; the fixed seed makes the split repeatable.
df_isarc_train_final, df_isarc_val_final = train_test_split(
    df_isarc_new_train,
    train_size=0.85,
    stratify=df_isarc_new_train["sarcastic"],
    random_state=10,
)

In [10]:
# Preview the first rows of the iSarcasmEval training split.
df_isarc_train_final.head()

Unnamed: 0,text,sarcastic
2019,"Fuck, an Insomniac developed Wolverine game?! ...",0
1346,There are people who say there is no hope left...,0
832,"Well, what a way to ruin Easter. Thanks @Cadbu...",1
1130,excited to start my new hobby of standing in t...,0
2117,Nothing quite says “back to work January blues...,0


In [11]:
# Preview the first rows of the iSarcasmEval validation split.
df_isarc_val_final.head()

Unnamed: 0,text,sarcastic
147,Taxes are just the best and I cannot wait to ...,1
1322,sadly the only reason i stop making tiktoks is...,0
2251,Humans are the only animal that can’t tell fan...,0
911,"@miiniigun I feel that 100%, my anxiety stops ...",0
3279,biting my nails,0


In [12]:
# Number of rows per 'sarcastic' label in the iSarcasmEval training split.
df_isarc_train_final['sarcastic'].value_counts()

sarcastic
0    2210
1     737
Name: count, dtype: int64

In [13]:
# Number of rows per 'sarcastic' label in the iSarcasmEval validation split.
df_isarc_val_final['sarcastic'].value_counts()

sarcastic
0    391
1    130
Name: count, dtype: int64

In [14]:
# Persist the train/validation splits without the index column.
# DataFrame.to_csv does not create missing folders, so make sure the output
# directory exists first; pathlib keeps the paths portable across platforms.
isarc_out_dir = Path("iSarcasmEval")
isarc_out_dir.mkdir(parents=True, exist_ok=True)
df_isarc_train_final.to_csv(isarc_out_dir / "train.csv", index=False)
df_isarc_val_final.to_csv(isarc_out_dir / "val.csv", index=False)

## Sarcasm Corpus V2

In [15]:
# Load the three Sarcasm Corpus V2 files (GEN, HYP, RQ) and stack them into a
# single frame. Paths are assembled with pathlib so they resolve on Windows and
# POSIX alike; backslash-separated literals are only valid paths on Windows.
scv2_dir = Path("sarcasm_v2")
df_scv2_gen = pd.read_csv(scv2_dir / "GEN-sarc-notsarc.csv")
df_scv2_hyp = pd.read_csv(scv2_dir / "HYP-sarc-notsarc.csv")
df_scv2_rq = pd.read_csv(scv2_dir / "RQ-sarc-notsarc.csv")
# The index is not reset here, so each file's own row labels are carried over.
df_scv2 = pd.concat([df_scv2_gen, df_scv2_hyp, df_scv2_rq])
df_scv2.head()

Unnamed: 0,class,id,text
0,notsarc,1,"If that's true, then Freedom of Speech is doom..."
1,notsarc,2,Neener neener - is it time to go in from the p...
2,notsarc,3,"Just like the plastic gun fear, the armour pie..."
3,notsarc,4,So geology is a religion because we weren't he...
4,notsarc,5,Well done Monty. Mark that up as your first ev...


In [16]:
# Reduce the corpus to (text, sarcastic) with an integer label: sarc -> 1,
# notsarc -> 0.
# Series.map is used instead of Series.replace because replacing strings with
# ints relies on object-dtype downcasting, which pandas has deprecated. With
# map, any label outside the mapping becomes NaN and the astype(int) that
# follows raises instead of letting an unexpected value through.
# rename/assign each return a new frame, so df_scv2 itself is never modified.
df_scv2_final = (
    df_scv2.loc[:, ['text', 'class']]
    .rename(columns={"class": "sarcastic"})
    .assign(
        sarcastic=lambda frame: frame['sarcastic']
        .map({"sarc": 1, "notsarc": 0})
        .astype(int)
    )
)
df_scv2_final.head()

Unnamed: 0,text,sarcastic
0,"If that's true, then Freedom of Speech is doom...",0
1,Neener neener - is it time to go in from the p...,0
2,"Just like the plastic gun fear, the armour pie...",0
3,So geology is a religion because we weren't he...,0
4,Well done Monty. Mark that up as your first ev...,0


In [17]:
# Shuffle every row with a fixed seed, then discard the old row labels so the
# index runs 0..n-1 again.
# NOTE(review): this cell rebinds df_scv2_final from itself, so running it a
# second time reshuffles the already-shuffled frame.
_scv2_shuffled = df_scv2_final.sample(frac=1, random_state=10)
df_scv2_final = _scv2_shuffled.reset_index(drop=True)
df_scv2_final.head()

Unnamed: 0,text,sarcastic
0,"Okay, so if we only need 10 laws then they mus...",1
1,"Actually, scientists who discard the global fl...",0
2,Your posts read like gibberish when you fail t...,1
3,"call it what you will - cry, whine, pout, 'put...",1
4,And there is still no evidence on the table fo...,1


In [18]:
# Two-stage stratified split with a fixed seed.
# Stage 1: hold out 20% of the corpus as the test set.
df_scv2_train, df_scv2_final_test = train_test_split(
    df_scv2_final,
    train_size=0.80,
    stratify=df_scv2_final["sarcastic"],
    random_state=10,
)
# Stage 2: of the remaining 80%, keep 85% for training and use the rest for
# validation.
df_scv2_final_train, df_scv2_final_val = train_test_split(
    df_scv2_train,
    train_size=0.85,
    stratify=df_scv2_train["sarcastic"],
    random_state=10,
)

In [19]:
# Preview the first rows of the Sarcasm Corpus V2 training split.
df_scv2_final_train.head()

Unnamed: 0,text,sarcastic
2465,wow - sounds like a terrible idea. luckily we ...,0
8134,But a business is just a private citizen! Why ...,1
7808,"thewayitis, i would say is one of the best deb...",0
9111,"The lovely thing about our system of justice, ...",1
6473,And the wonders all around us show us how. . .,0


In [20]:
# Preview the first rows of the Sarcasm Corpus V2 test split.
df_scv2_final_test.head()

Unnamed: 0,text,sarcastic
6150,"was it a 'sexual based movement', as you've ca...",0
7005,"Do you even read what anyone posts? Try it, yo...",1
6498,"So funny... Like others have pointed out, this...",0
4082,What about the similarities between the museum...,0
3371,me whine? this entire f ing thread is devoted...,1


In [21]:
# Preview the first rows of the Sarcasm Corpus V2 validation split.
df_scv2_final_val.head()

Unnamed: 0,text,sarcastic
5092,You wouldn't. More likely you'll pursue your W...,1
7740,He did finally figure out how to use the quote...,1
7931,I think the FSM threw these guys a bone. They ...,0
2341,The Bible does not say that it is inerrant. An...,0
5754,Don't you ever get tired of acting like a litt...,1


In [22]:
# Number of rows per 'sarcastic' label in the Sarcasm Corpus V2 training split.
df_scv2_final_train['sarcastic'].value_counts()

sarcastic
0    3191
1    3190
Name: count, dtype: int64

In [23]:
# Number of rows per 'sarcastic' label in the Sarcasm Corpus V2 test split.
df_scv2_final_test['sarcastic'].value_counts()

sarcastic
0    939
1    939
Name: count, dtype: int64

In [24]:
# Number of rows per 'sarcastic' label in the Sarcasm Corpus V2 validation split.
df_scv2_final_val['sarcastic'].value_counts()

sarcastic
1    564
0    563
Name: count, dtype: int64

In [25]:
# Persist the Sarcasm Corpus V2 splits without the index column.
# DataFrame.to_csv does not create missing folders, so make sure the output
# directory exists first; pathlib keeps the paths portable across platforms.
scv2_out_dir = Path("Sarcasm_Corpus_V2")
scv2_out_dir.mkdir(parents=True, exist_ok=True)
df_scv2_final_train.to_csv(scv2_out_dir / "train.csv", index=False)
df_scv2_final_test.to_csv(scv2_out_dir / "test.csv", index=False)
df_scv2_final_val.to_csv(scv2_out_dir / "val.csv", index=False)

## Combining

In [26]:
def _stack_and_shuffle(frames):
    """Concatenate the given frames, shuffle all rows with seed 10, and renumber the index from 0."""
    stacked = pd.concat(frames)
    shuffled = stacked.sample(frac=1, random_state=10)
    return shuffled.reset_index(drop=True)


# Merge the matching splits of the two corpora (iSarcasmEval first, then
# Sarcasm Corpus V2) into combined train / test / validation frames.
df_train = _stack_and_shuffle([df_isarc_train_final, df_scv2_final_train])
df_test = _stack_and_shuffle([df_isarc_test, df_scv2_final_test])
df_val = _stack_and_shuffle([df_isarc_val_final, df_scv2_final_val])

In [27]:
# Number of rows per 'sarcastic' label in the combined training set.
df_train['sarcastic'].value_counts()

sarcastic
0    5401
1    3927
Name: count, dtype: int64

In [28]:
# Number of rows per 'sarcastic' label in the combined test set.
df_test['sarcastic'].value_counts()

sarcastic
0    2139
1    1139
Name: count, dtype: int64

In [29]:
# Number of rows per 'sarcastic' label in the combined validation set.
df_val['sarcastic'].value_counts()

sarcastic
0    954
1    694
Name: count, dtype: int64

In [30]:
# Persist the combined splits without the index column.
# DataFrame.to_csv does not create missing folders, so make sure the output
# directory exists first; pathlib keeps the paths portable across platforms.
combined_out_dir = Path("combined")
combined_out_dir.mkdir(parents=True, exist_ok=True)
df_train.to_csv(combined_out_dir / "train.csv", index=False)
df_test.to_csv(combined_out_dir / "test.csv", index=False)
df_val.to_csv(combined_out_dir / "val.csv", index=False)