In [1]:
import warnings
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
# Suppress every warning raised from here on: no category or module filter is
# given, so this applies to all later cells in the session.
warnings.filterwarnings("ignore")

## iSarcasmEval

In [2]:
# Load the test CSV from the isarcasm-data folder.
# The path is assembled with pathlib so the separator is right on every OS;
# a backslash inside a string is only a directory separator on Windows.
isarc_test_path = Path("isarcasm-data") / "task_A_En_test.csv"
df_isarc_test = pd.read_csv(isarc_test_path)
df_isarc_test.head()

Unnamed: 0,text,sarcastic
0,"Size on the the Toulouse team, That pack is mo...",0
1,Pinball!,0
2,So the Scottish Government want people to get ...,1
3,villainous pro tip : change the device name on...,0
4,I would date any of these men 🥺,0


In [3]:
# Cast the label column to an integer dtype.
df_isarc_test = df_isarc_test.astype({'sarcastic': 'int'})

In [4]:
# Write the test split. The output folder is created first so a fresh checkout
# does not fail, and pathlib keeps the path valid on non-Windows systems.
isarc_out_dir = Path("iSarcasmEval")
isarc_out_dir.mkdir(parents=True, exist_ok=True)
df_isarc_test.to_csv(isarc_out_dir / "test.csv", index=False)

In [5]:
# Load the training CSV from the isarcasm-data folder.
# pathlib supplies the correct separator on every OS (a literal backslash only
# works as a directory separator on Windows).
isarc_train_path = Path("isarcasm-data") / "train.EN.csv"
df_isarc_train = pd.read_csv(isarc_train_path)
df_isarc_train.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",0.0,1.0,0.0,0.0,0.0,0.0
1,1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,1.0,0.0,0.0,0.0,0.0,0.0
2,2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",0.0,1.0,0.0,0.0,0.0,0.0
3,3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",1.0,0.0,0.0,0.0,0.0,0.0
4,4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,1.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Keep only the tweet text and its binary label.
# An explicit copy is taken because this frame is modified afterwards (column
# rename and dtype cast); it must not be tied to df_isarc_train.
df_isarc_new_train = df_isarc_train.loc[:, ['tweet', 'sarcastic']].copy()
df_isarc_new_train.head()

Unnamed: 0,tweet,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1


In [7]:
# Rename 'tweet' to 'text' so the column name matches the test split.
df_isarc_new_train = df_isarc_new_train.rename(columns={'tweet': 'text'})
df_isarc_new_train.head()

Unnamed: 0,text,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1


In [8]:
# Cast the label column to an integer dtype.
df_isarc_new_train = df_isarc_new_train.astype({'sarcastic': 'int'})

In [9]:
# Stratified split on the label: 90% of the rows go to the training frame,
# the rest to the validation frame; the seed is fixed for reproducibility.
df_isarc_train_final, df_isarc_val_final = train_test_split(
    df_isarc_new_train,
    train_size=0.90,
    stratify=df_isarc_new_train["sarcastic"],
    random_state=10,
)

In [10]:
# Preview the first five rows of the training split.
df_isarc_train_final.head(n=5)

Unnamed: 0,text,sarcastic
1253,Good morning to everyone except Tristan Thomps...,0
3359,i hope every sweet girl that came in to my job...,0
222,NOT OLIVIA RODRIGO TRAITOR IN THE BACKGROUND #...,1
303,yes i have little interest in doing things tha...,1
2998,I need like 6 weeks on a vacation with my bffs...,0


In [11]:
# Preview the first five rows of the validation split.
df_isarc_val_final.head(n=5)

Unnamed: 0,text,sarcastic
1227,@micahgallen @rosanna_olsen I am just looking ...,0
2381,I yearn to sing like Christy Altomare so badly.,0
257,@McDonaldsUK Would love a cheeky McDonald’s br...,1
858,"wow, ok, unfollowing now. Was a big fan of put...",1
3321,Not a single conservative tear then for this a...,0


In [12]:
# Label distribution of the training split.
df_isarc_train_final["sarcastic"].value_counts()

sarcastic
0    2341
1     780
Name: count, dtype: int64

In [13]:
# Label distribution of the validation split.
df_isarc_val_final["sarcastic"].value_counts()

sarcastic
0    260
1     87
Name: count, dtype: int64

In [14]:
# Write the train and validation splits. The output folder is created first so
# a fresh checkout does not fail, and pathlib keeps the paths portable.
isarc_out_dir = Path("iSarcasmEval")
isarc_out_dir.mkdir(parents=True, exist_ok=True)
df_isarc_train_final.to_csv(isarc_out_dir / "train.csv", index=False)
df_isarc_val_final.to_csv(isarc_out_dir / "val.csv", index=False)

## Sarcasm Corpus V2

In [15]:
def get_splits(df, train_size=0.80, inner_train_size=0.90, random_state=10):
    """Build stratified train/test/validation splits from a labelled frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column and a ``class`` column. ``"sarc"`` is
        encoded as 1 and ``"notsarc"`` as 0; any other value is passed to
        ``int`` conversion unchanged.
    train_size : float, default 0.80
        Fraction of ``df`` kept for the train+validation portion; the
        remainder becomes the test split.
    inner_train_size : float, default 0.90
        Fraction of that portion kept as the final training split; the rest
        becomes the validation split.
    random_state : int, default 10
        Seed forwarded to both ``train_test_split`` calls.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_test, df_val)``. Note the order: test comes before
        validation. Each frame has the columns ``text`` and ``sarcastic``.

    Raises
    ------
    KeyError
        If ``text`` or ``class`` is missing from ``df``.
    ValueError
        If a label cannot be converted to an integer; the message lists the
        offending values.
    """
    label_to_int = {"sarc": 1, "notsarc": 0}

    # rename() returns a new frame, so the caller's DataFrame is never touched.
    data = df.loc[:, ['text', 'class']].rename(columns={"class": "sarcastic"})

    raw_labels = data["sarcastic"]
    # Series.map with an explicit fallback does the same substitution as
    # Series.replace but does not rely on replace() silently downcasting an
    # object column, which recent pandas versions warn about.
    converted = raw_labels.map(lambda label: label_to_int.get(label, label))
    try:
        data["sarcastic"] = converted.astype(int)
    except ValueError as exc:
        unknown = [label for label in pd.unique(raw_labels) if label not in label_to_int]
        raise ValueError(
            f"Column 'class' holds label(s) that cannot be converted to 0/1: {unknown!r}"
        ) from exc

    # First split: carve the test set off the full data.
    df_train, df_test = train_test_split(
        data,
        train_size=train_size,
        stratify=data["sarcastic"],
        random_state=random_state,
    )
    # Second split: carve the validation set off the remaining training data.
    df_train, df_val = train_test_split(
        df_train,
        train_size=inner_train_size,
        stratify=df_train["sarcastic"],
        random_state=random_state,
    )
    return df_train, df_test, df_val

### General

In [16]:
# Read the GEN file and split it. The path is built with pathlib so it
# resolves on any OS (a literal backslash is a separator only on Windows).
df_scv2_gen = pd.read_csv(Path("sarcasm_v2") / "GEN-sarc-notsarc.csv")
df_gen_train, df_gen_test, df_gen_val = get_splits(df_scv2_gen)

### Hyperbole

In [17]:
# Read the HYP file and split it. The path is built with pathlib so it
# resolves on any OS (a literal backslash is a separator only on Windows).
df_scv2_hyp = pd.read_csv(Path("sarcasm_v2") / "HYP-sarc-notsarc.csv")
df_hyp_train, df_hyp_test, df_hyp_val = get_splits(df_scv2_hyp)

### Rhetorical Questions

In [18]:
# Read the RQ file and split it. The path is built with pathlib so it
# resolves on any OS (a literal backslash is a separator only on Windows).
df_scv2_rq = pd.read_csv(Path("sarcasm_v2") / "RQ-sarc-notsarc.csv")
df_rq_train, df_rq_test, df_rq_val = get_splits(df_scv2_rq)

In [19]:
# Stack the GEN, HYP and RQ training splits, shuffle the rows with a fixed
# seed, then renumber the index from zero.
df_scv2_train = (
    pd.concat([df_gen_train, df_hyp_train, df_rq_train])
    .sample(frac=1, random_state=10)
    .reset_index(drop=True)
)
df_scv2_train.head()

Unnamed: 0,text,sarcastic
0,"I think we should put to a vote, the right of ...",1
1,Worship me and I'll bless you too. With money ...,1
2,"Ooops, typo. Sorry, my bad. I imagine that it ...",1
3,"I love this line. I assume by ""harm"" you mean ...",1
4,"Well by the Flood, I assume you are refering t...",0


In [20]:
# Stack the GEN, HYP and RQ test splits, shuffle the rows with a fixed seed,
# then renumber the index from zero.
df_scv2_test = (
    pd.concat([df_gen_test, df_hyp_test, df_rq_test])
    .sample(frac=1, random_state=10)
    .reset_index(drop=True)
)
df_scv2_test.head()

Unnamed: 0,text,sarcastic
0,"Sure, and a 18 year old marrying a 87 year old...",1
1,"oh cc, apparently, 10% of the population has t...",1
2,It's not about how she treated cops. It's abou...,0
3,with such eloquent and compelling arguments as...,1
4,"Let me know when we're leaving, cause your rig...",1


In [21]:
# Stack the GEN, HYP and RQ validation splits, shuffle the rows with a fixed
# seed, then renumber the index from zero.
df_scv2_val = (
    pd.concat([df_gen_val, df_hyp_val, df_rq_val])
    .sample(frac=1, random_state=10)
    .reset_index(drop=True)
)
df_scv2_val.head()

Unnamed: 0,text,sarcastic
0,"wow, i don't understand how this keeps happeni...",1
1,Sounds like my 9mil. is more secure under my b...,1
2,You make it seem as if you are doing me a favo...,1
3,whooooosh. emoticonXBye\r\nThat was the sound ...,1
4,i am equally astonished you two road apples do...,1


In [22]:
# Label distribution of the merged training split.
df_scv2_train["sarcastic"].value_counts()

sarcastic
0    3378
1    3377
Name: count, dtype: int64

In [23]:
# Label distribution of the merged test split.
df_scv2_test["sarcastic"].value_counts()

sarcastic
1    940
0    938
Name: count, dtype: int64

In [24]:
# Label distribution of the merged validation split.
df_scv2_val["sarcastic"].value_counts()

sarcastic
0    377
1    376
Name: count, dtype: int64

In [25]:
# Write the three merged splits. The output folder is created first so a fresh
# checkout does not fail, and pathlib keeps the paths portable across OSes.
scv2_out_dir = Path("Sarcasm_Corpus_V2")
scv2_out_dir.mkdir(parents=True, exist_ok=True)
df_scv2_train.to_csv(scv2_out_dir / "train.csv", index=False)
df_scv2_test.to_csv(scv2_out_dir / "test.csv", index=False)
df_scv2_val.to_csv(scv2_out_dir / "val.csv", index=False)

## Combining

In [26]:
# For each split, stack the iSarcasm frame on the Sarcasm Corpus V2 frame,
# shuffle the rows with a fixed seed and renumber the index from zero.
df_train = (
    pd.concat([df_isarc_train_final, df_scv2_train])
    .sample(frac=1, random_state=10)
    .reset_index(drop=True)
)
df_test = (
    pd.concat([df_isarc_test, df_scv2_test])
    .sample(frac=1, random_state=10)
    .reset_index(drop=True)
)
df_val = (
    pd.concat([df_isarc_val_final, df_scv2_val])
    .sample(frac=1, random_state=10)
    .reset_index(drop=True)
)

In [27]:
# Label distribution of the combined training split.
df_train["sarcastic"].value_counts()

sarcastic
0    5719
1    4157
Name: count, dtype: int64

In [28]:
# Label distribution of the combined test split.
df_test["sarcastic"].value_counts()

sarcastic
0    2138
1    1140
Name: count, dtype: int64

In [29]:
# Label distribution of the combined validation split.
df_val["sarcastic"].value_counts()

sarcastic
0    637
1    463
Name: count, dtype: int64

In [30]:
# Write the three combined splits. The output folder is created first so a
# fresh checkout does not fail, and pathlib keeps the paths portable.
combined_out_dir = Path("combined")
combined_out_dir.mkdir(parents=True, exist_ok=True)
df_train.to_csv(combined_out_dir / "train.csv", index=False)
df_test.to_csv(combined_out_dir / "test.csv", index=False)
df_val.to_csv(combined_out_dir / "val.csv", index=False)