In [1]:
import preprocessor as p 
import re
import wordninja
import csv
import pandas as pd


# Data Loading
def load_data(filename):
    """Load one raw stance CSV and drop the rows labelled 'NONE'.

    Columns are selected by position, not by name: column 0 is read as the
    text, column 1 as the target and column 2 as the stance label. The label
    strings are recoded as FAVOR -> 1, NONE -> 2, AGAINST -> 0, and every row
    whose recoded label equals 2 is removed.

    Args:
        filename: Path of an ISO-8859-1 encoded CSV file with at least three
            columns. Callers in this notebook access the result through the
            column names 'Tweet', 'Stance' and 'Target', so the file header
            is expected to use those names.

    Returns:
        pandas.DataFrame whose columns are ordered text, label, target. The
        column names come from the file header and the row index is left as
        it was before filtering (it is not reset).
    """
    # Parse the file once instead of once per column.
    raw = pd.read_csv(filename, usecols=[0, 1, 2], encoding='ISO-8859-1')
    text_col, target_col, label_col = raw.columns

    # Recode only the label column so text/target values are never touched.
    label = raw[[label_col]].replace(['FAVOR', 'NONE', 'AGAINST'], [1, 2, 0])
    concat_text = pd.concat([raw[[text_col]], label, raw[[target_col]]], axis=1)

    # Filter on the same column that was recoded (position 2 in the file).
    concat_text = concat_text[concat_text[label_col] != 2]

    return concat_text


# Data Cleaning
def data_clean(strings, norm_dict):
    """Tokenize and normalize a single piece of text.

    Steps, in order:
      1. Run the `preprocessor` library with the URL, EMOJI and RESERVED
         options to strip those items from the text.
      2. Remove the literal marker "#SemST".
      3. Keep only runs of letters/'#'/'@', runs of digits, and single
         characters from the set , . ! ? & / \\ < > = $ ; everything else
         (apostrophes, for example) is dropped.
      4. Lower-case every token.
      5. Replace a token found in `norm_dict` with its mapped value;
         otherwise, if it starts with '#' or '@', split it into words with
         `wordninja`.

    Args:
        strings: The text to clean (a single string).
        norm_dict: Mapping from a lower-cased token to its replacement.

    Returns:
        A flat list of tokens. A replacement taken from `norm_dict` is kept
        as one list item even when it contains several words.
    """
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.RESERVED)
    stripped = p.clean(strings)  # library call removes URLs, emoji, reserved words
    stripped = re.sub(r"#SemST", "", stripped)
    raw_tokens = re.findall(r"[A-Za-z#@]+|[,.!?&/\<>=$]|[0-9]+", stripped)

    tokens = []
    for raw_token in raw_tokens:
        token = raw_token.lower()
        if token in norm_dict:
            # Dictionary normalization wins over hashtag/mention splitting.
            tokens.append(norm_dict[token])
        elif token.startswith(("#", "@")):
            # Split compound hashtags / handles into their component words.
            tokens.extend(wordninja.split(token))
        else:
            tokens.append(token)

    return tokens


# Clean All Data
def clean_all(filename, norm_dict):
    """Load a raw CSV and clean every tweet and target in it.

    Args:
        filename: Path handed to `load_data`.
        norm_dict: Token-normalization mapping handed to `data_clean`.

    Returns:
        A tuple `(clean_data, label, x_target)` of three equally long lists:
        the cleaned tweets and cleaned targets as space-joined strings, and
        the values of the 'Stance' column.
    """
    frame = load_data(filename)
    tweets = frame['Tweet'].values.tolist()
    label = frame['Stance'].values.tolist()
    targets = frame['Target'].values.tolist()

    # Clean tweet and target row by row, tweet first, then its target.
    cleaned_pairs = [
        (' '.join(data_clean(tweet, norm_dict)), ' '.join(data_clean(tgt, norm_dict)))
        for tweet, tgt in zip(tweets, targets)
    ]
    clean_data = [pair[0] for pair in cleaned_pairs]
    x_target = [pair[1] for pair in cleaned_pairs]

    return clean_data, label, x_target

In [2]:
import json
# First normalization source: a JSON object, loaded as a dict.
# Presumably maps slang tokens to their expanded form - confirm against the file.
with open("noslang_data.json", "r") as f:
    data1 = json.load(f)

# Second normalization source: a tab-separated text file, one entry per line,
# with the token in the first field and its replacement in the second.
data2 = {}
with open("emnlp_dict.txt", "r") as f:
    for line in f:  # iterate lazily instead of materializing every line
        row = line.split('\t')
        if len(row) < 2:
            # Blank or tab-less line (e.g. a trailing newline): nothing to map.
            continue
        data2[row[0]] = row[1].rstrip()

# Merge both sources; on a key collision the entry from data2 wins.
normalization_dict = {**data1, **data2}

In [18]:
# Candidate whose files are processed below; change the index to switch
# (0 = bernie, 1 = biden, 2 = trump).
target = ('bernie', 'biden', 'trump')[1]

In [19]:
# Raw input files for the selected target: train, validation and test splits.
filename1, filename2, filename3 = (
    f'raw_train_{target}.csv',
    f'raw_val_{target}.csv',
    f'raw_test_{target}.csv',
)

# Each call returns (cleaned tweets, stance labels, cleaned targets).
(x_train, y_train, x_train_target) = clean_all(filename1, normalization_dict)
(x_val, y_val, x_val_target) = clean_all(filename2, normalization_dict)
(x_test, y_test, x_test_target) = clean_all(filename3, normalization_dict)

In [20]:
# Assemble one frame per split with the columns text, target, label.
df_train = pd.DataFrame(
    data={'text': x_train, 'target': x_train_target, 'label': y_train},
)
df_val = pd.DataFrame(
    data={'text': x_val, 'target': x_val_target, 'label': y_val},
)
df_test = pd.DataFrame(
    data={'text': x_test, 'target': x_test_target, 'label': y_test},
)

In [21]:
# Preview the cleaned training split (columns: text, target, label).
df_train

Unnamed: 0,text,target,label
0,joe biden is looking to gather votes from unsu...,joe biden,0
1,check out the latest podcast conversation betw...,joe biden,1
2,thank you secretary clinton for your endorseme...,joe biden,1
3,happening now joe biden kicking off hispanic h...,joe biden,1
4,thank you mayor kei sha bottoms for opening ou...,joe biden,1
...,...,...,...
5801,call me stubborn but i just don think i want t...,joe biden,0
5802,crazy liberals laughing out loud progressive p...,joe biden,0
5803,lots of students uw madison awaiting joe biden...,joe biden,1
5804,"other than the terrible grammar , is biden jus...",joe biden,0


In [22]:
# Preview the cleaned validation split (columns: text, target, label).
df_val

Unnamed: 0,text,target,label
0,joe gives us solutions . joe gives us a clear ...,joe biden,1
1,precisely . vote by mail is not a partisan iss...,joe biden,1
2,nebraska for biden virtual game event ! join u...,joe biden,1
3,just got to meet joe biden and thank him for h...,joe biden,1
4,love to see this ! fighting for the soul of ou...,joe biden,1
...,...,...,...
740,a lot of movement in the lines after the first...,joe biden,1
741,joe biden you should have really considered yo...,joe biden,1
742,djtj wishes he was hunter so that joe biden wa...,joe biden,1
743,"hey germany , denial of normal t listen to joe...",joe biden,0


In [23]:
# Preview the cleaned test split (columns: text, target, label).
df_test

Unnamed: 0,text,target,label
0,5 / 23 sat too 30 p edt michigan biden trainin...,joe biden,1
1,why is joe biden a good candidate ? tell the t...,joe biden,1
2,yes he does . thank you dr biden for all of yo...,joe biden,1
3,"attention high school students for biden , the...",joe biden,1
4,"i have so much respect a myk lob u char , and ...",joe biden,1
...,...,...,...
740,no one was even close in debate skills to kama...,joe biden,1
741,"kendra , joe , uou should get out of politics ...",joe biden,0
742,we all know how advanced the ukrainians are in...,joe biden,0
743,harris i think she interjected pointing out th...,joe biden,1


In [24]:
# Write each processed split to a CSV named after the selected target.
# NOTE(review): to_csv is called with its defaults, so the row index is written
# as an unnamed first column; readers of these files will see it as
# "Unnamed: 0" unless they pass index_col=0 - confirm this is intended.
df_train.to_csv(f'processed_train_{target}.csv')
df_val.to_csv(f'processed_val_{target}.csv')
df_test.to_csv(f'processed_test_{target}.csv')