# Data Preprocessing

In [1]:
import pandas as pd
from preprocess import Preprocess
from sklearn.model_selection import train_test_split
from arabert.preprocess import ArabertPreprocessor

In [2]:
# --- Paths and identifiers ---
arHate_dir = "./arHateDataset.csv"      # location of the raw CSV
dataset_name = 'arHAteDataset'
model_name = 'UBC-NLP/MARBERTv2'        # model id; its last path segment configures the AraBERT preprocessor

# --- Modelling constants ---
max_len = 128
LABELS = [0, 1]

# --- Column names used throughout the notebook ---
DATA_COLUMN = 'text'
LABEL_COLUMN = 'labels'

In [3]:
# Load the raw CSV and rename its 'Tweet' / 'Class' columns to the
# column names configured for this notebook.
column_map = {'Tweet': DATA_COLUMN, 'Class': LABEL_COLUMN}
df = pd.read_csv(arHate_dir).rename(columns=column_map)

In [4]:
# Reduce the majority class: randomly remove as many label-0 rows as there
# are label-1 rows.
# NOTE: this only yields an exactly balanced frame when label 0 has twice as
# many rows as label 1; exact per-split balancing is done after the
# train/test/val split further down.
df_0 = df[df[LABEL_COLUMN] == 0]
n_hs = len(df[df[LABEL_COLUMN] == 1])
# A fixed seed makes the dropped rows (and therefore every downstream split
# and saved CSV) reproducible under Restart & Run All.
idx2drop = df_0.sample(n_hs, random_state=42).index
df = df.drop(idx2drop)

In [5]:
# Clean the text column in two passes: first the project's custom Preprocess
# pipeline, then the AraBERT preprocessor configured for the chosen model.
preprocessor = Preprocess(df[DATA_COLUMN])
arabert_model_id = model_name.split("/")[-1]  # last path segment of the model id
arabert_prep = ArabertPreprocessor(arabert_model_id)

df[DATA_COLUMN] = preprocessor.preprocess()
df[DATA_COLUMN] = df[DATA_COLUMN].apply(arabert_prep.preprocess)



In [6]:
# Discard every row that has a missing value in any column
df = df.dropna(axis=0, how="any")

In [7]:
# Hold out 20% of the rows, then cut that hold-out in half to obtain
# equally sized test and validation splits (80 / 10 / 10 overall).
train, holdout = train_test_split(df, test_size=0.2, random_state=1)
test, val = train_test_split(holdout, test_size=0.5, random_state=1)

In [8]:

# Balancing Dataset
def _balance_binary(frame, label_col, seed=42):
    """Return a copy of ``frame`` with equally many label-0 and label-1 rows.

    The larger class is randomly downsampled (without replacement) to the
    size of the smaller one. When label 0 is the larger class (or the classes
    are already equal) the result is the seeded label-0 sample followed by
    all label-1 rows.

    Parameters
    ----------
    frame : pd.DataFrame
        Data containing a binary label column with values 0 and 1.
    label_col : str
        Name of the label column.
    seed : int, default 42
        ``random_state`` passed to ``DataFrame.sample``.

    Returns
    -------
    pd.DataFrame
        Label-0 rows followed by label-1 rows; the original index is kept.
    """
    negatives = frame[frame[label_col] == 0]
    positives = frame[frame[label_col] == 1]
    n_per_class = min(len(negatives), len(positives))

    negatives = negatives.sample(n_per_class, random_state=seed, replace=False)
    # Only touch label-1 rows when they are the majority; otherwise keep them
    # all, in their existing order. Without this branch, sampling label 0
    # would raise ValueError whenever a split has fewer 0s than 1s.
    if len(positives) > n_per_class:
        positives = positives.sample(n_per_class, random_state=seed, replace=False)

    return pd.concat([negatives, positives])


train = _balance_binary(train, LABEL_COLUMN)
test = _balance_binary(test, LABEL_COLUMN)
val = _balance_binary(val, LABEL_COLUMN)

In [9]:
# Sanity check: each split must hold exactly as many label-1 rows as label-0 rows
for part in (train, test, val):
    part_labels = part[LABEL_COLUMN]
    assert (part_labels == 1).sum() == (part_labels == 0).sum()

In [10]:
# Replace each split's leftover index with a fresh 0..n-1 range,
# discarding the old index instead of keeping it as a column.
train, test, val = (
    part.reset_index(drop=True) for part in (train, test, val)
)

In [11]:
# Display the balanced, re-indexed training split for a visual check
train

Unnamed: 0,text,labels
0,انا من رايي نشجع المراءه على اكمال دراستها وتح...,0
1,لو العيال يجروا ورا مستقبلهم زى ما بيجروا ورا ...,0
2,كلنا مسوول علاج مرض كورونا هو البقاء في المناز...,0
3,قبل كنت اقرا منشورا لاحد الاشخاص يعتبر نفسه عل...,0
4,من وين عم تجيب مصاري حتى تدفع قسط ابنك بلندن,0
...,...,...
17445,انت بس عم تكتب لتفرجي الناس انك بتطااع بالكتب ...,1
17446,خنازير البنا,1
17447,خنازير ومرتزقة قناة الجزيرة بدي انذر حكومة الد...,1
17448,لقدصار الحريري مثل هادي لامريكا الذريعة والايا...,1


In [12]:
# Saving Data to .csv Files
# Each split is written to its own file (the DataFrame index is included as
# the first column, which is the to_csv default).
train.to_csv("train-balanced.csv")
test.to_csv("test-balanced.csv")
# Write the validation split here; previously `test` was written a second
# time, so val-balanced.csv duplicated the test set and `val` was never saved.
val.to_csv("val-balanced.csv")