In [2]:
import pandas as pd
import string
import glob
import re

In [3]:
# Load every raw comment export found in the dataset folder into one frame.
# - sorted(): glob returns matches in arbitrary order; sorting keeps the row
#   order (and so which duplicate is kept first) stable between runs.
# - train_clean.csv is written by this notebook into the same folder, so it is
#   skipped here; otherwise a re-run would ingest the cleaned output as input.
all_files = sorted(
    file for file in glob.glob("data/*.csv")
    if not file.replace("\\", "/").endswith("/train_clean.csv")
)
if not all_files:
    # Fail with an explicit message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError("no input CSV files found matching data/*.csv")

# concatenate all the dataframes in the list
df = pd.concat(
    [pd.read_csv(file, index_col=None, header=0) for file in all_files],
    axis=0,
    ignore_index=True,
)

In [4]:
# Preview the first five rows of the concatenated raw frame.
df.head(n=5)

Unnamed: 0,author,updated_at,like_count,text,video_id,public
0,@aichaabid1244,2023-12-17T11:28:07Z,0,Chouha Chouha Atfou Atfou,2E3NYSgdnCA,True
1,@animeworldamv5052,2023-09-17T16:55:38Z,0,والله ياخد فيكم الحق أ شوف تيفي اللي كتجيبو لي...,2E3NYSgdnCA,True
2,@user-vl7bi8cn4g,2023-09-09T22:35:46Z,0,امراءة لوكانت في الجزاءر عظم الله اجراكم,2E3NYSgdnCA,True
3,@wassifsiham3156,2023-08-30T10:34:17Z,0,هادي وحكمتوها واخنوش اش نديرو فيه هادي راه...,2E3NYSgdnCA,True
4,@FatihaOudrhiri-np2ll,2023-07-29T18:35:45Z,0,Il faut fermer ce youtoub car c est honteux de...,2E3NYSgdnCA,True


In [14]:
# Build the working frame: the non-null comment texts plus a constant `label` column set to 0.
new_df = df[["text"]].dropna().assign(label=0)

In [20]:
# Preview the first five rows of the text/label frame.
# NOTE(review): the saved output was captured after the filtering cell had run
# (execution count 20 vs 16); a fresh top-to-bottom run shows the unfiltered frame here.
new_df.head(n=5)

Unnamed: 0,text,label
1,والله ياخد فيكم الحق أ شوف تيفي اللي كتجيبو لي...,0
2,امراءة لوكانت في الجزاءر عظم الله اجراكم,0
3,هادي وحكمتوها واخنوش اش نديرو فيه هادي راه...,0
5,واك واك اعباد الله واك واك😂😂😂😂,0
6,سينياليو القناة لي كتشجع على الرذيلة والفساد,0


In [16]:
# Row filters, applied in sequence as one chain so the cell has a single
# assignment. Literal substrings are matched with regex=False so they are not
# interpreted as regular expressions.
new_df = (
    new_df
    # only keep the rows that contain at least one character from the Arabic block
    .loc[lambda d: d['text'].str.contains('[\u0600-\u06FF]')]
    # drop the rows that have more than 1 line of text
    .loc[lambda d: ~d['text'].str.contains('\n', regex=False)]
    # drop the rows that have fewer than 3 words; count whitespace-separated
    # tokens rather than spaces, so leading/trailing/double spaces do not
    # inflate the word count
    .loc[lambda d: d['text'].str.split().str.len() >= 3]
    # remove rows that have links
    .loc[lambda d: ~d['text'].str.contains('http', regex=False)]
    # remove rows that have mentions
    .loc[lambda d: ~d['text'].str.contains('@', regex=False)]
    # remove duplicate rows (the first occurrence of each text is kept)
    .drop_duplicates(subset=['text'])
)

In [None]:
# remove emojis from comment
# The pattern is compiled once here instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U00010000-\U0010FFFF"  # all supplementary planes: emoticons, pictographs,
                             # transport/map symbols, regional-indicator flags, ...
    "\u24C2-\u2BEF"          # circled letters, box drawing, misc symbols, dingbats, arrows
    "\u200d"                 # zero-width joiner (glues emoji sequences together)
    "\ufe0f"                 # variation selector-16 (emoji presentation)
    "\u231a"                 # watch
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward symbol
    "\u3030"                 # wavy dash
    "\u303d"                 # part alternation mark
    "\u3297"                 # circled ideograph "congratulation"
    "\u3299"                 # circled ideograph "secret"
    "]+"
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    The previous character class contained the range U+24C2..U+1F251, which
    covers most of the Basic Multilingual Plane and therefore also deleted
    ordinary text such as the Arabic Presentation Forms (e.g. U+FDFA, U+FEFB)
    and CJK characters. That range is replaced by the specific emoji code
    points it was standing in for, so non-emoji letters are left intact.

    Args:
        text: the comment string to clean.

    Returns:
        A new string without the matched characters.
    """
    return _EMOJI_PATTERN.sub('', text)

# strip ASCII punctuation from comment
def remove_punctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    return ''.join(char for char in text if char not in string.punctuation)

# strip ASCII digits from comment
def remove_numbers(text):
    """Return ``text`` without the characters in ``string.digits`` (0-9)."""
    # Mapping a code point to None makes str.translate delete that character.
    drop_digits = dict.fromkeys(map(ord, string.digits))
    return text.translate(drop_digits)

# remove arabic punctuation and arabic-script digits from comment
def remove_arabic_punctuation(text):
    """Return ``text`` without Arabic-script punctuation marks and digits.

    In addition to the marks and Extended Arabic-Indic digits (U+06F0-U+06F9)
    removed before, this also removes the standard Arabic-Indic digits
    (U+0660-U+0669) and the Arabic percent sign, decimal separator, thousands
    separator and five-pointed star, none of which are ASCII and so are not
    handled by the ASCII digit/punctuation helpers.

    Args:
        text: the comment string to clean.

    Returns:
        A new string with the listed characters deleted.
    """
    # Written as escapes so the exact code points are unambiguous.
    arabic_punctuation = (
        '\u061f\u060c\u061b\u06d4'                                      # ؟ ، ؛ ۔
        '\u066a\u066b\u066c\u066d'                                      # ٪ ٫ ٬ ٭
        '\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669'  # Arabic-Indic digits
        '\u06f0\u06f1\u06f2\u06f3\u06f4\u06f5\u06f6\u06f7\u06f8\u06f9'  # Extended Arabic-Indic digits
    )
    return text.translate(str.maketrans('', '', arabic_punctuation))

# strip ASCII (English) letters from comment
def remove_english_characters(text):
    """Return ``text`` without the 52 ASCII letters a-z and A-Z."""
    return ''.join(filter(lambda char: char not in string.ascii_letters, text))

def remove_repetition(text):
    """Collapse every run of one repeated character into a single occurrence.

    This applies to any character except a newline (letters, spaces, ...), so
    genuinely doubled letters are collapsed as well, not only elongations.
    """
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(r'\1', text)

def clean_text(text):
    """Run one comment through every cleaning helper and return the result.

    Order: emoji, ASCII punctuation, ASCII digits, Arabic punctuation,
    English letters, and finally repeated-character collapsing.
    """
    pipeline = (
        remove_emoji,
        remove_punctuation,
        remove_numbers,
        remove_arabic_punctuation,
        remove_english_characters,
        remove_repetition,
    )
    for step in pipeline:
        text = step(text)
    return text

# Clean the text of the filtered frame. This must run on `new_df`, not the raw
# `df`: cleaning `df['text']` would discard every row filter and the `label`
# column, and would pass missing (NaN) texts to clean_text, which only accepts
# strings.
new_df = new_df.assign(text=new_df['text'].apply(clean_text))

In [None]:
# Persist the cleaned data as CSV in the data folder (index written as the first column).
new_df.to_csv(path_or_buf="data/train_clean.csv")