In [None]:
pip install pyarabic

In [None]:
pip install nltk

In [None]:
#importing pandas
import pandas as pd

#importing the pyarabic library
import pyarabic.araby as araby

# Load the raw dataset from an Excel workbook.
# NOTE(review): the path is Colab-specific and presumably needs Google Drive mounted at /content/drive — confirm.
df = pd.read_excel("/content/drive/MyDrive/original_dataset.xlsx")
# Rename the columns; assumes the sheet has exactly two columns (text first, label second) — TODO confirm.
df.columns = ["statement", "label"]
# Show the first five rows.
df.head()

In [None]:
df.shape

In [None]:
# Count the null values in each column and show the counts as a one-column table
null_counts = df.isnull().sum()
missing = pd.DataFrame({'missing': null_counts})
missing

In [None]:
# Remove rows where 'statement' is missing (only that column is checked; 'label' is not)
df = df.dropna(subset=['statement'])

In [None]:
df.shape

In [None]:
# Shuffle all rows. The fixed seed makes the shuffled order reproducible
# across Restart & Run All; the original index labels are kept.
df = df.sample(frac=1, random_state=42)

In [None]:

import re

def remove_mentions(text):
    """Return ``text`` with every @mention deleted.

    A mention is an ``@`` followed by one or more ASCII letters, digits,
    underscores or hyphens. The match is removed outright, so whitespace
    that surrounded the mention stays where it was.
    """
    mention_pattern = re.compile(r"@[A-Za-z0-9_-]+")
    return mention_pattern.sub("", text)

# Strip @mentions from every row; the cleaned text overwrites the 'statement' column
df["statement"] = df["statement"].apply(remove_mentions)

# Preview the first rows after the change
df.head()

In [None]:

def remove_urls(text):
    """Return ``text`` with URLs deleted.

    Everything from ``http`` up to the next whitespace character is removed.
    Matching stops at any whitespace (space, tab, newline, ...), not only at
    a plain space, so a URL at the end of a line does not swallow the text
    that starts the following line.
    """
    return re.sub(r"http\S+", "", text)

# Strip URLs from every row; the cleaned text overwrites the 'statement' column
df["statement"] = df["statement"].apply(remove_urls)

# Preview the first rows after the change
df.head()

In [None]:
def remove_hashtags(text):
    """Return ``text`` with every hashtag deleted.

    A hashtag is a ``#`` together with the run of non-whitespace characters
    that follows it, so the tagged word is removed along with the ``#`` sign.
    A ``#`` that is directly followed by whitespace (or ends the string) is
    left untouched.
    """
    hashtag_pattern = re.compile(r"#\S+")
    return hashtag_pattern.sub("", text)

# Strip hashtags from every row; the cleaned text overwrites the 'statement' column
df["statement"] = df["statement"].apply(remove_hashtags)

# Preview the first rows after the change
df.head()

In [None]:

def remove_arabic_punctuation(text):
    """Return ``text`` with punctuation characters blanked out.

    Each listed character is replaced by a single space (not deleted), so
    the words on either side of a mark stay separated. The list covers
    Arabic punctuation, the Arabic marks in the U+06D6-U+06ED range, common
    ASCII punctuation and brackets, and typographic quotes/guillemets.
    """
    marks = (
        "\u060C\u061B\u061F\u066A\u066B\u066C\u066D\u06D4"
        "\u06D6\u06D7\u06D8\u06D9\u06DA\u06DB\u06DC\u06DD"
        "\u06DE\u06DF\u06E0\u06E1\u06E2\u06E3\u06E4\u06E5"
        "\u06E6\u06E7\u06E8\u06E9\u06EA\u06EB\u06EC\u06ED"
        "\u0021\u002E\u002C\u003F\u003B\u003A\u0028\u0029"
        "\u005B\u005D\u007B\u007D\u003C\u003E\u0022\u0027"
        "\u2018\u2019\u201C\u201D\u00AB\u00BB\u2039\u203A"
    )

    # Brackets and similar characters are regex-special, so the list is
    # escaped before it is wrapped in a character class.
    char_class = f"[{re.escape(marks)}]"
    return re.sub(char_class, ' ', text)

# Replace punctuation with spaces in every row; the result overwrites the 'statement' column
df["statement"] = df["statement"].apply(remove_arabic_punctuation)

# Preview the first rows after the change
df.head()

In [None]:

def normalize_arabic(text):
    """Fold common Arabic letter variants onto one canonical letter each.

    The substitutions are applied one after another, in the order listed,
    and the rewritten string is returned.
    """
    letter_folds = {
        r"[ةه]": "ه",    # ة and ه both become ه
        r"[أاإآ]": "ا",   # أ, إ and آ (and ا itself) become ا
        r"[يى]": "ي",    # ى and ي both become ي
        r"[ؤئ]": "ء",    # ؤ and ئ both become ء
    }

    normalized = text
    for variants, canonical in letter_folds.items():
        normalized = re.sub(variants, canonical, normalized)
    return normalized

# Normalize letter variants in every row; the result overwrites the 'statement' column
df["statement"] = df["statement"].apply(normalize_arabic)
print(normalize_arabic("أنا يحيى أخ هالة مؤيد للدراسة"))
# Display the updated DataFrame
df.head()

In [None]:


def remove_repeated_madd(text):
    """Collapse runs of a repeated elongation letter to a single letter.

    A run is two or more consecutive copies of the same letter among
    ا, و and ي. Because the threshold is two, a doubled letter that is part
    of a word's normal spelling is collapsed as well.
    """
    def keep_first_letter(run):
        # Every character of the matched run is the same letter,
        # so the first one stands in for the whole run.
        return run.group(0)[0]

    return re.sub(r"(ا{2,}|و{2,}|ي{2,})", keep_first_letter, text)

# Collapse repeated elongation letters in every row; the result overwrites the 'statement' column
df["statement"] = df["statement"].apply(remove_repeated_madd)
print(remove_repeated_madd("كووووورنااااااييي"))
# Display the updated DataFrame
df.head()

In [None]:
def clean_arabic_text(text):
    """Strip tashkeel, tatweel and digits from ``text``.

    Tashkeel and tatweel are removed with the pyarabic helpers. Every decimal
    digit is then replaced by a single space. The Unicode-aware digit class
    is used rather than an ASCII-only range, so Arabic-Indic digits
    (U+0660-U+0669) are blanked out along with 0-9.

    Returns the cleaned string.
    """
    text = araby.strip_tashkeel(text)  # Remove tashkeel
    text = araby.strip_tatweel(text)  # Remove tatweel
    # For str patterns, \d matches any Unicode decimal digit, not just 0-9.
    text = re.sub(r"\d", " ", text)
    return text

# Strip tashkeel, tatweel and digits from every row; the result overwrites the 'statement' column
df["statement"] = df["statement"].apply(clean_arabic_text)
print(clean_arabic_text("أنَا يحيى وعمري 24"))
# Display the updated DataFrame
df.head()

أنا يحيى وعمري   


Unnamed: 0,statement,label
3880,انا غبي غبي لا استطيع التعلم قبيح غير جذاب...,Suicidal
761,اختفي البوم Flight of the Conchords من Spotify...,Normal
5939,خاءفه للغايه لانني مصاب بفيروس هانتا وقد انتق...,Anxiety
15005,الان الاخ الذي ارسل لي رساله نصيه كان لديه طف...,Stress
7001,تحاول معرفه كيف قد اموت يوم الاحد حيث ساترك و...,Suicidal


In [None]:
def word_count(text):
    """Return how many whitespace-separated tokens ``text`` contains."""
    tokens = text.split()
    return len(tokens)

# Keep only the rows whose statement has at least 3 whitespace-separated words
has_enough_words = df["statement"].apply(word_count) >= 3
df = df[has_enough_words]
df.shape

(20456, 2)

In [None]:
# Write the cleaned DataFrame to an Excel workbook at a relative path
# (index=False leaves the row index out of the file)
output_file = "cleaned_original_dataset.xlsx"
df.to_excel(output_file, index=False)

print(f"DataFrame saved to {output_file}")

In [None]:
# Copy the saved workbook into the Drive folder
# NOTE(review): assumes Google Drive is mounted at /content/drive — confirm.
!cp cleaned_original_dataset.xlsx /content/drive/MyDrive