In [None]:
%pip install nltk

# Libraries

In [1]:
import pandas as pd
import re
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
from pyarabic import araby

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\WalidAlshamrani\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\WalidAlshamrani\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\WalidAlshamrani\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\WalidAlshamrani\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Data Loading

In [4]:
# Load the raw tweets; later cells read the 'tweet' and 'label' columns from this frame.
RAW_TWEETS_CSV = "AAAA sports_tweets.csv"
df = pd.read_csv(RAW_TWEETS_CSV)

### Handling missing values

In [5]:
# Check for missing values: the literal string 'NaN' is first converted to a
# real NaN so that it is counted, then the null count per column is printed.
df = df.replace(to_replace='NaN', value=np.nan)
null_counts = df.isna().sum()
print(null_counts)

tweet    0
label    0
dtype: int64


###  Duplicate values removal

In [6]:
# Remove tweets whose text is an exact duplicate, keeping the first occurrence,
# and report how many rows were kept and how many were dropped.
original_count = len(df)

# Reassign instead of mutating in place so the cell reads as a plain
# input -> output step. The index is deliberately not reset, so the surviving
# rows keep their original labels (with gaps where duplicates were dropped).
df = df.drop_duplicates(subset=['tweet'], keep='first')

cleaned_count = len(df)
deleted_count = original_count - cleaned_count

print(f"Original number of tweets: {original_count}")
# This line reports the rows that remain, not the rows that were deleted.
print(f"Number of tweets after removing duplicates: {cleaned_count}")
print("=====================================")
print(f"Number of duplicate tweets deleted: {deleted_count}")

Original number of tweets: 3044
Number of duplicate tweets deleted: 2316
Number of duplicate tweets deleted: 728 ๐๏ธ


### Removal of non-Arabic letters and special symbols

In [7]:
# Cleaning Arabic text by removing non-Arabic characters, and normalizing whitespace
def clean_arabic_text(text):
    """Reduce ``text`` to its Arabic-block words, separated by single spaces.

    Characters outside U+0600-U+06FF and Arabic punctuation/symbols are treated
    as word boundaries, so words that were separated only by ``_``, ``#``,
    punctuation or an emoji are not glued together. Combining marks
    (diacritics) and invisible formatting characters are deleted without
    leaving a gap, because they occur inside words.

    Args:
        text: Raw tweet text. Any non-string value (for example NaN coming
            from an empty cell) is treated as empty text.

    Returns:
        str: The remaining Arabic words joined by single spaces, or ``""``
        when nothing Arabic is left.
    """
    if not isinstance(text, str):
        return ""
    # Delete in-word characters outright: Arabic combining marks, the Arabic
    # letter mark, and zero-width / bidi formatting characters.
    text = re.sub(
        r"[\u0610-\u061A\u061C\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4"
        r"\u06E7\u06E8\u06EA-\u06ED\u200B-\u200F\u202A-\u202E\u2060-\u2069\uFEFF]",
        "",
        text,
    )
    # Everything outside the Arabic block (U+0600-U+06FF) becomes a space.
    text = re.sub(r"[^\u0600-\u06FF\s]", " ", text)
    # What is still non-word here is Arabic punctuation or symbols: also a boundary.
    text = re.sub(r'[^\w\s]', " ", text)
    # Collapse runs of whitespace and trim both ends.
    return " ".join(text.split())

# Stage 1: derive 'cleaned_tweet' from the raw 'tweet' text, one tweet at a time.
df['cleaned_tweet'] = df['tweet'].map(clean_arabic_text)

###  Emojis removal

In [8]:
# Removing emojis from tweet
# Compiled once when the cell runs instead of on every call.
# The ranges list emoji-bearing blocks explicitly. A single wide range from
# U+24C2 to U+1F251 would also delete ordinary text in between (CJK, Arabic
# presentation forms, ...).
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F000-\U0001F2FF"  # tiles, cards, enclosed characters, flag letters
    "\U0001F900-\U0001FAFF"  # supplemental and extended pictographs
    "\u24C2"                 # circled M
    "\u25A0-\u25FF"          # geometric shapes
    "\u2600-\u27BF"          # miscellaneous symbols and dingbats
    "\u2934\u2935"           # curved arrows
    "\u2B00-\u2BFF"          # stars and large arrows
    "\u3030\u303D\u3297\u3299"  # wavy dash, part alternation mark, circled ideographs
    "\uFE00-\uFE0F"          # variation selectors attached to emoji
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only the characters matched by ``_EMOJI_PATTERN`` are deleted; letters of
    any script, digits, punctuation and whitespace are left exactly as they
    are (no whitespace clean-up is done here).

    Args:
        text (str): Text that may contain emoji.

    Returns:
        str: ``text`` without the matched emoji characters.
    """
    return _EMOJI_PATTERN.sub(r"", text)

# Stage 2: derive 'no_emojis_tweet' from 'cleaned_tweet'.
df['no_emojis_tweet'] = df['cleaned_tweet'].map(remove_emojis)

### Normalization

In [9]:
# Function to normalize different forms of Hamza and Alef
def normalize_hamza(text):
    """Fold hamza-carrying letters and Alef Wasla onto their bare base letter.

    Args:
        text (str): Text to normalize.

    Returns:
        str: ``text`` with the five mapped letters replaced; every other
        character is left unchanged.
    """
    folding_table = str.maketrans({
        "\u0623": "\u0627",  # Alef with Hamza above -> bare Alef
        "\u0625": "\u0627",  # Alef with Hamza below -> bare Alef
        "\u0671": "\u0627",  # Alef Wasla -> bare Alef
        "\u0624": "\u0648",  # Waw with Hamza -> Waw
        "\u0626": "\u064a",  # Ya with Hamza -> Ya
    })
    # No replacement target is itself a key, so one pass equals chained replaces.
    return text.translate(folding_table)

# Stage 3: derive 'normalized_tweet' from 'no_emojis_tweet' by folding hamza forms.
df['normalized_tweet'] = df['no_emojis_tweet'].map(normalize_hamza)

### Links removal

In [10]:
# Removing the links
def remove_links(text):
    """Lower-case ``text`` and strip URLs, named HTML entities and digit runs.

    The removals are applied in that order and each match is replaced by the
    empty string, so the surrounding whitespace is left as it was.

    Args:
        text (str): Text to process.

    Returns:
        str: The lower-cased text with the matched pieces removed.
    """
    removals = (
        (r"http\S+|www\S+|https\S+", re.MULTILINE),  # URLs
        (r'&[a-z]+;', 0),                            # named HTML entities
        (r"\d+", 0),                                 # runs of digits
    )
    result = text.lower()
    for pattern, flags in removals:
        result = re.sub(pattern, '', result, flags=flags)
    return result

# Stage 4: derive 'no_link_tweet' from 'normalized_tweet'.
df['no_link_tweet'] = df['normalized_tweet'].map(remove_links)

### Tokenization and stop word removal

In [11]:
# Function to tokenize text and remove Arabic stopwords
_ARABIC_STOP_WORDS = None  # filled on first use by _arabic_stop_words()


def _arabic_stop_words():
    """Return the Arabic stop-word set, loading it from NLTK only once."""
    global _ARABIC_STOP_WORDS
    if _ARABIC_STOP_WORDS is None:
        raw_words = set(stopwords.words('arabic'))
        # The tweets have already been passed through normalize_hamza, so a
        # stop word spelled with a hamza form can never match them as-is.
        # Keep the raw spellings and add their hamza-folded forms as well.
        # NOTE(review): assumes NLTK's list contains hamza spellings - confirm.
        folded_words = {normalize_hamza(word) for word in raw_words}
        _ARABIC_STOP_WORDS = frozenset(raw_words | folded_words)
    return _ARABIC_STOP_WORDS


def tokenize_tweet(text):
    """Split ``text`` into tokens and drop Arabic stop words.

    Tokenization uses NLTK's ``word_tokenize``. The stop-word set is built on
    the first call and reused afterwards instead of being reloaded for every
    tweet.

    Args:
        text (str): Pre-cleaned tweet text.

    Returns:
        list[str]: Tokens of ``text`` that are not in the stop-word set, in
        their original order.
    """
    stop_words = _arabic_stop_words()
    return [word for word in word_tokenize(text) if word not in stop_words]

# Stage 5: derive 'tokenized_tweet' (a list of tokens per row) from 'no_link_tweet'.
df['tokenized_tweet'] = df['no_link_tweet'].map(tokenize_tweet)


### Lemmatization

In [12]:
# Function to map tokens to lemmas without ever dropping a token
def lemmatize_tweet(tokens, lemmatizer=None):
    """Return one output token per input token, optionally lemmatized.

    The previous implementation filtered tokens through ``araby.is_stem`` when
    that attribute existed, which would delete words rather than transform
    them, and otherwise returned the input list itself. This version never
    removes tokens and always returns a new list.

    Args:
        tokens: Iterable of token strings.
        lemmatizer: Optional callable ``str -> str`` applied to every token.
            With the default ``None`` the tokens are returned unchanged.
            TODO: pass a real Arabic lemmatizer here once one is chosen.

    Returns:
        list: A new list with the same length and order as ``tokens``.
    """
    if lemmatizer is None:
        return list(tokens)
    return [lemmatizer(token) for token in tokens]

# Stage 6: derive 'lemmatized_tweet' from the token lists in 'tokenized_tweet'.
df['lemmatized_tweet'] = df['tokenized_tweet'].map(lemmatize_tweet)

### Verification and Comparison


In [13]:
# Print every preprocessing stage for the first few tweets so the effect of
# each step can be compared by eye.
# Rows are taken by position (head), not by index label: duplicate removal
# leaves gaps in the labels, so a lookup such as df['tweet'][i] raises KeyError
# whenever label i was dropped. head() also copes with fewer than five rows.
stage_columns = [
    ("Original:", 'tweet'),
    ("Cleaned:", 'cleaned_tweet'),
    ("No emojis:", 'no_emojis_tweet'),
    ("Normalized:", 'normalized_tweet'),
    ("No Link:", 'no_link_tweet'),
    ("Tokenized:", 'tokenized_tweet'),
    ("Lemmatized:", 'lemmatized_tweet'),
]

print("\nExample of cleaned tweet:")
for _, row in df.head(5).iterrows():
    for stage_label, column in stage_columns:
        print(stage_label, row[column])
    print("-" * 20)


Example of cleaned tweet:
Original:  ุชุฎูู ุจุณ ุชุฎูู ุงุนุชูุฏ ุนูู ุบุฑูุฒูุงู ูููุฑุงุชุง ุงุซููู ูููู ููุชููู ุงูุตูุงุญูู ูู ุฒูุงู ูุงุญุฏ ูุฏู ููุณู ูุงุญุฏ ูุงูุซุงูู ุงูุชูู ูู ูู ุจุงุน ุงููุงุฏู ูุฑุฌุน ูููุณุท ูููู ูููุฑูุชู ูููู ุงูุณู ุจุงููู ุงุนุชูุฏ ุนูู ููุจ ููุง ุงุนุชูุฏ ุนููู ูููุฑูุชู ูุงุดููุง ููู ุดู ูู ุจุนุฏ ููุณู  ูุฑุฌุน ููุฏูุงุน ุฎูููุฒ ูุณุงููุชุด ุฎูููุฒ ุญุฑุงู ููุนุจ ูุฑู ูุฏ
Cleaned: ุชุฎูู ุจุณ ุชุฎูู ุงุนุชูุฏ ุนูู ุบุฑูุฒูุงู ูููุฑุงุชุง ุงุซููู ูููู ููุชููู ุงูุตูุงุญูู ูู ุฒูุงู ูุงุญุฏ ูุฏู ููุณู ูุงุญุฏ ูุงูุซุงูู ุงูุชูู ูู ูู ุจุงุน ุงููุงุฏู ูุฑุฌุน ูููุณุท ูููู ูููุฑูุชู ูููู ุงูุณู ุจุงููู ุงุนุชูุฏ ุนูู ููุจ ููุง ุงุนุชูุฏ ุนููู ูููุฑูุชู ูุงุดููุง ููู ุดู ูู ุจุนุฏ ููุณู ูุฑุฌุน ููุฏูุงุน ุฎูููุฒ ูุณุงู

### Final Output

In [14]:
# Discard the raw text and every intermediate stage column; only the final
# token lists (and any columns not listed here) remain afterwards.
intermediate_columns = [
    'tweet',
    'cleaned_tweet',
    'no_emojis_tweet',
    'normalized_tweet',
    'no_link_tweet',
    'tokenized_tweet',
]
df = df.drop(columns=intermediate_columns)

In [15]:
# Rebuild 'tweet' as plain text by joining each row's final tokens with a single space.
df['tweet'] = df['lemmatized_tweet'].map(' '.join)


In [16]:
# Final column layout: cleaned text first, then its token list, then the label.
new_order = ['tweet', 'lemmatized_tweet', 'label']

# Select by label so a missing column fails loudly instead of being skipped.
df = df.loc[:, new_order]
df

Unnamed: 0,tweet,lemmatized_tweet,label
0,ุชุฎูู ุชุฎูู ุงุนุชูุฏ ุนูู ุบุฑูุฒูุงู ูููุฑุงุชุง ูููู ููุชูู...,"[ุชุฎูู, ุชุฎูู, ุงุนุชูุฏ, ุนูู, ุบุฑูุฒูุงู, ูููุฑุงุชุง, ููู...",hate
1,ุจุทููุงุช ุนูุฏ ููุฏ ุจู ูุงูู ุฎูุงู ุณููุงุช ููุท ุญุชู ุงูุงู...,"[ุจุทููุงุช, ุนูุฏ, ููุฏ, ุจู, ูุงูู, ุฎูุงู, ุณููุงุช, ููุท,...",not
2,ุงููู ููุนุจูู ุฒู ุฎุฏูู ูุณุชุงูููุง ุงูุชุดููู ุงูุงุณุงุณู ุง...,"[ุงููู, ููุนุจูู, ุฒู, ุฎุฏูู, ูุณุชุงูููุง, ุงูุชุดููู, ุงู...",hate
3,ุงุฏุงุก ูุงุดู ุทุงููู ุงูููู ุญุณููุง ุจุงูุงุญุจุงุท ุงููุจุงุฑุงุฉ ...,"[ุงุฏุงุก, ูุงุดู, ุทุงููู, ุงูููู, ุญุณููุง, ุจุงูุงุญุจุงุท, ุงู...",hate
4,ุงุฏุงุก ูุงุดู ุฎุฏูู ุงูููู ุญุณููุง ุจุงูุงุญุจุงุท ุงููุจุงุฑุงุฉ ุง...,"[ุงุฏุงุก, ูุงุดู, ุฎุฏูู, ุงูููู, ุญุณููุง, ุจุงูุงุญุจุงุท, ุงูู...",hate
...,...,...,...
3037,ูุฑุฉ ุซุงููุฉ ุทุงููู ูุญุทููู ุงูุขูุงู ูุงุฒู ุญู ุฌุฐุฑู ุงูู...,"[ูุฑุฉ, ุซุงููุฉ, ุทุงููู, ูุญุทููู, ุงูุขูุงู, ูุงุฒู, ุญู, ...",hate
3039,ุงุฎุฎ ูุงูููุฑ ุฏุงูู ุงุฐุง ุฎุณุฑ ุงูููุงู ุงุจูู ูุฎูููู ุงุถุญ...,"[ุงุฎุฎ, ูุงูููุฑ, ุฏุงูู, ุงุฐุง, ุฎุณุฑ, ุงูููุงู, ุงุจูู, ูุฎ...",hate
3040,ุงูููุตูู ุจูุง ูุฑุงูุฉ ูุฎุฑุจ ุดู,"[ุงูููุตูู, ุจูุง, ูุฑุงูุฉ, ูุฎุฑุจ, ุดู]",hate
3041,ูุฑุจูุง ุงูู ุงูุฏูุฑู ุงูุงุณุชุซูุงู ููุช ูุฑุชุฏูุง ุซูุจ ุณุจุนู...,"[ูุฑุจูุง, ุงูู, ุงูุฏูุฑู, ุงูุงุณุชุซูุงู, ููุช, ูุฑุชุฏูุง, ุซ...",not


In [None]:
# Export the cleaned DataFrame to a new CSV file. The name differs from the
# input file loaded at the top, so the raw data is not overwritten; the index
# is not written.
CLEANED_TWEETS_CSV = 'AAAAA sports_tweets.csv'
df.to_csv(CLEANED_TWEETS_CSV, index=False)