In [87]:
import pandas as pd
from googletrans import Translator
import re
import nltk
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
import pyarabic.araby as araby


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [88]:
# Read the dataset
# The path is relative to the notebook's working directory. Later cells read
# and overwrite the 'Tweet' column of this frame.
data = pd.read_excel('datasets/2018-E-c-Ar-train.xlsx')


In [89]:
# Keep an independent copy of the frame as loaded; the cells below reassign
# data['Tweet'] step by step, so `data` no longer holds the original text.
data_raw = data.copy()

In [90]:

# Regular expression pattern to match whole words made only of ASCII letters
# (A-Z, a-z). It does not match "any non-Arabic character": digits, punctuation
# and emojis are untouched by it. Because \b is a word boundary, Latin letters
# glued to another word character (digit, underscore, Arabic letter) are not matched.
english_pattern = r'\b[A-Za-z]+\b'

# Regular expression pattern to match a run of one or more consecutive digits.
# On str input Python's \d matches any Unicode decimal digit, not only 0-9.
digits_pattern = r'\d+'

# Arabic stop words from the NLTK stopwords corpus, stored as a set so that
# membership tests during filtering are fast
stop_words = set(stopwords.words('arabic'))


In [91]:
def count_english_words(data, top_n=10):
    """Print the most frequent English (ASCII-letter) tokens in ``data['Tweet']``.

    Every tweet is tokenized with ``word_tokenize`` and a token is counted when
    ``english_pattern`` matches at its start.

    Args:
        data: DataFrame with a 'Tweet' column of strings.
        top_n: How many of the most frequent tokens to print (default 10,
            the value that used to be hard-coded).

    Returns:
        None. The counts are printed, most frequent first.
    """
    # Tokenize the text into words
    tokens = data['Tweet'].apply(word_tokenize)

    # Keep only the tokens that look like English words, flattened across tweets
    english_words = [
        word
        for words in tokens
        for word in words
        if re.match(english_pattern, word)
    ]

    # An explicit dtype avoids the pandas "default dtype for empty Series"
    # warning when no English token is left (e.g. after they were stripped).
    word_counts = pd.Series(english_words, dtype=object).value_counts()

    # Print the top_n most frequent English words
    print(word_counts.head(top_n))



In [92]:
count_english_words(data)

AkramHosny         5
feeling            3
nadinenjeim        2
mshinqiti          2
D                  2
lt                 2
nowaaaash          1
Idgix              1
Nadaabdulwahab     1
AsmaAlmuhannadi    1
dtype: int64


In [93]:
# Delete every match of english_pattern (standalone ASCII-letter words) from
# each tweet. Only the matched letters are removed; surrounding spaces and
# punctuation stay in place.
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub(english_pattern, '', x))

In [94]:
count_english_words(data)

Series([], dtype: int64)


  english_words = pd.Series([word for words in english_words for word in words]).value_counts()


In [95]:
def count_digits(data):
    """Print the total number of ``digits_pattern`` matches over all tweets.

    With ``digits_pattern`` being a "one or more digits" pattern, each
    uninterrupted run of digits counts once, so the figure is the number of
    digit runs rather than the number of individual digit characters.

    Args:
        data: DataFrame with a 'Tweet' column of strings.

    Returns:
        None. The total is printed.
    """
    def matches_in(text):
        # Number of non-overlapping pattern matches inside one tweet
        return len(re.findall(digits_pattern, text))

    # Per-tweet match counts, summed over the whole column
    total = data['Tweet'].apply(matches_in).sum()

    print('Total digits count:', total)


In [96]:
count_digits(data)

Total digits count: 331


In [97]:
# Remove every match of digits_pattern (runs of digits) from each row of the
# "Tweet" column; nothing is inserted in place of the removed digits.
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub(digits_pattern, '', x))

In [98]:
count_digits(data)

Total digits count: 0


In [99]:
data.head()

Unnamed: 0,ID,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,2018-Ar-00259,ظلينا نتكلم ساعات ساعات رتبت فيها نفسي وبكيت ف...,1,0,0,1,0,0,0,1,1,0,0
1,2018-Ar-02696,كل سنه وانتي بخير ياقلبي وكل سنه وانتي سعيده ي...,0,0,0,0,1,1,1,0,0,0,0
2,2018-Ar-03596,البسطاء يمتلكون أرواح نادره جداتجدهم بمظهر متو...,0,0,0,0,0,1,1,0,0,0,0
3,2018-Ar-02999,مومعقول اللي قاعد يصير فيني هالايام يارب ماينت...,0,0,0,1,0,0,0,0,0,0,0
4,2018-Ar-02716,انا اكثر شخص متناقض بداخلي حب وكره وامل وقنوط ...,1,0,0,0,0,0,0,0,1,0,0


In [100]:
# Stop-word filter applied to one tweet at a time
def remove_stop_words(tweet):
    """Return ``tweet`` without the tokens that appear in ``stop_words``.

    The text is split with ``word_tokenize`` and the surviving tokens are
    joined back with single spaces, so the original spacing is not preserved.
    """
    kept = (word for word in word_tokenize(tweet) if word not in stop_words)
    return ' '.join(kept)

In [101]:
# Apply the function to each row of the "Tweet" column
# NOTE(review): this runs before remove_diacritics / normalize_arabic, so
# tokens are compared with stop_words in their raw spelling; a stop word
# written with diacritics or a different hamza/alef form would presumably
# survive -- confirm the order is intended.
data['Tweet'] = data['Tweet'].apply(remove_stop_words)

In [102]:
# Diacritic (tashkeel) stripping for a single tweet
def remove_diacritics(tweet):
    """Return ``tweet`` after passing it through ``araby.strip_tashkeel``."""
    stripped = araby.strip_tashkeel(tweet)
    return stripped

In [103]:
# Strip Arabic diacritics from each row of the "Tweet" column
data['Tweet'] = data['Tweet'].apply(remove_diacritics)

In [104]:
# Letter-form normalization for a single tweet
def normalize_arabic(tweet):
    """Return ``tweet`` with hamza forms, then alef forms, normalized.

    ``araby.normalize_hamza`` runs first and ``araby.normalize_alef`` is
    applied to its result; the exact character mappings are those of pyarabic.
    """
    return araby.normalize_alef(araby.normalize_hamza(tweet))

In [105]:
# Apply the hamza/alef normalization function to each row of the "Tweet" column
data['Tweet'] = data['Tweet'].apply(normalize_arabic)

In [106]:
# One or more emoji code points, each optionally followed by the invisible
# joiners that belong to an emoji sequence. Compiled once for reuse.
_EMOJI_PATTERN = re.compile(
    "(?:["
    "\U0001F1E6-\U0001F1FF"  # regional indicators (flag pairs)
    "\U0001F300-\U0001F5FF"  # misc symbols & pictographs (hearts, flowers, skin tones)
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2700-\u27BF"          # dingbats
    "]"
    "[\uFE0F\u200D]*"        # variation selector-16 / zero-width joiner after an emoji
    ")+"
)


def remove_emojis(text):
    """Return ``text`` with emoji characters removed.

    The previous character class only covered the emoticon, miscellaneous
    symbol and dingbat blocks, so common emojis such as hearts, flowers,
    vehicles or flags were left in the text. This version also covers the
    pictograph, transport, supplemental and flag ranges, and drops a
    variation selector or zero-width joiner when it directly follows an
    emoji, so no invisible residue is left behind. It covers the main emoji
    blocks and is not an exhaustive implementation of the Unicode emoji
    specification (keycap sequences, for instance, are not handled).

    Args:
        text: The string to clean.

    Returns:
        The string without the matched emoji sequences; all other characters
        are unchanged.
    """
    return _EMOJI_PATTERN.sub('', text)

In [107]:
# Apply the emoji-removal function to each row of the "Tweet" column
data['Tweet'] = data['Tweet'].apply(remove_emojis)

In [None]:
def remove_non_arabic(text):
    """Return ``text`` keeping only Arabic-script characters and plain spaces.

    Any run of characters that is neither in one of the listed Arabic code
    point ranges (U+0600-06FF, U+0750-077F, U+08A0-08FF, U+FB50-FDCF,
    U+FDF0-FDFF, U+FE70-FEFF) nor the space character U+0020 is deleted.
    Other whitespace such as newlines and tabs is therefore removed as well,
    and nothing is inserted in place of the deleted characters.
    """
    not_arabic_or_space = '[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDCF\uFDF0-\uFDFF\uFE70-\uFEFF ]+'
    return re.sub(not_arabic_or_space, '', text)

In [86]:
# Remove non-Arabic characters from the "Tweet" column
# NOTE(review): the cell that defines remove_non_arabic shows no execution
# count (In [None]) and this cell's count (86) is lower than the cells above
# it, so re-run the notebook top to bottom before relying on the result.
data['Tweet'] = data['Tweet'].apply(remove_non_arabic)

In [84]:
# save the DataFrame to an xlsx file
# Written to the working directory without the index column.
# NOTE(review): this cell's execution count (84) is older than the cleaning
# cells above (87-107), so the file on disk may come from an earlier run;
# re-run this cell last to export the current state of `data`.
data.to_excel('processed_data.xlsx', index=False)