### Load Data

In [24]:
import re
from pathlib import Path

import emoji
import pandas as pd

# Read the raw messages CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='../data/raw/messages.csv')


In [25]:
# Preview the first rows to sanity-check the loaded columns
df.head()

Unnamed: 0,message_id,sender_id,message_text,channel,date
0,67881c6f-1ed4-4c2f-aed7-1e37d4d13bfe,-1001102021238,https://youtu.be/5DBoEm-8kmA?si=LDLuEecNfULJVD...,DoctorsET,2023-12-18 17:04:02+00:00
1,5df99ca4-a74a-43b0-a2db-8b8a9989d594,-1001102021238,ዶክተርስ ኢትዮጵያ በ አዲስ አቀራረብ በ ቴሌቪዥን ፕሮግራሙን ለመጀመር ከ...,DoctorsET,2023-11-03 16:14:39+00:00
2,ee3ca8d8-7494-4676-a207-f3aab446fa81,-1001102021238,ሞት በስኳር \n\nለልጆቻችን የምናሲዘው ምሳቃ ሳናቀው እድሚያቸውን ይቀን...,DoctorsET,2023-10-02 16:37:39+00:00
3,93443f80-59fb-416e-8da6-06c9b9a7d78a,-1001102021238,ከ HIV የተፈወሰ ሰው አጋጥሟችሁ ያቃል ? ፈውስ እና ህክምና ?\n\nሙ...,DoctorsET,2023-09-16 07:54:32+00:00
4,d812334d-6306-459e-b451-d90e065dc33d,-1001102021238,በቅርብ ጊዜ በሃገራችን ላይ እየተስተዋለ ያለ የተመሳሳይ ፆታ ( Homos...,DoctorsET,2023-09-01 16:16:15+00:00


In [26]:
# Function to remove punctuation
def remove_punctuation(text):
    r"""Strip punctuation and symbol characters from ``text``.

    Every character that is neither a Unicode word character nor whitespace
    is removed. The underscore is removed explicitly as well: ``\w`` treats
    it as a word character, so the negated class alone would leave it behind.

    Args:
        text (str): Text to clean.

    Returns:
        str: ``text`` without punctuation; letters, digits and whitespace
        are kept unchanged.
    """
    return re.sub(r'[^\w\s]|_', '', text)

# Function to remove emojis
def remove_emojis(text):
    """Return ``text`` with every emoji deleted.

    Delegates to ``emoji.replace_emoji`` and substitutes an empty string for
    each emoji it finds.
    """
    emoji_free = emoji.replace_emoji(text, replace='')
    return emoji_free

# Function to clean message text
def clean_message_text(text):
    """Clean a single ``message_text`` value.

    Strings have their emojis removed. Any non-string value (e.g. NaN, which
    is a float) is mapped to an empty string. Punctuation is left untouched:
    ``remove_punctuation`` is not applied here.
    """
    if isinstance(text, str):
        return remove_emojis(text)
    # Missing or non-text values are normalised to an empty string
    return ''

### Removing Duplicates

In [27]:
# Keep only the first row seen for each message_id; df is modified in place
df.drop_duplicates(subset=['message_id'], keep='first', inplace=True)


### Standardizing Formats

In [28]:
# Parse timestamps as timezone-aware UTC. utc=True keeps the column a single
# datetime64[ns, UTC] dtype even if some values lack an offset or use a
# different one, so it stays comparable with a UTC "now" timestamp.
# Unparseable values become NaT (errors='coerce').
df['date'] = pd.to_datetime(df['date'], errors='coerce', utc=True)


In [29]:
# Check dtypes and non-null counts after the date conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2527 entries, 0 to 2526
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   message_id    2527 non-null   object             
 1   sender_id     2527 non-null   int64              
 2   message_text  2377 non-null   object             
 3   channel       2527 non-null   object             
 4   date          2527 non-null   datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), int64(1), object(3)
memory usage: 98.8+ KB


### Data Validation

In [30]:
# message_id must uniquely identify a row
assert df['message_id'].is_unique, "Duplicate message IDs found!"

# Discard rows dated in the future. The reference time is timezone-aware (UTC)
# so that it is comparable with the UTC values in df['date'].
now_utc = pd.Timestamp.now(tz='UTC')

# Keep rows dated at or before now; rows whose date is NaT also fail the
# comparison and are dropped with the future-dated ones
df = df.loc[df['date'].le(now_utc)]



In [31]:
# Apply cleaning to message_text column.
# df may be a filtered selection of the loaded frame at this point, and
# assigning a column on such a selection can raise SettingWithCopyWarning.
# assign() builds a new DataFrame instead of writing into the selection.
df = df.assign(message_text=df['message_text'].apply(clean_message_text))

In [32]:
# Create the output directory if it is missing, then write the cleaned data
# (to_csv does not create missing parent directories)
cleaned_path = Path('../data/cleaned/cleaned_data.csv')
cleaned_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(cleaned_path, index=False)
