In [1]:
import os, sys
notebook_dir = os.getcwd()
sys.path.append(os.path.abspath(os.path.join(notebook_dir,'..')))

In [3]:
from scripts.data_load import *
import re
from scripts.store_database import *

In [4]:
df = load_data('../data/telegram_data.csv')

In [5]:
df.drop_duplicates(inplace=True)

In [6]:
# Drop rows with missing critical data
df.dropna(subset=['Message'], inplace=True)

# Fill missing values in non-critical columns
df['Media Path'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Media Path'].fillna('Unknown', inplace=True)


In [7]:
# Convert date to datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Standardize text casing
df['Message'] = df['Message'].str.lower()

In [8]:
def remove_emojis(text):
    emoji_pattern = re.compile(
        "[" 
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251" 
        "]+", 
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

# Apply the function to the 'Message' column
df['Message'] = df['Message'].apply(remove_emojis)

# Display the updated DataFrame
print(df.head())

      Channel Title Channel Username   ID  \
0  Doctors Ethiopia       @DoctorsET  864   
1  Doctors Ethiopia       @DoctorsET  863   
2  Doctors Ethiopia       @DoctorsET  862   
3  Doctors Ethiopia       @DoctorsET  861   
4  Doctors Ethiopia       @DoctorsET  860   

                                             Message  \
0  https://youtu.be/5dboem-8kma?si=ldlueecnfuljvd...   
1  ዶክተርስ ኢትዮጵያ በ አዲስ አቀራረብ በ ቴሌቪዥን ፕሮግራሙን ለመጀመር ከ...   
2  ሞት በስኳር \n\nለልጆቻችን የምናሲዘው ምሳቃ ሳናቀው እድሚያቸውን ይቀን...   
3  ከ hiv የተፈወሰ ሰው አጋጥሟችሁ ያቃል ? ፈውስ እና ህክምና ?\n\nሙ...   
4  በቅርብ ጊዜ በሃገራችን ላይ እየተስተዋለ ያለ የተመሳሳይ ፆታ ( homos...   

                       Date Media Path  
0 2023-12-18 17:04:02+00:00    Unknown  
1 2023-11-03 16:14:39+00:00    Unknown  
2 2023-10-02 16:37:39+00:00    Unknown  
3 2023-09-16 07:54:32+00:00    Unknown  
4 2023-09-01 16:16:15+00:00    Unknown  


In [9]:
# Ensure no null values in critical fields
assert df['Message'].isnull().sum() == 0, "Null values found in critical_column"

# Remove duplicate IDs
df.drop_duplicates(subset=['ID'], inplace=True)

# Ensure unique identifiers are unique
assert df['ID'].duplicated().sum() == 0, "Duplicate IDs found"

In [15]:
df.to_csv('../data/cleaned_data.csv', index=False)

In [None]:
insert_data(df)