## Data Cleaning and Transformation

* Data Cleaning

In [1]:
import pandas as pd
import logging

# Configure logging
logging.basicConfig(filename='cleaning.log', level=logging.INFO, format='%(asctime)s - %(message)s')

# Load raw data
data = pd.read_json('scraped_data.json')

# Remove duplicates
data.drop_duplicates(inplace=True)

# Handle missing values
data.fillna(method='ffill', inplace=True)

# Standardize formats (example: converting all text to lowercase)
data['message'] = data['message'].str.lower()

# Data validation (example: ensuring no empty messages)
data = data[data['message'].str.strip() != '']

# Save cleaned data
data.to_json('cleaned_data.json', orient='records')

logging.info('Data cleaning completed successfully.')

  data.fillna(method='ffill', inplace=True)


* Store Cleaned Data

In [3]:
import sqlite3

# Connect to SQLite database
conn = sqlite3.connect('telegramscrap.db')
c = conn.cursor()

# Create table
c.execute('''
CREATE TABLE IF NOT EXISTS messages (
    id INTEGER PRIMARY KEY,
    channel TEXT,
    message TEXT
)
''')

# Insert cleaned data
data = pd.read_json('cleaned_data.json')
data.to_sql('messages', conn, if_exists='replace', index=False)

conn.commit()
conn.close()

logging.info('Cleaned data stored in database successfully.')