# 1.0 - EDA

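Initial exploratory data analysis (EDA) notebook. Run the cells in order, top to bottom: later cells reuse variables defined by earlier ones (for example `files`, `df_bharat`, `label_col` and `sample_col`).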

In [None]:
# Cell 1 - imports & setup
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data locations, relative to the current working directory: raw inputs are
# read from DATA_RAW, outputs of this notebook are written to DATA_INTERIM.
DATA_RAW, DATA_INTERIM = (Path(p) for p in ('data/raw', 'data/interim'))
# Create the interim folder (and any missing parents) up front; a no-op if it exists.
DATA_INTERIM.mkdir(parents=True, exist_ok=True)

# Let wide frames display up to 80 columns before pandas truncates them.
pd.options.display.max_columns = 80
print("Setup complete. DATA_RAW:", DATA_RAW.resolve())

In [None]:
# Cell 2 - helper functions
import re
from collections import Counter

# Patterns are compiled once instead of being looked up for every row.
# A URL must start with an explicit scheme (http:// or https://) or with
# "www."; requiring "://" keeps ordinary words such as "httpd" or "https",
# which a bare "http\S+" pattern would delete.
_URL_RE = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
_WHITESPACE_RE = re.compile(r'\s+')


def basic_clean_text(text):
    """Return *text* with URLs removed and whitespace normalised.

    Args:
        text: A scalar cell value. Missing values (``None``, ``NaN``,
            ``pd.NA``) are accepted; any other value is converted with
            ``str()`` before cleaning.

    Returns:
        The cleaned string: URLs stripped, every run of whitespace
        (including newlines and tabs) collapsed to a single space, and
        leading/trailing whitespace removed. Missing values yield ``''``.
    """
    if pd.isna(text):
        return ''
    text = str(text)
    # Drop URLs first so the gaps they leave are collapsed by the next step.
    text = _URL_RE.sub('', text)
    return _WHITESPACE_RE.sub(' ', text).strip()

print("Helper functions loaded.")

In [None]:
# Cell 3 - discover the raw CSV files and take a first look
# Recursively collect every CSV below the raw-data folder.
files = [*DATA_RAW.glob('**/*.csv')]
print('Found raw files:', len(files))
for csv_path in files:
    print('-', csv_path)

# Quick lookup from file stem (name without extension) to its path;
# when two files share a stem, the one listed later wins.
example_paths = {}
for csv_path in files:
    example_paths[csv_path.stem] = csv_path
print("example keys:", list(example_paths)[:10])

In [None]:
# Cell 4 - load a dataset (BharatFakeNewsKosh example)
# Match the file name case-insensitively and sort the hits so the same file is
# picked on every run (directory listing order is not guaranteed).
bf_file = sorted(f for f in files if 'bharat' in f.name.lower())
if bf_file:
    if len(bf_file) > 1:
        # Make the choice visible instead of silently loading an arbitrary match.
        print('multiple Bharat files found; using', bf_file[0])
    df_bharat = pd.read_csv(bf_file[0])
else:
    # Nothing matched in the scanned list: try the default location, and fall
    # back to an empty frame so the following cells can detect "no data".
    p = DATA_RAW / 'BharatFakeNewsKosh.csv'
    df_bharat = pd.read_csv(p) if p.exists() else pd.DataFrame()
print('shape:', df_bharat.shape)
df_bharat.head()

In [None]:
# Cell 5 - quick checks - missing values, label distribution
if df_bharat.shape[0] != 0:
    # Fraction of missing entries in each column.
    for col in df_bharat.columns:
        print(col, '->', df_bharat[col].isna().mean())

    # Take the first conventional label name present in the frame, else None.
    label_candidates = ('label', 'class', 'label_text', 'target')
    label_col = next(
        (name for name in label_candidates if name in df_bharat.columns),
        None,
    )

    print('label_col', label_col)
    if label_col:
        # Class balance, with missing labels counted as their own bucket.
        print(df_bharat[label_col].value_counts(dropna=False))
else:
    print("No bharat dataset loaded; skip checks.")

In [None]:
# Cell 6 - text length distribution and examples
if df_bharat.shape[0] > 0:
    # Candidate free-text columns, picked by name; str() guards against
    # non-string column labels.
    text_markers = ('text', 'content', 'title')
    text_cols = [
        c for c in df_bharat.columns
        if any(marker in str(c).lower() for marker in text_markers)
    ]
    print('text columns:', text_cols)
    # Fall back to the first column when no name looks like a text field.
    sample_col = text_cols[0] if text_cols else df_bharat.columns[0]
    # Pass the raw values straight to basic_clean_text, which maps missing
    # values to ''. Casting with astype(str) first would turn NaN into the
    # literal word 'nan', which would then be counted as a one-word document.
    df_bharat['clean_text'] = df_bharat[sample_col].map(basic_clean_text)
    # Length measured in whitespace-separated tokens.
    df_bharat['len'] = df_bharat['clean_text'].map(lambda s: len(s.split()))
    print(df_bharat['len'].describe())
    # Cap lengths at 500 so a few very long texts do not flatten the histogram.
    plt.figure(figsize=(8, 4))
    plt.hist(df_bharat['len'].clip(upper=500), bins=50)
    plt.title('text length (capped at 500)')
    plt.show()
else:
    print("No data to analyze.")

In [None]:
# Cell 7 - duplicates & save sample processed file
if df_bharat.shape[0] > 0:
    # Share of rows whose source text repeats an earlier row.
    dup_mask = df_bharat.duplicated(subset=[sample_col])
    print('duplicates:', dup_mask.mean())

    # Export the cleaned text and its length, plus the label column when an
    # earlier cell defined one and it is not None.
    cols = ['clean_text', 'len']
    if locals().get('label_col'):
        cols = cols + [label_col]
    out_path = DATA_INTERIM / 'bharat_sample_clean.csv'
    df_bharat[cols].to_csv(out_path, index=False)
    print('wrote', out_path)
else:
    print("No data to save.")

## EDA Notes

- Columns found: ...
- Missing values: ...
- Label distribution: ...
- Next steps: combine datasets, standardize label names, generate balanced train/test splits.