In [None]:
import json
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob

# Load all raw JSON files.
# glob returns matches in arbitrary order, so sort to make the row order of
# the resulting DataFrame (and the saved CSV) reproducible between runs.
json_files = sorted(glob("data/raw/**/*.json", recursive=True))

all_messages = []

for file in json_files:
    with open(file, 'r', encoding='utf-8') as f:
        messages = json.load(f)
    # The file handle is no longer needed once the payload is parsed.
    for msg in messages:
        all_messages.append({
            "channel": msg.get("channel"),
            "date": msg.get("date"),
            # `.get("message", "")` still returns None when the key exists
            # with a null value; normalise that to an empty string so the
            # text column is uniformly str and text_length is 0, not NaN.
            "text": msg.get("message") or "",
            "has_media": bool(msg.get("media")),
        })

# Passing the column names explicitly keeps the schema stable even when no
# messages were found (otherwise the frame has no columns and the column
# lookups below raise KeyError).
df = pd.DataFrame(
    all_messages,
    columns=["channel", "date", "text", "has_media"],
)

# Convert dates
df['date'] = pd.to_datetime(df['date'])

# Analysis: number of characters in each message text.
df['text_length'] = df['text'].str.len()

# Plot message count by channel, most active channel first.
# Skipped when there is nothing to plot.
if not df.empty:
    plt.figure(figsize=(10, 6))
    sns.countplot(
        data=df,
        y='channel',
        order=df['channel'].value_counts().index,
    )
    plt.title("Message Count by Channel")
    plt.show()

# Save for downstream.
# Create the target directory first: to_csv does not create missing parents.
output_path = "data/processed/messages_flat.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
