In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, classification_report, confusion_matrix)
import pickle
import warnings
from datetime import datetime
import os

warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

%matplotlib inline

print("✅ All libraries imported successfully!")
print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

✅ All libraries imported successfully!
Timestamp: 2026-01-29 15:54:30


In [2]:
# Read the preprocessed CSV into a DataFrame.
data_path = '../data/processed/preprocessed_data.csv'
df = pd.read_csv(data_path)

# Banner followed by a size / schema summary of what was loaded.
print("=" * 60, "DATA LOADED SUCCESSFULLY", "=" * 60, sep="\n")
print(
    f"Total Records: {len(df):,}",
    f"Columns: {df.columns.tolist()}",
    sep="\n",
)

# Number of records per value of the target column.
print("\nTarget Distribution:")
print(df['health_category'].value_counts())

# First three rows of the cleaned text alongside their labels.
print("\nSample data:")
print(df.loc[:, ['cleaned_text', 'health_category']].head(3))

DATA LOADED SUCCESSFULLY
Total Records: 19,060
Columns: ['id', 'title', 'text', 'cleaned_title', 'cleaned_text', 'combined_text', 'health_category', 'subreddit', 'score', 'num_comments', 'created_utc', 'cleaned_word_count', 'text_cleaned_length', 'avg_word_length', 'sentence_count', 'has_question', 'has_exclamation', 'uppercase_ratio', 'lexical_diversity']

Target Distribution:
health_category
fitness              2892
diet                 2874
sleep                2766
substance_use        2763
mental_health        2715
diabetes             2531
weight_management    2519
Name: count, dtype: int64

Sample data:
                                        cleaned_text health_category
0  bad diabetic culminated day ago avoiding insul...        diabetes
1  curious movie buff sub movie showcase referenc...        diabetes
2  caring someone fragile skin slightest bump res...        diabetes


In [3]:
# Build the model inputs from `df`: X holds one text document per record
# (the 'combined_text' column) and y holds the matching 'health_category' label.
print("="*60)
print("PREPARING FEATURES AND TARGET")
print("="*60)

# pd.read_csv parses empty fields as NaN, so a record whose text is empty in
# the CSV comes back as a float rather than a str. That would break the string
# slicing below and any text vectoriser applied later, so missing text is
# replaced by '' (row count and row order are unchanged). The message is only
# printed when such records actually exist.
n_missing_text = int(df['combined_text'].isna().sum())
if n_missing_text:
    print(f"⚠️ {n_missing_text:,} records have missing combined_text; using empty strings for them")

X = df['combined_text'].fillna('').values
y = df['health_category'].values

print(f"Feature shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Compute the sorted set of labels once and reuse it for both lines.
classes = np.unique(y)
print(f"\nTarget classes: {classes}")
print(f"Number of classes: {len(classes)}")

# Preview at most three records; bounded by the data so a very small frame
# does not raise IndexError.
n_preview = min(3, len(X))
print(f"\nSample texts (first {n_preview}):")
for rank, (text, label) in enumerate(zip(X[:n_preview], y[:n_preview]), start=1):
    print(f"\n{rank}. Category: {label}")
    # Only the first 150 characters are shown to keep the output readable.
    print(f"   Text: {text[:150]}...")

PREPARING FEATURES AND TARGET
Feature shape: (19060,)
Target shape: (19060,)

Target classes: ['diabetes' 'diet' 'fitness' 'mental_health' 'sleep' 'substance_use'
 'weight_management']
Number of classes: 7

Sample texts (first 3):

1. Category: diabetes
   Text: diabetes wound bad diabetic culminated day ago avoiding insulin took metformin jardiance even take correctly well got ingrown hair lower buttcheek mid...

2. Category: diabetes
   Text: movie curious movie buff sub movie showcase reference people noticed even better impression well explained diabetes relevance lot time find done resea...

3. Category: diabetes
   Text: patch caring someone fragile skin slightest bump result bruise friction tear skin looking patch cgm tear skin remove suggestion...
