# YouTube Sentiment Analyzer — Full Workflow
This notebook performs exploratory data analysis (EDA), model training, evaluation, and model export.

In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
import joblib

# Use a single default figure size for every plot in this notebook.
plt.rcParams['figure.figsize'] = (8, 5)

# Load the comment dataset; the path is resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer='../../data/youtube/comments.csv')

# Preview the first rows as the cell output.
df.head()

## Class Distribution

In [None]:
# Bar chart of how many rows carry each value of the 'label' column.
(
    df['label']
    .value_counts()
    .plot(kind='bar')
)
plt.title('Sentiment Class Distribution')
plt.xlabel('label')
plt.ylabel('count')
plt.show()

## Length Histogram

In [None]:
# Character length of each comment, stored in a new 'len' column.
# Missing comments are replaced with an empty string first: astype(str) alone
# would turn NaN into the literal text 'nan' and report a bogus length of 3.
df['len'] = df['text'].fillna('').astype(str).str.len()

# Histogram of the comment lengths using 30 bins.
df['len'].plot(kind='hist', bins=30)
plt.title('Comment Length Distribution')
plt.xlabel('length')
plt.ylabel('freq')
plt.show()

## Train/Test Split & Model

In [None]:
# Rows without a comment or a label are excluded before training:
# astype(str) would turn a missing comment into the literal text 'nan' (which
# the vectorizer would then learn as a real token), and a missing label is not
# a valid class for stratification or scoring.
labelled = df.dropna(subset=['text', 'label'])
n_dropped = len(df) - len(labelled)
if n_dropped:
    print(f'Dropped {n_dropped} rows with missing text or label')

X = labelled['text'].astype(str)
y = labelled['label'].astype('category')

# Hold out 20% of the rows for testing, keeping the class proportions of y.
Xtr, Xte, ytr, yte = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# TF-IDF over unigrams and bigrams feeding a linear SVM.
pipe = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=30000, ngram_range=(1, 2))),
    # Fixed seed so repeated runs produce the same model, matching the seeded split.
    ('clf', LinearSVC(random_state=42)),
])

# Hyper-parameters searched with 3-fold cross-validation, ranked by macro F1.
param_grid = {
    'tfidf__max_df': [0.9, 1.0],
    'tfidf__min_df': [1, 3],
    'clf__C': [0.5, 1.0, 2.0],
}
grid = GridSearchCV(pipe, param_grid, scoring='f1_macro', cv=3, n_jobs=-1)
grid.fit(Xtr, ytr)

# Evaluate the refit best pipeline on the held-out test set.
pred = grid.best_estimator_.predict(Xte)
print('Best params:', grid.best_params_)
print(classification_report(yte, pred))

# Pass the label order explicitly so the matrix rows/columns can be read
# unambiguously, even if a class happens to be absent from the test split.
labels = grid.best_estimator_.classes_
print('Confusion matrix (rows = true, columns = predicted), label order:', list(labels))
print(confusion_matrix(yte, pred, labels=labels))

## Export Model

In [None]:
# Persist the best pipeline found by the grid search (vectorizer + classifier).
from pathlib import Path

model_path = '../../models/youtube/model.joblib'

# joblib.dump does not create missing directories, so make sure the target
# folder exists instead of failing after a long training run.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(grid.best_estimator_, model_path)
print(f'Saved {model_path}')