In [None]:
# emoji_sentiment_analysis\notebooks\model_training.ipynb

import sys
from pathlib import Path

# Make the project package importable from this notebook.
# The parent of the current working directory is taken as the project root
# (assumes the notebook is run from the notebooks/ folder - TODO confirm).
project_root = Path().resolve().parent
_root_entry = str(project_root)
if _root_entry not in sys.path:
    # Prepend so this checkout wins over any other copy found on the path.
    sys.path.insert(0, _root_entry)

import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from emoji_sentiment_analysis.config import PROCESSED_DATA_DIR, MODELS_DIR, TEXT_COL, TARGET_COL
import re
from loguru import logger

# --- Model Training ---

# 1. Load the processed data
# The CSV is read from PROCESSED_DATA_DIR; a missing file is logged and the
# FileNotFoundError is re-raised so the run stops here.
processed_data_path = PROCESSED_DATA_DIR / "1k_data_processed.csv"
try:
    df = pd.read_csv(processed_data_path)
except FileNotFoundError:
    logger.error(f"File not found at {processed_data_path}. Please run the dataset script first.")
    raise
else:
    # Only reached when the read succeeded.
    logger.info(f"Successfully loaded processed data from {processed_data_path}")

# 2. Extract emojis from text
# The pattern is deliberately the same one the app uses, so training and
# inference extract the same emoji runs. Keep both copies in sync.
# NOTE(review): the last range (U+24C2..U+1F251) is much wider than emoji; it
# also covers e.g. CJK ideographs and already contains U+2702..U+27B0.
# Confirm whether that is intended before narrowing it (here AND in the app).
EMOJI_PATTERN = re.compile(
    r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0\U000024C2-\U0001F251]+',
    flags=re.UNICODE
)


def extract_emojis(text):
    """Return every run of characters matched by EMOJI_PATTERN in *text*.

    The input is passed through ``str()`` first, so non-string values
    (numbers, ``None``, NaN) are accepted. Consecutive matching characters
    form one run; runs are joined with a single space. When nothing matches,
    the empty string is returned.
    """
    emoji_runs = EMOJI_PATTERN.findall(str(text))
    return " ".join(emoji_runs)

# Emoji-only view of each row, produced by extract_emojis.
emoji_column = df[TEXT_COL].apply(extract_emojis)
df['emojis'] = emoji_column

# Model input: the text (coerced to str) followed by a space and its
# extracted emojis.
text_as_str = df[TEXT_COL].astype(str)
df['text_and_emojis'] = text_as_str + " " + df['emojis']

# 3. Split data into training and testing sets
# 80/20 split with a fixed seed so repeated runs produce the same partition.
X, y = df['text_and_emojis'], df[TARGET_COL]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
logger.info("Data split into training and testing sets.")

# 4. Train the TF-IDF Vectorizer
# TfidfVectorizer's default token_pattern, r"(?u)\b\w\w+\b", keeps only runs
# of two or more word characters. Emoji are not word characters, so with the
# default every emoji - including the ones appended in 'text_and_emojis' - is
# dropped during tokenisation and never reaches the model.
# The pattern is therefore extended with the emoji character class so that
# each matching character becomes its own token. Word tokens are unchanged
# because the word alternative is tried first.
# A plain pattern string (not a callable tokenizer) is used so the fitted
# vectorizer can be unpickled without this notebook's namespace.
# No capturing group is used: scikit-learn would otherwise return the group
# content instead of the whole match.
emoji_char_class = EMOJI_PATTERN.pattern.rstrip("+")  # drop the run quantifier -> one token per emoji
vectorizer = TfidfVectorizer(token_pattern=rf"(?u)\b\w\w+\b|{emoji_char_class}")
X_train_vectorized = vectorizer.fit_transform(X_train)
logger.info("TF-IDF vectorizer trained successfully.")

# 5. Train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)
logger.info("Logistic Regression model trained successfully.")

# Score the model on the held-out split, which was otherwise created and
# never used, so a quality figure is logged before the artifacts are saved.
# The test texts go through the already-fitted vectorizer (transform only,
# no refit) to keep the feature space identical to training.
X_test_vectorized = vectorizer.transform(X_test)
test_accuracy = model.score(X_test_vectorized, y_test)  # mean accuracy
logger.info(f"Held-out accuracy on {len(y_test)} test samples: {test_accuracy:.4f}")

# 6. Save the trained model and vectorizer
# Both artifacts are written into MODELS_DIR (created if missing), model
# first and vectorizer second.
MODELS_DIR.mkdir(parents=True, exist_ok=True)
for artifact, filename in (
    (model, "sentiment_model.pkl"),
    (vectorizer, "tfidf_vectorizer.pkl"),
):
    joblib.dump(artifact, MODELS_DIR / filename)
logger.success(f"Trained model and vectorizer saved to {MODELS_DIR}")