In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import joblib
import os

# Setup
# Target columns of the multi-label task; one independent binary classifier
# is trained per label further down.
LABELS = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# Output directory for the serialized artifacts (relative to the working dir).
os.makedirs("models", exist_ok=True)

# Load your data
df = pd.read_csv("../data/train.csv")
# pandas reads empty CSV cells as NaN, and TfidfVectorizer raises a ValueError
# on NaN documents; treat missing comments as empty strings so fitting does
# not crash on a single blank row.
X = df["comment_text"].fillna("")
y = df[LABELS]

# Fit TF-IDF
# Vocabulary is capped at 10,000 terms and English stop words are dropped.
vectorizer = TfidfVectorizer(max_features=10000, stop_words="english")
X_tfidf = vectorizer.fit_transform(X)

# Fit Logistic Regression models
# Maps each label name to its fitted classifier; every model is trained on
# the same TF-IDF matrix against that label's column of `y`.
logreg_models = {}
for label in LABELS:
    # max_iter is raised to 1000 to give the solver room to converge.
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_tfidf, y[label])
    logreg_models[label] = clf

# Save the fitted objects
# The vectorizer must be persisted alongside the models: inference has to
# transform text with the same fitted vocabulary the classifiers were
# trained on.
joblib.dump(vectorizer, "models/tfidf_vectorizer.pkl")
joblib.dump(logreg_models, "models/logreg_model.pkl")


['models/logreg_model.pkl']