In [2]:
# -----------------------------
# Imports
# -----------------------------
import pandas as pd
import spacy
import pickle

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Use pipeline_utils 
from pipeline_utils import model_pipeline


# -----------------------------
# Load data
# -----------------------------
df = pd.read_csv("data/reviews.csv")

# Optionally work on a reduced sample to keep experiments fast.
# The requested size is capped at the number of available rows because
# DataFrame.sample raises ValueError when n exceeds the population and
# sampling is done without replacement (the default).
df = df.sample(n=min(500, len(df)), random_state=27).reset_index(drop=True)

# 'Recommended IND' is the prediction target; every other column is a feature.
X = df.drop('Recommended IND', axis=1)
y = df['Recommended IND']

# Hold out 10% of the rows for the final evaluation; the fixed seed keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=27
)

# -----------------------------
# Column types
# -----------------------------
# Input columns, grouped by the kind of feature they are passed as to the
# pipeline builder.
numeric_features = [
    'Age',
    'Positive Feedback Count',
]
categorical_features = [
    'Clothing ID',
    'Division Name',
    'Department Name',
    'Class Name',
]
text_features = [
    'Title',
    'Review Text',
]

# -----------------------------
# spaCy model
# -----------------------------
# Small English spaCy model; handed to the pipeline builder below.
nlp = spacy.load("en_core_web_sm")

# -----------------------------
# Build pipeline (from pipeline_utils)
# -----------------------------
# The pipeline itself is constructed by the project helper `model_pipeline`;
# see pipeline_utils for the steps it assembles.
feature_groups = {
    'numeric_features': numeric_features,
    'categorical_features': categorical_features,
    'text_features': text_features,
}
pipeline = model_pipeline(**feature_groups, nlp=nlp)

# -----------------------------
# Hyperparameter tuning
# -----------------------------
# Candidate values the randomized search draws from. The "sgd__" prefix
# targets the parameters of a pipeline step named "sgd" -- presumably the
# classifier defined in pipeline_utils; confirm the step name there.
param_distributions = dict(
    sgd__alpha=[0.0001, 0.001, 0.01, 0.1],
    sgd__tol=[1e-4, 1e-3, 1e-2],
    sgd__loss=['hinge', 'squared_hinge'],
)

# Search configuration: 5 sampled candidates x 3 CV folds = 15 fits, run on
# all available cores. refit=True retrains the best candidate on the whole
# training set so that best_estimator_ is ready for prediction.
search_options = {
    'n_iter': 5,
    'cv': 3,
    'n_jobs': -1,
    'verbose': 2,
    'refit': True,
    'random_state': 27,
}
random_search = RandomizedSearchCV(pipeline, param_distributions, **search_options)

random_search.fit(X_train, y_train)

# Keep the refitted winner for evaluation and export.
best_pipeline = random_search.best_estimator_
print("Best parameters:", random_search.best_params_)

# -----------------------------
# Evaluation
# -----------------------------
# Score the tuned pipeline on the held-out test split.
y_pred = best_pipeline.predict(X_test)

# (label, scorer) pairs in report order; labels are padded so the printed
# values line up in one column.
metric_report = (
    ("Accuracy :", accuracy_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1 Score :", f1_score),
)
for label, metric_fn in metric_report:
    print(label, metric_fn(y_test, y_pred))

# -----------------------------
# Save pipeline for FastAPI
# -----------------------------
# Serialize the fitted pipeline to disk so it can be loaded elsewhere
# (the section header names a FastAPI service as the consumer).
with open("trained_pipeline.pkl", "wb") as model_file:
    pickle.Pickler(model_file).dump(best_pipeline)

print("Pipeline saved to trained_pipeline.pkl")


Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best parameters: {'sgd__tol': 0.001, 'sgd__loss': 'hinge', 'sgd__alpha': 0.001}
Accuracy : 0.86
Precision: 0.8775510204081632
Recall   : 0.9772727272727273
F1 Score : 0.9247311827956989
Pipeline saved to trained_pipeline.pkl
