**Baseline model**

Extracts only basic features (jaccard similarity)

Trains a Logistic Regression model on these basic features

In [1]:
import os
import joblib
from utils import (
    load_and_split_data,
    extract_basic_features,
    train_logistic_model
)

import spacy
from utils import nlp as utils_nlp

# Make sure the spaCy model "en_core_web_md" is available and hand the loaded
# pipeline to the utils module. The model is fetched here at runtime because
# it could not be listed in environment.yml.
try:
    nlp = spacy.load("en_core_web_md")
except OSError:
    # spacy.load raised OSError (model package presumably not installed yet):
    # download it once, then retry the load.
    print("Downloading SpaCy model 'en_core_web_md'...")
    from spacy.cli import download
    download("en_core_web_md")
    nlp = spacy.load("en_core_web_md")

# Rebind the `nlp` attribute on the utils module itself; the `utils_nlp` alias
# imported above is a separate name and is not affected by this assignment.
# Presumably the feature extractors in utils read this module-level `nlp` — verify.
import utils
utils.nlp = nlp


# --- Data: load the three splits (only train and validation are used below) ---

train_df, val_df, test_df = load_and_split_data()


# --- Features: basic feature set for the train and validation splits ---

X_train = extract_basic_features(train_df)
X_val = extract_basic_features(val_df)

# Labels come straight from the 'is_duplicate' column of each split.
y_train, y_val = train_df['is_duplicate'], val_df['is_duplicate']


# --- Model: fit the logistic baseline on the training split only ---

model = train_logistic_model(X_train, y_train)


# Persist the baseline model.
# An existing file at model_path is left untouched, so the model trained in
# this run is written only when no earlier artifact is present.

model_path = "models/logistic_basic.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
if os.path.exists(model_path):
    print(f"Model already exists at {model_path} , not saving")
else:
    joblib.dump(model, model_path)
    print(f"Model saved to {model_path}")


Downloading SpaCy model 'en_core_web_md'...
Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Model saved to models/logistic_basic.pkl


------

**Improved baseline model**

Uses the full extract_improved_features() with all 7 features: 
jaccard, len_diff, tfidf_cosine, levenshtein, shared_bigrams, avg_word_len_diff, spacy_cosine

Trains another Logistic Regression model, but now including the full feature set

In [2]:
import os
import joblib
from utils import (
    load_and_split_data,
    extract_improved_features,
    train_logistic_model
)


# --- Data: load the three splits (only train and validation are used below) ---

train_df, val_df, test_df = load_and_split_data()


# --- Features: improved feature set ---
# The training call also returns a TF-IDF vectorizer; that same object is
# passed back in for the validation split (presumably so it is not refit there).

X_train, tfidf_vectorizer = extract_improved_features(train_df)
X_val, _ = extract_improved_features(val_df, tfidf_vectorizer)

# Labels come straight from the 'is_duplicate' column of each split.
y_train, y_val = train_df['is_duplicate'], val_df['is_duplicate']


# --- Model: fit the logistic model on the improved training features ---

model = train_logistic_model(X_train, y_train)


# Persist the improved model and the TF-IDF vectorizer it was trained with.
# Each artifact is written only if its target file does not exist yet;
# an existing file is reported and left as it is.

model_path = "models/logistic_improved.pkl"
vectorizer_path = "models/tfidf_vectorizer.pkl"

os.makedirs("models", exist_ok=True)

if os.path.exists(model_path):
    print(f"Model already exists at {model_path}..not saving.")
else:
    joblib.dump(model, model_path)
    print(f"Model saved to {model_path}")

if os.path.exists(vectorizer_path):
    print(f"Vectorizer already exists at {vectorizer_path}..not saving.")
else:
    joblib.dump(tfidf_vectorizer, vectorizer_path)
    print(f"TF-IDF Vectorizer saved to {vectorizer_path}")


Model saved to models/logistic_improved.pkl
TF-IDF Vectorizer saved to models/tfidf_vectorizer.pkl


---------
**Grid Search for Best Models (Logistic + Random Forest)**
 
Extracts the same 7 improved features

Defines hyperparameter grids for: 
- LogisticRegression (C=[0.1, 1, 10])
- RandomForestClassifier (n_estimators= [50, 100], max_depth=[None, 10])

Uses GridSearchCV with 3-fold CV to find the best hyperparameters for each model (based on ROC AUC)

Saves both best models and the TF-IDF vectorizer

In [3]:
import os
import joblib
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from utils import (
    load_and_split_data,
    extract_improved_features
)


# Load Data
# Only the training split is used in this cell; GridSearchCV does its own
# cross-validation on it.

train_df, val_df, test_df = load_and_split_data()
X_train, tfidf_vectorizer = extract_improved_features(train_df)
y_train = train_df['is_duplicate']


# Define Classifiers and Grid
# Each entry pairs an unfitted estimator with its hyperparameter grid.
# Grid keys carry the "classifier__" prefix because the estimator is wrapped
# in a Pipeline step named "classifier" during the search.

models = {
    "logistic": {
        "model": LogisticRegression(max_iter=200),
        "params": {
            "classifier__C": [0.1, 1, 10]
        }
    },
    "random_forest": {
        # Fixed seed: without random_state the forest's bootstrapping and
        # feature sampling differ between runs, so the CV scores and the
        # saved model would not be reproducible.
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "classifier__n_estimators": [50, 100],
            "classifier__max_depth": [None, 10]
        }
    }
}


# Grid search
# For every configured classifier: run a 3-fold, ROC-AUC-scored grid search
# on the training features and keep the best estimator found.

best_models = {}

for name, entry in models.items():
    print(f"\nSearching for best {name} model.")

    # Single-step pipeline; the step name "classifier" is what the
    # "classifier__*" keys of the parameter grid refer to.
    pipe = Pipeline(steps=[("classifier", entry["model"])])

    grid = GridSearchCV(
        estimator=pipe,
        param_grid=entry["params"],
        scoring='roc_auc',
        cv=3,
        n_jobs=-1,
    ).fit(X_train, y_train)

    print(f"Best AUC for {name}: {grid.best_score_:.4f}")
    print(f"Best params: {grid.best_params_}")
    best_models[name] = grid.best_estimator_


# Save the best estimator of each search.
# Unlike the save steps of the earlier cells, nothing here checks for an
# existing file: every dump overwrites whatever is already at the target path.

os.makedirs("models", exist_ok=True)

for name in best_models:
    model = best_models[name]
    model_path = f"models/{name}_model.pkl"
    joblib.dump(model, model_path)
    print(f"Saved {name} model to {model_path}")

# The TF-IDF vectorizer is shared by both models, so it is written once.
vectorizer_path = "models/tfidf_vectorizer.pkl"
joblib.dump(tfidf_vectorizer, vectorizer_path)
print(f"Saved TF-IDF vectorizer to {vectorizer_path}")



Searching for best logistic model.
Best AUC for logistic: 0.7763
Best params: {'classifier__C': 10}

Searching for best random_forest model.
Best AUC for random_forest: 0.8174
Best params: {'classifier__max_depth': None, 'classifier__n_estimators': 100}
Saved logistic model to models/logistic_model.pkl
Saved random_forest model to models/random_forest_model.pkl
Saved TF-IDF vectorizer to models/tfidf_vectorizer.pkl
