## TF-IDF + Logistic Regression (LR) Baseline

In [None]:
import os

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report

def run_tfidf_lr(pair_type, field1, field2,
                 max_features=50000, ngram_range=(1,2),
                 root='/content/drive/MyDrive/266NoteBooks/FinalProject/Data/'):
    """
    Train and evaluate a TF-IDF + logistic-regression baseline on one pair dataset.

    Reads ``train.jsonl``, ``val.jsonl`` and ``test.jsonl`` (JSON-lines) from
    ``<root>/<pair_type>/``, joins the two text fields of every row into a
    single document (``field1 + " [SEP] " + field2``), fits a TfidfVectorizer
    and a LogisticRegression on the training split only, and scores the
    validation and test splits.

    Parameters
    ----------
    pair_type : str
        Dataset folder name under ``root`` (e.g. "title-title-pair",
        "body-body-pair", "post-post-pair").
    field1, field2 : str
        Names of the two text columns to combine (e.g. "title1" and "title2",
        "body1" and "body2", "post1" and "post2").
    max_features : int, default 50000
        Vocabulary cap passed to TfidfVectorizer.
    ngram_range : tuple, default (1, 2)
        N-gram range passed to TfidfVectorizer.
    root : str or path-like
        Directory that contains the ``pair_type`` folders. Defaults to the
        project's Google Drive location so existing calls keep working.

    Returns
    -------
    dict
        Keys "val_acc", "val_f1", "test_acc", "test_f1". F1 is computed with
        f1_score's default settings (assumes binary labels; the notebook
        output shows classes 0 and 1).

    Raises
    ------
    KeyError
        If a split is missing ``field1``, ``field2`` or the "label" column.
    """

    print(f"\n Running TF-IDF + LR on: {pair_type}")

    # --------------------------
    # Load data
    # --------------------------
    required = (field1, field2, "label")
    frames = {}
    for split in ("train", "val", "test"):
        path = os.path.join(root, pair_type, f"{split}.jsonl")
        frame = pd.read_json(path, lines=True)
        missing = [col for col in required if col not in frame.columns]
        if missing:
            raise KeyError(f"{path} is missing column(s): {missing}")
        frames[split] = frame

    train, val, test = frames["train"], frames["val"], frames["test"]

    print("Train:", train.shape, "Val:", val.shape, "Test:", test.shape)

    # --------------------------
    # Combine text: field1 + field2
    # --------------------------
    def combine(frame):
        # A missing field would turn the whole concatenation into NaN, which
        # TfidfVectorizer rejects; treat missing text as an empty string.
        left = frame[field1].fillna("").astype(str)
        right = frame[field2].fillna("").astype(str)
        return left + " [SEP] " + right

    train_text = combine(train)
    val_text = combine(val)
    test_text = combine(test)

    # --------------------------
    # TF-IDF vectorizer
    # --------------------------
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
    )

    # Fit the vocabulary / IDF weights on train only; val and test are just transformed.
    X_train = vectorizer.fit_transform(train_text)
    X_val   = vectorizer.transform(val_text)
    X_test  = vectorizer.transform(test_text)

    y_train = train["label"].values
    y_val   = val["label"].values
    y_test  = test["label"].values

    # --------------------------
    # LR Classifier
    # --------------------------
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)

    # --------------------------
    # Evaluate
    # --------------------------
    val_pred = clf.predict(X_val)
    test_pred = clf.predict(X_test)

    metrics = {
        "val_acc": accuracy_score(y_val, val_pred),
        "val_f1": f1_score(y_val, val_pred),
        "test_acc": accuracy_score(y_test, test_pred),
        "test_f1": f1_score(y_test, test_pred),
    }

    print("\nValidation Accuracy:", metrics["val_acc"])
    print("Validation F1:", metrics["val_f1"])
    print("\nTest Accuracy:", metrics["test_acc"])
    print("Test F1:", metrics["test_f1"])

    # Per-class breakdown on the test split
    print("\nClassification Report:")
    print(classification_report(y_test, test_pred))

    return metrics


In [None]:
# TF-IDF + LR baseline on the title-title-pair dataset (columns title1 / title2).
run_tfidf_lr(pair_type="title-title-pair", field1="title1", field2="title2")


 Running TF-IDF + LR on: title-title-pair
Train: (489640, 3) Val: (58976, 3) Test: (59660, 3)

Validation Accuracy: 0.6987927292457949
Validation F1: 0.668173497216722

Test Accuracy: 0.6962286288970835
Test F1: 0.6665317312823155

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.79      0.72     29830
           1       0.74      0.61      0.67     29830

    accuracy                           0.70     59660
   macro avg       0.70      0.70      0.69     59660
weighted avg       0.70      0.70      0.69     59660



{'val_acc': 0.6987927292457949,
 'val_f1': 0.668173497216722,
 'test_acc': 0.6962286288970835,
 'test_f1': 0.6665317312823155}

In [None]:
# TF-IDF + LR baseline on the body-body-pair dataset (columns body1 / body2).
run_tfidf_lr(pair_type="body-body-pair", field1="body1", field2="body2")



 Running TF-IDF + LR on: body-body-pair
Train: (402588, 3) Val: (48390, 3) Test: (49514, 3)

Validation Accuracy: 0.6345112626575738
Validation F1: 0.6073791235625805

Test Accuracy: 0.6426061315991437
Test F1: 0.6158221527506405

Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.71      0.67     24757
           1       0.67      0.57      0.62     24757

    accuracy                           0.64     49514
   macro avg       0.65      0.64      0.64     49514
weighted avg       0.65      0.64      0.64     49514



{'val_acc': 0.6345112626575738,
 'val_f1': 0.6073791235625805,
 'test_acc': 0.6426061315991437,
 'test_f1': 0.6158221527506405}

In [None]:
# TF-IDF + LR baseline on the post-post-pair dataset (columns post1 / post2).
run_tfidf_lr(pair_type="post-post-pair", field1="post1", field2="post2")



 Running TF-IDF + LR on: post-post-pair
Train: (402276, 3) Val: (46504, 3) Test: (52104, 3)

Validation Accuracy: 0.6563306382246689
Validation F1: 0.6355800802626779

Test Accuracy: 0.6640756947643175
Test F1: 0.6482728131342563

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.71      0.68     26052
           1       0.68      0.62      0.65     26052

    accuracy                           0.66     52104
   macro avg       0.67      0.66      0.66     52104
weighted avg       0.67      0.66      0.66     52104



{'val_acc': 0.6563306382246689,
 'val_f1': 0.6355800802626779,
 'test_acc': 0.6640756947643175,
 'test_f1': 0.6482728131342563}