In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the dataset path read later in
# this notebook lives underneath this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [1]:
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd

# 1. Load the question-pair CSV (assumed to be already cleaned on disk).
path = '/content/drive/My Drive/DD2417/question_pairs_cleaned.csv'

df = (
    pd.read_csv(
        path,
        sep=",",
        quotechar='"',
        engine="python",
        # The file's own header row (header=0) is replaced by these names.
        header=0,
        names=['id','qid1','qid2','question1','question2','is_duplicate'],
        # Lines the parser flags as bad are dropped instead of raising.
        on_bad_lines='skip',
    )
    # Keep only rows that have both questions and a label.
    .dropna(subset=['question1','question2','is_duplicate'])
    # Store the label as an integer column (assign returns a fresh frame).
    .assign(is_duplicate=lambda frame: frame['is_duplicate'].astype(int))
)

# 2. Learn one shared TF-IDF vocabulary from both question columns.
#    NOTE(review): the vectorizer is fit on every row, i.e. before the
#    train/test split further down, so held-out questions also contribute
#    to the vocabulary and IDF weights.
all_q = pd.concat([df['question1'], df['question2']])
tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1,2),     # unigrams and bigrams
    max_features=10000,
).fit(all_q)

# 3. Encode each side of the pair with that same fitted vocabulary.
X1, X2 = (tfidf.transform(df[side]) for side in ('question1', 'question2'))

# 4. Row-wise dot product of the two TF-IDF vectors as a similarity feature.
#    The vectorizer above is built with the default norm ('l2'), so every
#    non-empty row has unit length and this dot product is the cosine
#    similarity itself; a row with no in-vocabulary terms simply gives 0.
#    Note that `.sum(axis=1)` returns a dense (n_samples, 1) column, not a
#    sparse one.
sim = X1.multiply(X2).sum(axis=1)
# Wrap the dense column in CSR so it can be stacked with the sparse blocks.
sim_sparse = csr_matrix(sim)
assert sim_sparse.shape == (X1.shape[0], 1), \
    "similarity feature must be exactly one column per question pair"

# 5. Stack everything into one big sparse feature matrix
#    [ TF-IDF(q1) | TF-IDF(q2) | similarity ]
#    hstack returns COO unless a format is given, and COO cannot be indexed
#    by row. The rows of X are sliced later (train/test split, CV folds), so
#    request CSR up front.
X = hstack([X1, X2, sim_sparse], format='csr')
assert X.shape == (X1.shape[0], X1.shape[1] + X2.shape[1] + 1)

# 6. Hold out 20% of the pairs for testing. Stratifying on the label keeps
#    the duplicate / non-duplicate ratio the same in both parts, and the
#    fixed seed makes the split repeatable.
y = df['is_duplicate'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=0.2,
)

# 7. Train a LogisticRegression on the sparse matrix
clf = LogisticRegression(
    solver='saga',            # sparse-friendly solver
    max_iter=500,
    class_weight='balanced',  # weight classes inversely to their frequency
    random_state=42,          # saga shuffles the data; seed it so runs repeat
    verbose=1,
    n_jobs=-1
)
clf.fit(X_train, y_train)

# 8. Score the held-out split: overall accuracy, the per-class
#    precision / recall / F1 table, and the raw confusion counts.
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print(per_class_report)
print("Confusion Matrix:\n", confusion)



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


convergence after 24 epochs took 8 seconds
Accuracy: 0.7710410602588716
              precision    recall  f1-score   support

           0       0.85      0.77      0.81     50895
           1       0.66      0.77      0.71     29840

    accuracy                           0.77     80735
   macro avg       0.76      0.77      0.76     80735
weighted avg       0.78      0.77      0.77     80735

Confusion Matrix:
 [[39310 11585]
 [ 6900 22940]]


In [3]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
# Stratified, shuffled 5-fold cross-validation of the classifier over the
# full feature matrix, scored with F1. The seed fixes the fold assignment.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = cross_val_score(estimator=clf, X=X, y=y, scoring='f1', cv=cv)

mean_f1 = f1_scores.mean()
print(f"\n5-fold CV F1 scores: {f1_scores}")
print("Mean F1 score:", mean_f1)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


convergence after 24 epochs took 12 seconds


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


convergence after 24 epochs took 10 seconds


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


convergence after 26 epochs took 9 seconds


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


convergence after 24 epochs took 8 seconds


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


convergence after 23 epochs took 8 seconds

5-fold CV F1 scores: [0.7104657  0.71266196 0.70931538 0.71505501 0.71092562]
Mean F1 score: 0.711684733151533
