In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# Copied from Assignment 4
PADDING_WORD = "<PAD>"
UNKNOWN = "<UNK>"
def load_glove_embeddings(embedding_file,
                          padding_word=PADDING_WORD, 
                          unknown_word=UNKNOWN):
    """
    Reads Glove embeddings from a file.

    Every non-blank line is expected to contain a word followed by its
    whitespace-separated vector components. Blank lines are skipped, and
    when a word occurs more than once only its first vector is kept, so
    that word ids and embedding rows always stay aligned.

    Parameters:
        embedding_file: path of the embedding text file (read as UTF-8).
        padding_word:   token that receives id 0 and an all-zeros vector.
        unknown_word:   token that receives id 1 and an all-minus-ones vector.

    Returns vector dimensionality, the word_to_id mapping (as a dict),
    and the embeddings (as a list of lists).

    Raises:
        ValueError: if the file contains no embeddings at all.
    """
    # we just read in all the embeddings from the file and index the words in the order we read them
    # starting from 2 bcs we have padding as 0 and unknown as 1
    word_to_id = {}  # Dictionary to store word-to-ID mapping
    word_to_id[padding_word] = 0
    word_to_id[unknown_word] = 1
    vectors = []
    with open(embedding_file, encoding='utf8') as f:
        for line in f:
            data = line.split()
            if not data:
                continue  # blank line: nothing to index
            word = data[0]
            if word in word_to_id:
                # Repeated word (or a clash with a special token): keeping a
                # second vector would shift every later id off its row.
                continue
            word_to_id[word] = len(word_to_id)
            vectors.append([float(x) for x in data[1:]])
    if not vectors:
        raise ValueError(f"No embeddings found in {embedding_file!r}")
    D = len(vectors[0])

    # Rows 0 and 1 belong to the special tokens, the file's words follow from row 2
    embeddings = [[0]*D,    # <PAD> has an embedding of just zeros
                  [-1]*D]   # <UNK> has an embedding of just minus-ones
    embeddings.extend(vectors)

    return D, word_to_id, embeddings



In [4]:
# Load the cleaned question-pair dataset.
# The path uses forward slashes: the previous backslash form depended on "\q"
# being an unrecognised escape sequence and only resolved on Windows.
path = "data/question_pairs_cleaned.csv"
df = pd.read_csv(
    path,
    engine="python",
    quotechar='"',
    sep=",",
    names=['id','qid1','qid2','question1','question2','is_duplicate'],
    header=0,             # the file's own header row is replaced by `names`
    on_bad_lines='skip'   # malformed rows are dropped instead of aborting the load
)
# A pair is only usable when both questions and the label are present
df = df.dropna(subset=['question1','question2','is_duplicate']).copy()
df['is_duplicate'] = df['is_duplicate'].astype(int)


In [5]:
# Forward slashes keep the path portable; the backslash form ("\g") only resolved on Windows.
D, word_to_id, embeddings = load_glove_embeddings("data/glove.6B/glove.6B.50d.txt")

In [6]:
import nltk
from nltk.corpus import stopwords
import numpy as np
# Cache of averaged embeddings, indexed directly by question id.
# It is sized from the largest id in EITHER id column, because parse_row
# indexes it with qid1 as well as qid2.
sentence_2_embed = [None] * (int(max(df["qid1"].max(), df["qid2"].max())) + 1)
def avg_glove_embedding(question):
    """Return the mean GloVe vector of the tokens in `question`.

    The text is lower-cased and split with nltk.word_tokenize. A token that
    is missing from `word_to_id` contributes the UNKNOWN embedding instead.
    Stop words are not removed. A question that produces no tokens yields a
    zero vector of length D.
    """
    words = nltk.word_tokenize(question.lower())
    if not words:
        return np.zeros(D)
    total = np.zeros(D)
    for token in words:
        # out-of-vocabulary tokens fall back to the UNKNOWN entry
        key = token if token in word_to_id else UNKNOWN
        total += embeddings[word_to_id[key]]
    return total / len(words)
def parse_row(row):
    """Return [embedding of question1, embedding of question2] for one row.

    Embeddings are memoised in `sentence_2_embed` under the question's id, so
    a question that appears in several pairs is only embedded once.
    """
    pair = []
    for slot in (1, 2):
        qid = row[f"qid{slot}"]
        vector = sentence_2_embed[qid]
        if vector is None:
            # first time this question id is seen: compute it and remember it
            vector = avg_glove_embedding(row[f"question{slot}"].lower())
            sentence_2_embed[qid] = vector
        pair.append(vector)
    return pair

In [7]:
# Build one sample per question pair: the two averaged GloVe embeddings
# as features and the duplicate flag as the label.
datapoints, labels = [], []
for _, record in df.iterrows():
    q1_vec, q2_vec = parse_row(record)
    datapoints.append([q1_vec, q2_vec])
    labels.append(record["is_duplicate"])


In [None]:
X = np.array(datapoints)           # (n_samples, 2, D)
X_flat = X.reshape(X.shape[0], -1) # (n_samples, 2*D): both question vectors side by side

X_train, X_test, y_train, y_test = train_test_split(
    X_flat, labels, test_size=0.2, stratify=labels, random_state=42 # stratify keeps the same ratio of 0/1 in both train and test
)


# Train a LogisticRegression on the dense averaged-embedding features
clf = LogisticRegression(
    solver='saga',            # stochastic solver, so it needs a seed to be reproducible
    max_iter=500,
    class_weight='balanced',  # reweights the classes to offset the 0/1 imbalance
    random_state=42,          # fixes saga's data shuffling; results now repeat across runs
    verbose=1,
    n_jobs=-1
)
clf.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.


In [None]:
# Size of the fitted model: one weight per input feature plus the intercept term(s)
n_weights, n_biases = clf.coef_.size, clf.intercept_.size
n_params = n_weights + n_biases
print(f"Total number of parameters: {n_params}")

Total number of parameters: 101


Output from our experiments:

D = 300
```
Accuracy: 0.6756425342168824
              precision    recall  f1-score   support

           0       0.78      0.67      0.72     50895
           1       0.55      0.68      0.61     29840

    accuracy                           0.68     80735
   macro avg       0.67      0.68      0.67     80735
weighted avg       0.70      0.68      0.68     80735

Confusion Matrix:
 [[34322 16573]
 [ 9614 20226]]
```

D = 50
```
Accuracy: 0.6285254226791355
              precision    recall  f1-score   support

           0       0.75      0.62      0.68     50895
           1       0.50      0.65      0.56     29840

    accuracy                           0.63     80735
   macro avg       0.62      0.63      0.62     80735
weighted avg       0.66      0.63      0.63     80735

Confusion Matrix:
 [[31453 19442]
 [10549 19291]]
```

In [None]:
# Share of duplicate (label 1) pairs in the training split.
# NOTE: the printed value is a fraction in [0, 1], not a percentage.
positive_share = sum(y_train) / len(y_train)
print(f"Percentage of 1 labels: {positive_share}")

Percentage of 1 labels: 0.36960459780080945


## demo

In [None]:
# Interactive demo: embed two user-supplied questions the same way as the
# training data and report the classifier's probability for each class.
q1 = input("Enter the first question: ")
q2 = input("Enter second question: ")
pair_vectors = [avg_glove_embedding(text) for text in (q1, q2)]
# Lay the two vectors side by side as a single sample of shape (1, 2*D)
avg = np.expand_dims(np.array(pair_vectors).flatten(), axis=0)
prob = clf.predict_proba(avg)[0]
print(f"Predicted class 0 with prob {prob[0]} and class 1 with prob {prob[1]}")

Predicted class 0 with prob 0.8852010131033184 and class 1 with prob 0.11479898689668157
