In [26]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
!pip3 install gensim -q
!pip3 install datasets



In [28]:
from datasets import load_dataset
ds = load_dataset("PiC/phrase_similarity")

In [29]:
ds

DatasetDict({
    train: Dataset({
        features: ['phrase1', 'phrase2', 'sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 7004
    })
    validation: Dataset({
        features: ['phrase1', 'phrase2', 'sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['phrase1', 'phrase2', 'sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 2000
    })
})

In [30]:
train_data = []
train_labels = ds['train']['label']

val_data = []
val_labels = ds['validation']['label']

test_data = []
test_labels = ds['test']['label']

phrase_1 = ds['train']['phrase1']
phrase_2 = ds['train']['phrase2']

val_phrase1 = ds['validation']['phrase1']
val_phrase2 = ds['validation']['phrase2']

test_phrase1 = ds['test']['phrase1']
test_phrase2 = ds['test']['phrase2']

for i in range(len(phrase_1)):
    train_data.append([phrase_1[i], phrase_2[i]])

for i in range(len(val_phrase1)):
    val_data.append([val_phrase1[i], val_phrase2[i]])

for i in range(len(test_phrase1)):
    test_data.append([test_phrase1[i], test_phrase2[i]])

print(len(phrase_1), len(phrase_2), len(train_data))

7004 7004 7004


In [31]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

def filtering_stopwords(data):
    for i in range(len(data)):
        words_1 = word_tokenize(data[i][0])
        words_2 = word_tokenize(data[i][1])
        filtered_1 = [w for w in words_1 if not w.lower() in stop_words]
        filtered_2 = [w for w in words_2 if not w.lower() in stop_words]
        data[i]=[filtered_1,filtered_2]
    return data

train_data=filtering_stopwords(train_data)
val_data=filtering_stopwords(val_data)
test_data=filtering_stopwords(test_data)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [32]:
train_data[0]

[['newly', 'formed', 'camp'], ['recently', 'made', 'encampment']]

In [33]:
val_data[0]

[['first', 'colony'], ['original', 'settlement']]

In [34]:
test_data[0]

[['air', 'position'], ['posture', 'jumping']]

In [35]:
import gensim
fasttext_model_path = "/content/drive/MyDrive/FASTTEXT_embedding_model/wiki-news-300d-1M.vec"
embedding_model = gensim.models.KeyedVectors.load_word2vec_format(fasttext_model_path)

In [36]:
import numpy as np
def generate_embeddings(data, model):
    embeddings = []

    for i in range(len(data)):
        phrase_1 = data[i][0]
        phrase_2 = data[i][1]

        phrase_1_embeddings = []
        for word in phrase_1:
            if word not in model:
              phrase_1_embeddings.append(np.zeros(model.vector_size))
            else:
                phrase_1_embeddings.append(model[word])

        phrase_2_embeddings = []
        for word in phrase_2:
            if word not in model:
                phrase_1_embeddings.append(np.zeros(model.vector_size))
            else:
                phrase_2_embeddings.append(model[word])

        embeddings.append([phrase_1_embeddings, phrase_2_embeddings])

    return embeddings

train_embeddings = generate_embeddings(train_data, embedding_model)
val_embeddings = generate_embeddings(val_data, embedding_model)
test_embeddings = generate_embeddings(test_data, embedding_model)

print(len(train_embeddings),len(train_labels))
print(len(val_embeddings),len(val_embeddings))
print(len(test_embeddings),len(test_labels))

7004 7004
1000 1000
2000 2000


In [37]:
type(train_embeddings[0][0][0])

numpy.ndarray

In [38]:
import numpy as np

def calculate_average_embeddings(embeddings, labels):
    average_embeddings = []
    filtered_labels = []

    for i in range(len(embeddings)):
        phrase1 = embeddings[i][0]
        phrase2 = embeddings[i][1]

        phrase1_average = np.zeros(300,)
        phrase2_average = np.zeros(300,)

        if len(phrase1) > 0:
            for j in range(len(phrase1)):
                phrase1_average += phrase1[j]
            phrase1_average /= len(phrase1)
        else:
            continue

        if len(phrase2) > 0:
            for j in range(len(phrase2)):
                phrase2_average += phrase2[j]
            phrase2_average /= len(phrase2)
        else:
            continue

        filtered_labels.append(labels[i])
        average_embeddings.append([phrase1_average, phrase2_average])

    return average_embeddings, filtered_labels

average_train_embeddings, average_train_labels = calculate_average_embeddings(train_embeddings, train_labels)
average_val_embeddings, average_val_labels = calculate_average_embeddings(val_embeddings, val_labels)
average_test_embeddings, average_test_labels = calculate_average_embeddings(test_embeddings, test_labels)

In [39]:
len(train_embeddings[0])

2

In [40]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

def flatten_embeddings(average_embeddings):
    flattened_embeddings = []
    for pair in average_embeddings:
        flattened_embeddings.append(np.concatenate(pair))
    return flattened_embeddings

train_features = flatten_embeddings(average_train_embeddings)
val_features = flatten_embeddings(average_val_embeddings)
test_features = flatten_embeddings(average_test_embeddings)

svm_classifier = SVC(max_iter=1000)
svm_classifier.fit(train_features, average_train_labels)

val_predictions = svm_classifier.predict(val_features)
test_predictions = svm_classifier.predict(test_features)

val_accuracy = accuracy_score(average_val_labels, val_predictions)
test_accuracy = accuracy_score(average_test_labels, test_predictions)
print("Validation Accuracy: ", val_accuracy)
print("Test Accuracy: ", test_accuracy)



Validation Accuracy:  0.41382765531062127
Test Accuracy:  0.4294294294294294


In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(train_features, average_train_labels)

val_predictions = rf_model.predict(val_features)
val_accuracy = accuracy_score(average_val_labels, val_predictions)
print('Validation Accuracy with Random Forest: ', val_accuracy)

test_predictions = rf_model.predict(test_features)
test_accuracy = accuracy_score(average_test_labels, test_predictions)
print('Test Accuracy with Random Forest: ', test_accuracy)

Validation Accuracy with Random Forest:  0.14729458917835672
Test Accuracy with Random Forest:  0.16016016016016016


In [42]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(train_features, average_train_labels)

val_predictions = lr_model.predict(val_features)
val_accuracy = accuracy_score(average_val_labels, val_predictions)
print('Validation Accuracy with Logistic Regression: ', val_accuracy)

test_predictions = lr_model.predict(test_features)
test_accuracy = accuracy_score(average_test_labels, test_predictions)
print('Test Accuracy with Logistic Regression:', test_accuracy)

Validation Accuracy with Logistic Regression:  0.38877755511022044
Test Accuracy with Logistic Regression: 0.35935935935935936


In [43]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(train_features, average_train_labels)

val_predictions = knn_model.predict(val_features)
val_accuracy = accuracy_score(average_val_labels, val_predictions)
print("Validation Accuracy with KNN:", val_accuracy)

test_predictions = knn_model.predict(test_features)
test_accuracy = accuracy_score(average_test_labels, test_predictions)
print("Test Accuracy with KNN:" ,test_accuracy)

Validation Accuracy with KNN: 0.2755511022044088
Test Accuracy with KNN: 0.2757757757757758


In [44]:
#weighted average section

#storing counts of all the words in weights dictionary

weights = {}
count = 0
for i in range(len(train_data)):
    phrase_1 = train_data[i][0]
    phrase_2 = train_data[i][1]
    for word in phrase_1:
        if word not in weights:
            weights[word] = 1
        else:
            weights[word] += 1
        count+=1
    for word in phrase_2:
        if word not in weights:
            weights[word] = 1
        else:
            weights[word] += 1
        count+=1

for i in range(len(val_data)):
    phrase_1 = val_data[i][0]
    phrase_2 = val_data[i][1]
    for word in phrase_1:
        if word not in weights:
            weights[word] = 1
        else:
            weights[word] += 1
        count+=1
    for word in phrase_2:
        if word not in weights:
            weights[word] = 1
        else:
            weights[word] += 1
        count+=1

for i in range(len(test_data)):
    phrase_1 = test_data[i][0]
    phrase_2 = test_data[i][1]
    for word in phrase_1:
        if word not in weights:
            weights[word] = 1
        else:
            weights[word] += 1
        count+=1
    for word in phrase_2:
        if word not in weights:
            weights[word] = 1
        else:
            weights[word] += 1
        count+=1

for i in weights:
  weights[i] = weights[i]/count

count

42146

In [45]:
weights['newly']

0.0002372704408484791

In [46]:
print(len(train_data), len(train_embeddings))


7004 7004


In [47]:
import numpy as np

def weighted_average_embeddings(embeddings, labels, weights, data):
    weighted_average_embeddings = []
    filtered_labels = []
    for i in range(len(embeddings)):
        phrase1 = embeddings[i][0]
        phrase2 = embeddings[i][1]

        phrase_1_words = data[i][0]
        phrase_2_words = data[i][1]

        phrase1_weighted_average = np.zeros(300,)
        phrase2_weighted_average = np.zeros(300,)

        if len(phrase1) > 0:
            for j in range(len(phrase1)):
                phrase1_weighted_average += (weights[phrase_1_words[j]] * phrase1[j]) #multiplied with weights.
            phrase1_weighted_average /= len(phrase1)
        else:
            continue

        if len(phrase2) > 0:
            for j in range(len(phrase2)):
                phrase2_weighted_average += (weights[phrase_2_words[j]] * phrase2[j]) #multiplied with weights
            phrase2_weighted_average /= len(phrase2)
        else:
            continue

        filtered_labels.append(labels[i])
        weighted_average_embeddings.append([phrase1_weighted_average, phrase2_weighted_average])

    return weighted_average_embeddings, filtered_labels


weighted_average_train_embeddings, weighted_average_train_labels = calculate_average_embeddings(train_embeddings, train_labels)
weighted_average_val_embeddings, weighted_average_val_labels = calculate_average_embeddings(val_embeddings, val_labels)
weighted_average_test_embeddings, weighted_average_test_labels = calculate_average_embeddings(test_embeddings, test_labels)

In [48]:
len(train_embeddings[0])

2

In [49]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

def flatten_embeddings(average_embeddings):
    flattened_embeddings = []
    for pair in average_embeddings:
        flattened_embeddings.append(np.concatenate(pair))
    return flattened_embeddings

weighted_train_features = flatten_embeddings(weighted_average_train_embeddings)
weighted_val_features = flatten_embeddings(weighted_average_val_embeddings)
weighted_test_features = flatten_embeddings(weighted_average_test_embeddings)

svm_classifier = SVC(max_iter=1000)
svm_classifier.fit(weighted_train_features, weighted_average_train_labels)

weighted_val_predictions = svm_classifier.predict(weighted_val_features)
weighted_test_predictions = svm_classifier.predict(weighted_test_features)

weighted_val_accuracy = accuracy_score(weighted_average_val_labels, weighted_val_predictions)
weighted_test_accuracy = accuracy_score(weighted_average_test_labels, weighted_test_predictions)
print("Validation Accuracy: ", weighted_val_accuracy)
print("Test Accuracy: ", weighted_test_accuracy)



Validation Accuracy:  0.41382765531062127
Test Accuracy:  0.4294294294294294


In [51]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(weighted_train_features, weighted_average_train_labels)

weighted_val_predictions = rf_model.predict(weighted_val_features)
val_accuracy = accuracy_score(weighted_average_val_labels, weighted_val_predictions)
print('Validation Accuracy with Random Forest: ', val_accuracy)

weighted_test_predictions = rf_model.predict(weighted_test_features)
test_accuracy = accuracy_score(weighted_average_test_labels, weighted_test_predictions)
print('Test Accuracy with Random Forest: ', test_accuracy)

Validation Accuracy with Random Forest:  0.14729458917835672
Test Accuracy with Random Forest:  0.16016016016016016


In [52]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(weighted_train_features, weighted_average_train_labels)

weighted_val_predictions = lr_model.predict(weighted_val_features)
val_accuracy = accuracy_score(weighted_average_val_labels, weighted_val_predictions)
print('Validation Accuracy with Logistic Regression: ', val_accuracy)

weighted_test_predictions = lr_model.predict(weighted_test_features)
test_accuracy = accuracy_score(weighted_average_test_labels, weighted_test_predictions)
print('Test Accuracy with Logistic Regression:', test_accuracy)

Validation Accuracy with Logistic Regression:  0.38877755511022044
Test Accuracy with Logistic Regression: 0.35935935935935936


In [53]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(weighted_train_features, weighted_average_train_labels)

weighted_val_predictions = knn_model.predict(weighted_val_features)
val_accuracy = accuracy_score(weighted_average_val_labels, weighted_val_predictions)
print("Validation Accuracy with KNN:", val_accuracy)

weighted_test_predictions = knn_model.predict(weighted_test_features)
test_accuracy = accuracy_score(weighted_average_test_labels, weighted_test_predictions)
print("Test Accuracy with KNN:" ,test_accuracy)

Validation Accuracy with KNN: 0.2755511022044088
Test Accuracy with KNN: 0.2757757757757758
