In [30]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
train_df = pd.read_csv('./data/stsb-en-train.csv', usecols=[0,1,2], names=['sent1', 'sent2', 'score'], header=None)

In [12]:
train_df.head()

Unnamed: 0,sent1,sent2,score
0,A plane is taking off.,An air plane is taking off.,5.0
1,A man is playing a large flute.,A man is playing a flute.,3.8
2,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,3.8
3,Three men are playing chess.,Two men are playing chess.,2.6
4,A man is playing the cello.,A man seated is playing the cello.,4.25


In [13]:
import re
pattern = r'[^\w\s]'

In [14]:
train_df['sent1'] =train_df['sent1'].apply(lambda x: re.sub(pattern, '', x))
train_df['sent2'] =train_df['sent2'].apply(lambda x: re.sub(pattern, '', x))

In [15]:
train_df.head()

Unnamed: 0,sent1,sent2,score
0,A plane is taking off,An air plane is taking off,5.0
1,A man is playing a large flute,A man is playing a flute,3.8
2,A man is spreading shreded cheese on a pizza,A man is spreading shredded cheese on an uncoo...,3.8
3,Three men are playing chess,Two men are playing chess,2.6
4,A man is playing the cello,A man seated is playing the cello,4.25


In [16]:
val_df = pd.read_csv('./data/stsb-en-dev.csv', usecols=[0,1,2], names=['sent1', 'sent2', 'score'], header=None)

In [17]:
val_df.head()

Unnamed: 0,sent1,sent2,score
0,A man with a hard hat is dancing.,A man wearing a hard hat is dancing.,5.0
1,A young child is riding a horse.,A child is riding a horse.,4.75
2,A man is feeding a mouse to a snake.,The man is feeding a mouse to the snake.,5.0
3,A woman is playing the guitar.,A man is playing guitar.,2.4
4,A woman is playing the flute.,A man is playing a flute.,2.75


In [18]:
val_df['sent1'] =val_df['sent1'].apply(lambda x: re.sub(pattern, '', x))
val_df['sent2'] =val_df['sent2'].apply(lambda x: re.sub(pattern, '', x))

In [19]:
val_df.head()

Unnamed: 0,sent1,sent2,score
0,A man with a hard hat is dancing,A man wearing a hard hat is dancing,5.0
1,A young child is riding a horse,A child is riding a horse,4.75
2,A man is feeding a mouse to a snake,The man is feeding a mouse to the snake,5.0
3,A woman is playing the guitar,A man is playing guitar,2.4
4,A woman is playing the flute,A man is playing a flute,2.75


In [70]:
test_df = pd.read_csv('./data/stsb-en-test.csv', usecols=[0,1,2], names=['sent1', 'sent2', 'score'], header=None)

In [71]:
test_df.head()

Unnamed: 0,sent1,sent2,score
0,A girl is styling her hair.,A girl is brushing her hair.,2.5
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,3.6
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,5.0
3,A man is cutting up a cucumber.,A man is slicing a cucumber.,4.2
4,A man is playing a harp.,A man is playing a keyboard.,1.5


In [72]:
test_df['sent1'] =test_df['sent1'].apply(lambda x: re.sub(pattern, '', x))
test_df['sent2'] =test_df['sent2'].apply(lambda x: re.sub(pattern, '', x))

In [20]:
len(train_df)

5749

In [21]:
len(val_df)

1500

In [73]:
len(test_df)

1379

In [22]:
total_sents = list(train_df['sent1'])

In [23]:
total_sents.extend(list(train_df['sent2']))

In [26]:
total_sents[0:5]

['A plane is taking off',
 'A man is playing a large flute',
 'A man is spreading shreded cheese on a pizza',
 'Three men are playing chess',
 'A man is playing the cello']

In [27]:
len(total_sents)

11498

In [28]:
total_sents_df = pd.Series(total_sents)

In [29]:
len(total_sents_df)

11498

In [39]:
vectorizer = TfidfVectorizer()

In [52]:
X_train_sents = vectorizer.fit_transform(total_sents_df)

In [54]:
X_train_sentence1 = vectorizer.transform(train_df['sent1'])
X_train_sentence2 = vectorizer.transform(train_df['sent2'])

In [56]:
X_test_sentence1 = vectorizer.transform(val_df['sent1'])
X_test_sentence2 = vectorizer.transform(val_df['sent2'])

In [59]:
# Combine the sentence vectors for training and testing data
X_train_vectorized = np.hstack((X_train_sentence1.toarray(), X_train_sentence2.toarray()))
X_test_vectorized = np.hstack((X_test_sentence1.toarray(), X_test_sentence2.toarray()))

In [61]:
y_train = train_df['score']
y_test = val_df['score']

In [63]:
model = LinearRegression()
model.fit(X_train_vectorized, y_train)

In [64]:
score = model.score(X_test_vectorized, y_test)
print(f"Model score on testing dataset: {score}")

Model score on testing dataset: -2.935174642127101e+24


In [65]:
model.predict(X_test_vectorized)

array([ 1.78114089e+12,  6.83919270e+11, -4.37458258e+11, ...,
       -2.53443202e+11,  1.29814417e+12, -2.23139544e+12])

In [66]:
# Normalize the target variable to be in the range of 0 to 1
y_train_normalized = (y_train - y_train.min()) / (y_train.max() - y_train.min())

# Train the model on the normalized target variable
model = LinearRegression()
model.fit(X_train_vectorized, y_train_normalized)

In [67]:
# Predict normalized similarity scores for the testing dataset
y_pred_normalized = model.predict(X_test_vectorized)

# Scale the normalized predictions back to the original range of 0 to 5
y_pred = y_pred_normalized * (y_train.max() - y_train.min()) + y_train.min()

# Evaluate the model's performance on the testing dataset
score = model.score(X_test_vectorized, y_test)
print(f"Model score on testing dataset: {score}")

Model score on testing dataset: -1.1740698568508581e+23


In [59]:
from scipy.stats import pearsonr
def pearson_corr(y_true, y_pred):
    """
    Calculate Pearson correlation coefficient between two arrays.
    """
    corr, _ = pearsonr(y_true, y_pred)
    return corr

In [71]:
# Calculate Pearson correlation coefficient between predicted values and target values
corr = pearson_corr(y_test, y_pred)

# Print the correlation coefficient
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: -0.02


## Word2vec - Doc2vec

In [13]:
total_sents[0].split()

['A', 'plane', 'is', 'taking', 'off.']

In [40]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from scipy import spatial

In [32]:
documents = []
for idx, sent in enumerate(total_sents):
    tokens = sent.split()
    documents.append(TaggedDocument(tokens, [idx]))
    
    

In [34]:
documents[0:5]

[TaggedDocument(words=['A', 'plane', 'is', 'taking', 'off'], tags=[0]),
 TaggedDocument(words=['A', 'man', 'is', 'playing', 'a', 'large', 'flute'], tags=[1]),
 TaggedDocument(words=['A', 'man', 'is', 'spreading', 'shreded', 'cheese', 'on', 'a', 'pizza'], tags=[2]),
 TaggedDocument(words=['Three', 'men', 'are', 'playing', 'chess'], tags=[3]),
 TaggedDocument(words=['A', 'man', 'is', 'playing', 'the', 'cello'], tags=[4])]

In [137]:
# Build and train the Doc2Vec model - This needs further fine tuning
model = Doc2Vec(documents, vector_size=50, window=2, min_count=1, workers=4, epochs=10, seed=42)

In [80]:
# Save the trained model to disk
#model.save('doc2vec_model_v1')

# Load the saved model from disk
#model = Doc2Vec.load('doc2vec_model_v1')

In [138]:
model.random.seed(42)

In [139]:
import random
random.seed(42)

In [140]:
train_df['sent1'][0].split()

['A', 'plane', 'is', 'taking', 'off']

In [145]:
# Generate sentence embeddings
embedding1 = model.infer_vector(train_df['sent1'][0].split())
embedding2 = model.infer_vector(train_df['sent2'][0].split())

# Compare the sentence embeddings using cosine similarity
similarity = 1 - spatial.distance.cosine(embedding1, embedding2)

In [146]:
similarity

0.8392412066459656

In [84]:
len(embedding2)

50

In [85]:
len(embedding1)

50

In [86]:
def sts_score(sim_score):
    sts_score = (sim_score+1) * 2.5
    return sts_score

In [87]:
print(sts_score(similarity))

4.778210520744324


In [160]:
train_df['sent1_embedding'] = train_df['sent1'].apply(lambda x: model.infer_vector(x.split()))

In [161]:
train_df['sent2_embedding'] = train_df['sent2'].apply(lambda x: model.infer_vector(x.split()))

In [162]:
train_df['y_pred'] = train_df.apply(lambda x: sts_score(1 - spatial.distance.cosine(x['sent1_embedding'], x['sent2_embedding'])), axis=1)

In [150]:
train_df.head()

Unnamed: 0,sent1,sent2,score,sent1_embedding,sent2_embedding,y_pred
0,A plane is taking off,An air plane is taking off,5.0,"[0.037235305, 0.049875773, -0.0409201, -0.0306...","[0.020289073, -0.005480495, -0.021858064, -0.0...",3.983521
1,A man is playing a large flute,A man is playing a flute,3.8,"[0.013077635, -0.002221671, -0.022697706, 0.00...","[0.03319191, 0.026161613, -0.015830735, 0.0044...",4.4389
2,A man is spreading shreded cheese on a pizza,A man is spreading shredded cheese on an uncoo...,3.8,"[0.005028709, -0.08395849, -0.085317075, -0.00...","[0.02409193, -0.059204135, -0.07484605, -0.011...",4.926448
3,Three men are playing chess,Two men are playing chess,2.6,"[0.0039357822, -0.0368405, -0.05056638, 0.0430...","[-0.0051369094, -0.037136845, -0.05966277, 0.0...",4.858101
4,A man is playing the cello,A man seated is playing the cello,4.25,"[0.019301675, 0.0137491645, -0.02047302, 0.003...","[0.004846013, -0.006223194, -0.024362322, 0.01...",4.200083


In [163]:
y_pred = train_df['y_pred']

In [164]:
y_train = train_df['score']

In [165]:
# Calculate Pearson correlation coefficient between predicted values and target values
corr = pearson_corr(y_train, y_pred)

# Print the correlation coefficient
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.23


In [166]:
val_df['sent1_embedding'] = val_df['sent1'].apply(lambda x: model.infer_vector(x.split()))
val_df['sent2_embedding'] = val_df['sent2'].apply(lambda x: model.infer_vector(x.split()))

In [167]:
val_df['y_pred'] = val_df.apply(lambda x: sts_score(1 - spatial.distance.cosine(x['sent1_embedding'], x['sent2_embedding'])), axis=1)

In [168]:
val_df.head()

Unnamed: 0,sent1,sent2,score,sent1_embedding,sent2_embedding,y_pred
0,A man with a hard hat is dancing,A man wearing a hard hat is dancing,5.0,"[0.029126052, 0.0043316777, -0.03920573, -0.00...","[0.039069068, 0.018714555, -0.027640227, -0.00...",4.893406
1,A young child is riding a horse,A child is riding a horse,4.75,"[0.02854366, 0.03102283, -0.022809574, -0.0367...","[0.02017651, -0.012082314, -0.012686647, 0.001...",4.090671
2,A man is feeding a mouse to a snake,The man is feeding a mouse to the snake,5.0,"[0.009256553, -0.03530969, -0.04006245, -0.016...","[-0.009849704, -0.046917006, -0.043530017, -0....",4.692041
3,A woman is playing the guitar,A man is playing guitar,2.4,"[0.001357124, 0.0037358047, -0.01523715, -0.00...","[0.0008167301, -0.0063778735, -0.009220612, 0....",4.381824
4,A woman is playing the flute,A man is playing a flute,2.75,"[0.0050487723, -0.0055567487, -0.0055881646, -...","[0.023505239, 0.0069387676, -0.01638708, 0.012...",4.481647


In [169]:
y_pred = val_df['y_pred']
y_val = val_df['score']

In [170]:
# Calculate Pearson correlation coefficient between predicted values and target values
corr = pearson_corr(y_val, y_pred)

# Print the correlation coefficient
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.19


In [186]:
test_df['sent1_embedding'] = test_df['sent1'].apply(lambda x: model.infer_vector(x.split()))
test_df['sent2_embedding'] = test_df['sent2'].apply(lambda x: model.infer_vector(x.split()))

In [187]:
test_df['y_pred'] = test_df.apply(lambda x: sts_score(1 - spatial.distance.cosine(x['sent1_embedding'], x['sent2_embedding'])), axis=1)

In [188]:
test_df.head()

Unnamed: 0,sent1,sent2,score,sent1_embedding,sent2_embedding,y_pred
0,A girl is styling her hair,A girl is brushing her hair,2.5,"[-0.0020142705, -0.019337067, -0.032497767, -0...","[-0.006094001, -0.014797223, -0.040190816, -0....",4.781613
1,A group of men play soccer on the beach,A group of boys are playing soccer on the beach,3.6,"[0.07079234, 0.030786736, -0.057997897, 0.0210...","[0.07462172, 0.073791735, -0.044912428, 0.0238...",4.825599
2,One woman is measuring another womans ankle,A woman measures another womans ankle,5.0,"[-0.0053328946, -0.017211435, 0.004524707, 0.0...","[-0.0038811609, -0.023219023, -0.045904495, -0...",3.665416
3,A man is cutting up a cucumber,A man is slicing a cucumber,4.2,"[0.025645258, 0.04911593, -0.01340625, -0.0116...","[0.006795717, 0.0065430827, -0.019682791, -0.0...",4.536721
4,A man is playing a harp,A man is playing a keyboard,1.5,"[-0.00018151841, 0.01334669, -0.015150716, 0.0...","[0.008341481, -0.003067909, -0.009228635, -0.0...",4.300284


In [189]:
y_pred = test_df['y_pred']
y_test = test_df['score']

In [190]:
# Calculate Pearson correlation coefficient between predicted values and target values
corr = pearson_corr(y_test, y_pred)

# Print the correlation coefficient
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.15
