In [60]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [34]:
train_df = pd.read_csv('./data/stsb-en-train.csv', usecols=[0,1,2], names=['sent1', 'sent2', 'score'], header=None)

In [35]:
train_df.head()

Unnamed: 0,sent1,sent2,score
0,A plane is taking off.,An air plane is taking off.,5.0
1,A man is playing a large flute.,A man is playing a flute.,3.8
2,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,3.8
3,Three men are playing chess.,Two men are playing chess.,2.6
4,A man is playing the cello.,A man seated is playing the cello.,4.25


In [36]:
val_df = pd.read_csv('./data/stsb-en-dev.csv', usecols=[0,1,2], names=['sent1', 'sent2', 'score'], header=None)

In [37]:
val_df.head()

Unnamed: 0,sent1,sent2,score
0,A man with a hard hat is dancing.,A man wearing a hard hat is dancing.,5.0
1,A young child is riding a horse.,A child is riding a horse.,4.75
2,A man is feeding a mouse to a snake.,The man is feeding a mouse to the snake.,5.0
3,A woman is playing the guitar.,A man is playing guitar.,2.4
4,A woman is playing the flute.,A man is playing a flute.,2.75


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [39]:
vectorizer = TfidfVectorizer()

In [40]:
len(train_df)

5749

In [41]:
len(val_df)

1500

In [42]:
total_sents = list(train_df['sent1'])

In [43]:
total_sents.extend(list(train_df['sent2']))

In [44]:
total_sents

['A plane is taking off.',
 'A man is playing a large flute.',
 'A man is spreading shreded cheese on a pizza.',
 'Three men are playing chess.',
 'A man is playing the cello.',
 'Some men are fighting.',
 'A man is smoking.',
 'The man is playing the piano.',
 'A man is playing on a guitar and singing.',
 'A person is throwing a cat on to the ceiling.',
 'The man hit the other man with a stick.',
 'A woman picks up and holds a baby kangaroo.',
 'A man is playing a flute.',
 'A person is folding a piece of paper.',
 'A man is running on the road.',
 'A dog is trying to get bacon off his back.',
 'The polar bear is sliding on the snow.',
 'A woman is writing.',
 "A cat is rubbing against baby's face.",
 'The man is riding a horse.',
 'A man pours oil into a pot.',
 'A man is playing a guitar.',
 'A panda is sliding down a slide.',
 'A woman is eating something.',
 'A woman peels a potato.',
 'The boy fell off his bike.',
 'The woman is playing the flute.',
 'A rabbit is running from an 

In [45]:
len(total_sents)

11498

In [46]:
type(train_df['sent1'])

pandas.core.series.Series

In [49]:
total_sents_df = pd.Series(total_sents)

In [51]:
len(total_sents_df)

11498

In [52]:
X_train_sents = vectorizer.fit_transform(total_sents_df)

In [54]:
X_train_sentence1 = vectorizer.transform(train_df['sent1'])
X_train_sentence2 = vectorizer.transform(train_df['sent2'])

In [56]:
X_test_sentence1 = vectorizer.transform(val_df['sent1'])
X_test_sentence2 = vectorizer.transform(val_df['sent2'])

In [59]:
# Combine the sentence vectors for training and testing data
X_train_vectorized = np.hstack((X_train_sentence1.toarray(), X_train_sentence2.toarray()))
X_test_vectorized = np.hstack((X_test_sentence1.toarray(), X_test_sentence2.toarray()))

In [61]:
y_train = train_df['score']
y_test = val_df['score']

In [63]:
model = LinearRegression()
model.fit(X_train_vectorized, y_train)

In [64]:
score = model.score(X_test_vectorized, y_test)
print(f"Model score on testing dataset: {score}")

Model score on testing dataset: -2.935174642127101e+24


In [65]:
model.predict(X_test_vectorized)

array([ 1.78114089e+12,  6.83919270e+11, -4.37458258e+11, ...,
       -2.53443202e+11,  1.29814417e+12, -2.23139544e+12])

In [66]:
# Normalize the target variable to be in the range of 0 to 1
y_train_normalized = (y_train - y_train.min()) / (y_train.max() - y_train.min())

# Train the model on the normalized target variable
model = LinearRegression()
model.fit(X_train_vectorized, y_train_normalized)

In [67]:
# Predict normalized similarity scores for the testing dataset
y_pred_normalized = model.predict(X_test_vectorized)

# Scale the normalized predictions back to the original range of 0 to 5
y_pred = y_pred_normalized * (y_train.max() - y_train.min()) + y_train.min()

# Evaluate the model's performance on the testing dataset
score = model.score(X_test_vectorized, y_test)
print(f"Model score on testing dataset: {score}")

Model score on testing dataset: -1.1740698568508581e+23


In [70]:
from scipy.stats import pearsonr
def pearson_corr(y_true, y_pred):
    """
    Calculate Pearson correlation coefficient between two arrays.
    """
    corr, _ = pearsonr(y_true, y_pred)
    return corr

In [71]:
# Calculate Pearson correlation coefficient between predicted values and target values
corr = pearson_corr(y_test, y_pred)

# Print the correlation coefficient
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: -0.02
