## Installing Sentence Transsformer and other models/frameworks

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install sentence_transformers

# Kindly add all your installations and versions if any in this cell.

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Importing necessary libraries. 
In the final version all imports should be stricly enlisted here.

In [3]:
import pandas as pd
import numpy as np
import spacy
from scipy import stats
from sklearn import linear_model
from scipy.spatial import distance

from sentence_transformers import SentenceTransformer, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
from transformers import AutoTokenizer, AutoModel

import torch 
from torch.utils.data import DataLoader

import requests
import tarfile
import csv

import gensim.downloader

## Load dataset: 7 marks
1 Download and unzip the dataset from this link http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz  **1 mark**

2 Complete the code in `read_sts_csv()`. **4.5 marks**

3 Create 3 dataframes one each for train, test and val and print their final shapes. **1.5 marks**

In [4]:
INPUT_PATH = "/content/drive/MyDrive/Sem-7_NLP/stsbenchmark/"

In [5]:
def download_and_unzip(url, path):
  response = requests.get(url, stream=True)
  file = tarfile.open(fileobj=response.raw, mode="r|gz")
  file.extractall(path=path)

In [6]:
def read_sts_csv(dataset_type="train", columns=['source', 'type', 'year', 'id', 'score', 'sent_a', 'sent_b']):
  path = INPUT_PATH + "sts-"+ dataset_type + ".csv"
  """
  Take the input path and return the dataframe
  """
  df = pd.read_csv(path, names = columns, sep='\t', quoting=csv.QUOTE_NONE, header = None)
  return df

# df_<dataset_type> = read_sts_csv(dataset_type) # create the train, dev and test dataframes

In [7]:
#Downloading and unzipping the dataset
download_and_unzip("http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz", "/content/drive/MyDrive/Sem-7_NLP")

In [8]:
#Creating train, dev, and test dataframes
df_train = read_sts_csv("train")
df_dev = read_sts_csv("dev")
df_test = read_sts_csv("test")

In [9]:
#Printing shapes of the datasets
print(f"Train set shape: {df_train.shape}")
print(f"Dev set shape: {df_dev.shape}")
print(f"Test set shape: {df_test.shape}")

Train set shape: (5749, 7)
Dev set shape: (1500, 7)
Test set shape: (1379, 7)


In [10]:
df_train

Unnamed: 0,source,type,year,id,score,sent_a,sent_b
0,main-captions,MSRvid,2012test,1,5.00,A plane is taking off.,An air plane is taking off.
1,main-captions,MSRvid,2012test,4,3.80,A man is playing a large flute.,A man is playing a flute.
2,main-captions,MSRvid,2012test,5,3.80,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,main-captions,MSRvid,2012test,6,2.60,Three men are playing chess.,Two men are playing chess.
4,main-captions,MSRvid,2012test,9,4.25,A man is playing the cello.,A man seated is playing the cello.
...,...,...,...,...,...,...,...
5744,main-news,headlines,2016,1456,0.00,Severe Gales As Storm Clodagh Hits Britain,Merkel pledges NATO solidarity with Latvia
5745,main-news,headlines,2016,1465,0.00,Dozens of Egyptians hostages taken by Libyan t...,Egyptian boat crash death toll rises as more b...
5746,main-news,headlines,2016,1466,0.00,President heading to Bahrain,President Xi: China to continue help to fight ...
5747,main-news,headlines,2016,1470,0.00,"China, India vow to further bilateral ties",China Scrambles to Reassure Jittery Stock Traders


In [11]:
df_dev

Unnamed: 0,source,type,year,id,score,sent_a,sent_b
0,main-captions,MSRvid,2012test,0,5.00,A man with a hard hat is dancing.,A man wearing a hard hat is dancing.
1,main-captions,MSRvid,2012test,2,4.75,A young child is riding a horse.,A child is riding a horse.
2,main-captions,MSRvid,2012test,3,5.00,A man is feeding a mouse to a snake.,The man is feeding a mouse to the snake.
3,main-captions,MSRvid,2012test,7,2.40,A woman is playing the guitar.,A man is playing guitar.
4,main-captions,MSRvid,2012test,8,2.75,A woman is playing the flute.,A man is playing a flute.
...,...,...,...,...,...,...,...
1495,main-news,headlines,2016,1283,2.00,Scientists prove there is water on Mars,Has Nasa discovered water on Mars?
1496,main-news,headlines,2016,1434,0.00,Pranab stresses need to strive for peace by na...,WTO: India regrets action of developed nations
1497,main-news,headlines,2016,1440,2.00,Volkswagen skids into red in wake of pollution...,"Volkswagen's ""gesture of goodwill"" to diesel o..."
1498,main-news,headlines,2016,1477,0.00,Obama is right: Africa deserves better leadership,Obama waiting for midterm to name attorney gen...


In [12]:
df_test

Unnamed: 0,source,type,year,id,score,sent_a,sent_b
0,main-captions,MSRvid,2012test,24,2.5,A girl is styling her hair.,A girl is brushing her hair.
1,main-captions,MSRvid,2012test,33,3.6,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.
2,main-captions,MSRvid,2012test,45,5.0,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.
3,main-captions,MSRvid,2012test,63,4.2,A man is cutting up a cucumber.,A man is slicing a cucumber.
4,main-captions,MSRvid,2012test,66,1.5,A man is playing a harp.,A man is playing a keyboard.
...,...,...,...,...,...,...,...
1374,main-news,headlines,2016,1354,0.0,"Philippines, Canada pledge to further boost re...",Philippines saves 100 after ferry sinks
1375,main-news,headlines,2016,1360,1.0,Israel bars Palestinians from Jerusalem's Old ...,"Two-state solution between Palestinians, Israe..."
1376,main-news,headlines,2016,1368,1.0,How much do you know about Secret Service?,Lawmakers from both sides express outrage at S...
1377,main-news,headlines,2016,1420,0.0,Obama Struggles to Soothe Saudi Fears As Iran ...,Myanmar Struggles to Finalize Voter Lists for ...


## Hyperparameters: 5 Marks
Update this cell with you choosen parameters except, NUM_EPOCHS

In [45]:
NON_CONEXTUAL_MODEL_TYPE = 'fasttext'
CONEXTUAL_MODEL_TYPE = 'all-distilroberta-v1'
HUGGING_FACE_SENTENCE_TRANSFORMER_MODEL = 'sentence-transformers/all-distilroberta-v1' # USE THE HUGGAING FACE VERSION OF SENTENCE_TRANSFORMER_TYPE
INPUT_PATH = "/content/drive/MyDrive/Sem-7_NLP/stsbenchmark/"
BATCH_SIZE = 16
OUT_DIM_DENSE = 256
NUM_EPOCHS = 2 ## THIS IS FIXED DO NOT CHANGE

# You are free to add your own hyperparameters as well.

## CONFIGURATION 1: Non-contextual Embeddings + ML Regression: 8 marks
1 Load the non-contextual embedding model in variable `non_cont_model1`. **1 marks**

2 Get feature for the sentences using the LM model loaded before. Add the code in the `get_feature_model1()` **2 marks**

2 Using features as X and score as Y, train a ML based regression model (`model1`). You are free to choose any sklearn based regression method, and its hyperparameters. **3.5 marks**

3 Print the correlation scores on the dev and test set predictions using trained `model1`. **1.5 mark**



In [None]:
non_cont_model1 = gensim.downloader.load('fasttext-wiki-news-subwords-300')

In [None]:
def get_feature_model1(data_frame):
  """
  Input a data frame and return the embedding vectors for the each sentence column using non_cont_model1,
  Return 2 matrices each of shape (#_samples, #size_of_word_emb).
  """
  matrix_a = np.zeros((data_frame.shape[0], 300))
  matrix_b = np.zeros((data_frame.shape[0], 300))

  for sent_number in range(data_frame.shape[0]):
    sentence_a = data_frame['sent_a'][sent_number].lower().strip('.?!,').split()
    sentence_b = data_frame['sent_b'][sent_number].lower().strip('.?!,').split()

    effective_length_a = 0
    effective_length_b = 0

    for word_a in sentence_a:
      try:
        vec_a = non_cont_model1[word_a]
        matrix_a[sent_number] += vec_a
        effective_length_a += 1
      except:
        continue
    
    for word_b in sentence_b:
      try:
        vec_b = non_cont_model1[word_b]
        matrix_b[sent_number] += vec_b
        effective_length_b += 1
      except:
        continue

    # matrix_a[sent_number] /= np.linalg.norm(matrix_a[sent_number])
    # matrix_b[sent_number] /= np.linalg.norm(matrix_b[sent_number])

    # matrix_a[sent_number] /= effective_length_a
    # matrix_b[sent_number] /= effective_length_b
  
  return matrix_a, matrix_b

In [None]:
feature_1_c1_train, feature_2_c1_train = get_feature_model1(df_train)
feature_1_c1_dev, feature_2_c1_dev = get_feature_model1(df_dev)
feature_1_c1_test, feature_2_c1_test = get_feature_model1(df_test)

In [None]:
X_c1_train = np.zeros((df_train.shape[0], 1))
for i in range(df_train.shape[0]):
  X_c1_train[i] = distance.cosine(feature_1_c1_train[i], feature_2_c1_train[i])
Y_c1_train = df_train['score']

X_c1_dev = np.zeros((df_dev.shape[0], 1))
for i in range(df_dev.shape[0]):
  X_c1_dev[i] = distance.cosine(feature_1_c1_dev[i], feature_2_c1_dev[i])
Y_c1_dev = df_dev['score']

X_c1_test = np.zeros((df_test.shape[0], 1))
for i in range(df_test.shape[0]):
  X_c1_test[i] = distance.cosine(feature_1_c1_test[i], feature_2_c1_test[i])
Y_c1_test = df_test['score']

In [None]:
# Initiate a regression model and train it.
model1 = linear_model.LinearRegression(n_jobs=-1)
model1.fit(X_c1_train, Y_c1_train)

LinearRegression(n_jobs=-1)

In [None]:
configuration_1_predictions_train = model1.predict(X_c1_train)
configuration_1_predictions_dev = model1.predict(X_c1_dev)
configuration_1_predictions_test = model1.predict(X_c1_test)

In [None]:
# Print spearmanr correlation on the predicted output of the dev and test sets.

configuration_1_spearmanr_train = stats.spearmanr(Y_c1_train, configuration_1_predictions_train)
print('Spearman score on train set    raw:{}    rounded-off:{}'.format(configuration_1_spearmanr_train[0], round(configuration_1_spearmanr_train[0], 3)))

configuration_1_spearmanr_dev = stats.spearmanr(Y_c1_dev, configuration_1_predictions_dev)
print('Spearman score on dev set    raw:{}    rounded-off:{}'.format(configuration_1_spearmanr_dev[0], round(configuration_1_spearmanr_dev[0], 3)))

configuration_1_spearmanr_test = stats.spearmanr(Y_c1_test, configuration_1_predictions_test)
print('Spearman score on test set    raw:{}    rounded-off:{}'.format(configuration_1_spearmanr_test[0], round(configuration_1_spearmanr_test[0], 3)))

Spearman score on train set    raw:0.4071938317181819    rounded-off:0.407
Spearman score on dev set    raw:0.47238816488106206    rounded-off:0.472
Spearman score on test set    raw:0.3273576722401695    rounded-off:0.327


## CONFIGURATION 2: Contextual Embeddings + ML Regression: 7 marks
1 Load the contextual embedding model in variable `non_cont_model2`. **1 marks**

2 Get feature for the sentences using the LM model loaded before. Add the code in the `get_feature_model2()` **2 marks**

2 Using features as X and score as Y, train a ML based regression model (`model2`). You are free to choose any sklearn based regression method, and its hyperparameters. **3.5 marks**

3 Print the correlation scores on the dev and test set predictions using trained `model2`. **1.5 mark**

Useful references: https://www.sbert.net/docs/usage/semantic_textual_similarity.html

In [None]:
non_cont_model2 = SentenceTransformer('sentence-transformers/all-distilroberta-v1')

Downloading:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/653 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
def get_feature_model2(data_frame):
  """
  Input a data frame and return the embedding vectors for the each sentence column using model2,
  Return 2 matrices each of shape (#_samples, #size_of_word_emb).
  """
  matrix_a = np.zeros((data_frame.shape[0], 768))
  matrix_b = np.zeros((data_frame.shape[0], 768))

  matrix_a = non_cont_model2.encode(data_frame['sent_a'])
  matrix_b = non_cont_model2.encode(data_frame['sent_b'])

  return matrix_a, matrix_b

In [None]:
feature_1_c2_train, feature_2_c2_train = get_feature_model2(df_train)
feature_1_c2_dev, feature_2_c2_dev = get_feature_model2(df_dev)
feature_1_c2_test, feature_2_c2_test = get_feature_model2(df_test)

In [None]:
X_c2_train = np.zeros((df_train.shape[0], 1))
for i in range(df_train.shape[0]):
  X_c2_train[i] = distance.cosine(feature_1_c2_train[i], feature_2_c2_train[i])
Y_c2_train = df_train['score']

X_c2_dev = np.zeros((df_dev.shape[0], 1))
for i in range(df_dev.shape[0]):
  X_c2_dev[i] = distance.cosine(feature_1_c2_dev[i], feature_2_c2_dev[i])
Y_c2_dev = df_dev['score']

X_c2_test = np.zeros((df_test.shape[0], 1))
for i in range(df_test.shape[0]):
  X_c2_test[i] = distance.cosine(feature_1_c2_test[i], feature_2_c2_test[i])
Y_c2_test = df_test['score']

In [None]:
# Initiate a regression model and train it.
model2 = linear_model.LinearRegression(n_jobs=-1)
model2.fit(X_c2_train, Y_c2_train)

LinearRegression(n_jobs=-1)

In [None]:
configuration_2_predictions_train = model2.predict(X_c2_train)
configuration_2_predictions_dev = model2.predict(X_c2_dev)
configuration_2_predictions_test = model2.predict(X_c2_test)

In [None]:
# Print spearmanr correlation on the predicted output of the dev and test sets.

configuration_2_spearmanr_train = stats.spearmanr(Y_c2_train, configuration_2_predictions_train)
print('Spearman score on train set    raw:{}    rounded-off:{}'.format(configuration_2_spearmanr_train[0], round(configuration_2_spearmanr_train[0], 3)))

configuration_2_spearmanr_dev = stats.spearmanr(Y_c2_dev, configuration_2_predictions_dev)
print('Spearman score on dev set    raw:{}    rounded-off:{}'.format(configuration_2_spearmanr_dev[0], round(configuration_2_spearmanr_dev[0], 3)))

configuration_2_spearmanr_test = stats.spearmanr(Y_c2_test, configuration_2_predictions_test)
print('Spearman score on test set    raw:{}    rounded-off:{}'.format(configuration_2_spearmanr_test[0], round(configuration_2_spearmanr_test[0], 3)))

Spearman score on train set    raw:0.8288355549774815    rounded-off:0.829
Spearman score on dev set    raw:0.882977524937452    rounded-off:0.883
Spearman score on test set    raw:0.8251810313421228    rounded-off:0.825


## CONFIGURATION 3: Fine-Tune a Contextual Embeddings Model: 18 marks
1 Prepare data samples to be for the DL model to consume. Add the code in the `form_data()`. **4 marks**

3 Create the data loader, one each for train/dev/test data_input sample set obtained from `form_input_example()`. **1.5 marks**

4 Initiate `model3` consisting of **atleast** the following 3 components - `base_LM`, a `pooling_layer` and a `dense_layer`. Use appropriate activation function in dense. **Atleast** one layer of `base_LM` should be set to trainable. **5 marks**

6 Initiate the `loss`. **0.5 marks**

7 Fit the `model3`. Use `NUM_EPOCHS = 2`. **MAX_NUM_EPOCHS allowed will be 3**. **2 marks** 

8 Complete the `get_model_predicts()` to obtain predicted scores for input sentence pairs. **3.5 marks** 

9 Print the correlation scores on the dev and test set predictions. **1.5 mark**

Useful References: https://huggingface.co/blog/how-to-train-sentence-transformers 

In [46]:
def form_data(data_frame):
  """
  Input a data frame and return the dataloder.
  """
  samples = []
  for i in range(data_frame.shape[0]):
    inp_example = InputExample(texts = [data_frame['sent_a'][i], data_frame['sent_b'][i]], label = float(data_frame['score'][i])/5.0)
    samples.append(inp_example)
  
  myDataLoader = DataLoader(samples, batch_size=BATCH_SIZE, shuffle=True)
  return myDataLoader

In [47]:
def get_model_predicts(data_type, trained_model):
  """
  Input the dataset list and return a list of cosine similarity scores. Use the fitted final_trainable_model for obtaining encodings.
  """
  matrix_a = trained_model.encode(data_type['sent_a'])
  matrix_b = trained_model.encode(data_type['sent_b'])

  return matrix_a, matrix_b

In [48]:
dataloader_train = form_data(df_train)
dataloader_dev = form_data(df_dev)
dataloader_test = form_data(df_test)

In [49]:
base_model = models.Transformer('sentence-transformers/all-distilroberta-v1')

In [50]:
layer_pooling = models.Pooling(word_embedding_dimension=768)

In [51]:
layer_dense = models.Dense(in_features=layer_pooling.get_sentence_embedding_dimension(), out_features=OUT_DIM_DENSE, activation_function=torch.nn.Tanh())

In [52]:
model3 = SentenceTransformer(modules=[base_model, layer_pooling, layer_dense])

In [53]:
loss = losses.CosineSimilarityLoss(model=model3)

In [54]:
evaluator = EmbeddingSimilarityEvaluator(list(df_dev['sent_a']), list(df_dev['sent_b']), list(df_dev['score']/5.0))

In [None]:
model3.fit(
    train_objectives = [(dataloader_train, loss)],
    evaluator = evaluator,
    evaluation_steps = 500,
    epochs = NUM_EPOCHS,
    warmup_steps = 100
)

In [56]:
feature_1_c3_train, feature_2_c3_train = get_model_predicts(df_train, model3)
feature_1_c3_dev, feature_2_c3_dev = get_model_predicts(df_dev, model3)
feature_1_c3_test, feature_2_c3_test = get_model_predicts(df_test, model3)

In [57]:
X_c3_train = np.zeros((df_train.shape[0], 1))
for i in range(df_train.shape[0]):
  X_c3_train[i] = distance.cosine(feature_1_c3_train[i], feature_2_c3_train[i])
Y_c3_train = df_train['score']

X_c3_dev = np.zeros((df_dev.shape[0], 1))
for i in range(df_dev.shape[0]):
  X_c3_dev[i] = distance.cosine(feature_1_c3_dev[i], feature_2_c3_dev[i])
Y_c3_dev = df_dev['score']

X_c3_test = np.zeros((df_test.shape[0], 1))
for i in range(df_test.shape[0]):
  X_c3_test[i] = distance.cosine(feature_1_c3_test[i], feature_2_c3_test[i])
Y_c3_test = df_test['score']

In [58]:
# Initiate a regression model and train it.
final_trainable_model = linear_model.LinearRegression(n_jobs=-1)
final_trainable_model.fit(X_c3_train, Y_c3_train)

LinearRegression(n_jobs=-1)

In [59]:
configuration_3_predictions_train = final_trainable_model.predict(X_c3_train)
configuration_3_predictions_dev = final_trainable_model.predict(X_c3_dev)
configuration_3_predictions_test = final_trainable_model.predict(X_c3_test)

In [60]:
# Print spearmanr correlation on the predicted output of the dev and test sets.

configuration_3_spearmanr_train = stats.spearmanr(Y_c3_train, configuration_3_predictions_train)
print('Spearman score on train set    raw:{}    rounded-off:{}'.format(configuration_3_spearmanr_train[0], round(configuration_3_spearmanr_train[0], 3)))

configuration_3_spearmanr_dev = stats.spearmanr(Y_c3_dev, configuration_3_predictions_dev)
print('Spearman score on dev set    raw:{}    rounded-off:{}'.format(configuration_3_spearmanr_dev[0], round(configuration_3_spearmanr_dev[0], 3)))

configuration_3_spearmanr_test = stats.spearmanr(Y_c3_test, configuration_3_predictions_test)
print('Spearman score on test set    raw:{}    rounded-off:{}'.format(configuration_3_spearmanr_test[0], round(configuration_3_spearmanr_test[0], 3)))

Spearman score on train set    raw:0.9474680981508599    rounded-off:0.947
Spearman score on dev set    raw:0.8976547645906917    rounded-off:0.898
Spearman score on test set    raw:0.8560051114650525    rounded-off:0.856
