#Using the dataset to find which 5 sentences resemble each other the most.
###The dataset contains 500 sentences, and in those 500 sentences, there are 100 groups with 5 sentences each. The concepts used in this example are text preprocessing steps such as Tokenizing and Lemmetizing, as well as a statiscal comparison concept called cosine similarity. By the end, the output will result in 100 groups, each containing 5 sentences that are similar in structure and topic.

###Getting the data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import json

In [None]:
data = json.load(open('/content/sample_data/text_cosine_sim.json'))

In [None]:
df = pd.DataFrame(data)

In [None]:
data[:10]

['<start> A man rows a boat against a night sky . <end>',
 '<start> Two dogs running through a field . <end>',
 '<start> A small boy wearing a red helmet rides his bicycle down a patterned path . <end>',
 '<start> A black dog standing in a shallow area of water on a rocky beach . <end>',
 '<start> Two white dogs are running together . <end>',
 '<start> Two children carry flowers as they walk along a grassy track . <end>',
 '<start> Several young people sitting on a rail above a crowded beach . <end>',
 '<start> two kids playing on the beach , close to the water <end>',
 '<start> Two dogs are playing outside in a field . <end>',
 '<start> A small dog in front of a yellow tube on grass . <end>']

In [None]:
def clean_sentence(sentence):
  return sentence.replace('<start>', '').replace('<end>', '').strip()

df[0] = df[0].apply(clean_sentence)

In [None]:
df.head()

Unnamed: 0,0
0,A man rows a boat against a night sky .
1,Two dogs running through a field .
2,A small boy wearing a red helmet rides his bic...
3,A black dog standing in a shallow area of wate...
4,Two white dogs are running together .


#Preprocessing the data

###Packages needed for preprocessing

In [None]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

###Preprocessing data via stop words removal, puncuation removal, making everything lowercase, tokenizing & lemmetizing

In [None]:
import string
string.punctuation

#defining the function to remove punctuation
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree

#storing the puntuation free text
df['clean_msg']= df[0].apply(lambda x:remove_punctuation(x))
df.head()

Unnamed: 0,0,clean_msg
0,A man rows a boat against a night sky .,A man rows a boat against a night sky
1,Two dogs running through a field .,Two dogs running through a field
2,A small boy wearing a red helmet rides his bic...,A small boy wearing a red helmet rides his bic...
3,A black dog standing in a shallow area of wate...,A black dog standing in a shallow area of wate...
4,Two white dogs are running together .,Two white dogs are running together


In [None]:
df['msg_lower']= df['clean_msg'].apply(lambda x: x.lower())
df.head()

Unnamed: 0,0,clean_msg,msg_lower
0,A man rows a boat against a night sky .,A man rows a boat against a night sky,a man rows a boat against a night sky
1,Two dogs running through a field .,Two dogs running through a field,two dogs running through a field
2,A small boy wearing a red helmet rides his bic...,A small boy wearing a red helmet rides his bic...,a small boy wearing a red helmet rides his bic...
3,A black dog standing in a shallow area of wate...,A black dog standing in a shallow area of wate...,a black dog standing in a shallow area of wate...
4,Two white dogs are running together .,Two white dogs are running together,two white dogs are running together


In [None]:
#defining function for tokenization
import re
def tokenization(text):
    tokens = re.split('W+',text)
    return tokens

#applying function to the column
df['msg_tokenied']= df['msg_lower'].apply(lambda x: tokenization(x))

df.head()

Unnamed: 0,0,clean_msg,msg_lower,msg_tokenied
0,A man rows a boat against a night sky .,A man rows a boat against a night sky,a man rows a boat against a night sky,[a man rows a boat against a night sky ]
1,Two dogs running through a field .,Two dogs running through a field,two dogs running through a field,[two dogs running through a field ]
2,A small boy wearing a red helmet rides his bic...,A small boy wearing a red helmet rides his bic...,a small boy wearing a red helmet rides his bic...,[a small boy wearing a red helmet rides his bi...
3,A black dog standing in a shallow area of wate...,A black dog standing in a shallow area of wate...,a black dog standing in a shallow area of wate...,[a black dog standing in a shallow area of wat...
4,Two white dogs are running together .,Two white dogs are running together,two white dogs are running together,[two white dogs are running together ]


In [None]:
import nltk
#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')

#defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    output= [i for i in text.split() if i not in stopwords]
    return output

#converting the 'msg_tokenied' column to a list of strings
df['msg_tokenied'] = df['msg_tokenied'].apply(lambda x: ' '.join(x))

#applying the function
df['no_stopwords']= df['msg_tokenied'].apply(lambda x:remove_stopwords(x))

df.head()

Unnamed: 0,0,clean_msg,msg_lower,msg_tokenied,no_stopwords
0,A man rows a boat against a night sky .,A man rows a boat against a night sky,a man rows a boat against a night sky,a man rows a boat against a night sky,"[man, rows, boat, night, sky]"
1,Two dogs running through a field .,Two dogs running through a field,two dogs running through a field,two dogs running through a field,"[two, dogs, running, field]"
2,A small boy wearing a red helmet rides his bic...,A small boy wearing a red helmet rides his bic...,a small boy wearing a red helmet rides his bic...,a small boy wearing a red helmet rides his bic...,"[small, boy, wearing, red, helmet, rides, bicy..."
3,A black dog standing in a shallow area of wate...,A black dog standing in a shallow area of wate...,a black dog standing in a shallow area of wate...,a black dog standing in a shallow area of wate...,"[black, dog, standing, shallow, area, water, r..."
4,Two white dogs are running together .,Two white dogs are running together,two white dogs are running together,two white dogs are running together,"[two, white, dogs, running, together]"


In [None]:
wordnet_lemmatizer = WordNetLemmatizer()
#defining the function for lemmatization
def lemmatizer(text):
  lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
  return lemm_text
df['msg_lemmatized']=df['no_stopwords'].apply(lambda x:lemmatizer(x))

df.head()

Unnamed: 0,0,clean_msg,msg_lower,msg_tokenied,no_stopwords,msg_lemmatized
0,A man rows a boat against a night sky .,A man rows a boat against a night sky,a man rows a boat against a night sky,a man rows a boat against a night sky,"[man, rows, boat, night, sky]","[man, row, boat, night, sky]"
1,Two dogs running through a field .,Two dogs running through a field,two dogs running through a field,two dogs running through a field,"[two, dogs, running, field]","[two, dog, running, field]"
2,A small boy wearing a red helmet rides his bic...,A small boy wearing a red helmet rides his bic...,a small boy wearing a red helmet rides his bic...,a small boy wearing a red helmet rides his bic...,"[small, boy, wearing, red, helmet, rides, bicy...","[small, boy, wearing, red, helmet, ride, bicyc..."
3,A black dog standing in a shallow area of wate...,A black dog standing in a shallow area of wate...,a black dog standing in a shallow area of wate...,a black dog standing in a shallow area of wate...,"[black, dog, standing, shallow, area, water, r...","[black, dog, standing, shallow, area, water, r..."
4,Two white dogs are running together .,Two white dogs are running together,two white dogs are running together,two white dogs are running together,"[two, white, dogs, running, together]","[two, white, dog, running, together]"


In [None]:
df_clean = df.drop(columns = ['clean_msg', 'msg_lower', 'msg_tokenied', 'no_stopwords',0], axis = 1)

In [None]:
df_clean

Unnamed: 0,msg_lemmatized
0,"[man, row, boat, night, sky]"
1,"[two, dog, running, field]"
2,"[small, boy, wearing, red, helmet, ride, bicyc..."
3,"[black, dog, standing, shallow, area, water, r..."
4,"[two, white, dog, running, together]"
...,...
495,"[dog, pee, side, fountain, another, dog, sniff..."
496,"[naked, cyclist, body, paint, ride, fair]"
497,"[young, boy, red, jacket, stand, pier, fishing..."
498,"[two, teenage, boy, black, swim, trunk, play, ..."


### Embedding the Dataframe using HuggingFace and Cosine Similarity

In [None]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence-transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
import requests

model_id = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
hf_token = "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"

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
headers = {"Authorization": f"Bearer {hf_token}"}

In [None]:
clean_list = df_clean.msg_lemmatized.values.tolist()

In [None]:
embeddings = model_id.encode(clean_list)

In [None]:
embeddings

array([[-0.07689139,  0.00770481, -0.10853241, ..., -0.00246774,
        -0.00958476,  0.06447588],
       [-0.00480425,  0.00706998,  0.00176357, ...,  0.0257101 ,
         0.06779863,  0.04190665],
       [-0.00998503,  0.13586283, -0.06358889, ...,  0.03945745,
        -0.02927957, -0.02571913],
       ...,
       [-0.05008044,  0.0845782 , -0.06225432, ...,  0.03660065,
         0.02111953, -0.02488592],
       [ 0.02321813,  0.06567498, -0.04918887, ...,  0.00776033,
         0.0480459 , -0.02588855],
       [-0.1427026 ,  0.08211615, -0.0939654 , ..., -0.02043889,
         0.02893897, -0.07106553]], dtype=float32)

In [None]:
embeddings_df = pd.DataFrame(embeddings)

In [None]:
embeddings_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.076891,0.007705,-0.108532,-0.050881,-0.035976,-0.038728,0.096930,-0.047266,0.062420,0.006144,...,0.066822,0.053679,-0.047086,0.037311,0.023224,0.017602,0.083807,-0.002468,-0.009585,0.064476
1,-0.004804,0.007070,0.001764,0.063856,-0.109481,0.005896,0.057539,-0.004777,0.142790,-0.004816,...,-0.006676,-0.057505,-0.025194,-0.032500,-0.012929,0.037728,0.094200,0.025710,0.067799,0.041907
2,-0.009985,0.135863,-0.063589,-0.004577,-0.017755,-0.061733,0.122463,0.049083,-0.036147,0.014010,...,0.101217,-0.039260,-0.006624,0.063269,-0.069282,-0.019087,0.097086,0.039457,-0.029280,-0.025719
3,-0.080706,0.049601,-0.101516,0.078982,-0.032140,0.046386,0.107744,-0.023313,0.111290,-0.044842,...,0.013612,-0.087143,0.020784,-0.004069,-0.046191,0.035512,0.131344,-0.018692,-0.024333,0.055299
4,0.021236,0.040587,-0.091409,0.040727,-0.043032,-0.025294,0.081732,0.023443,0.067391,-0.006983,...,0.014138,-0.115509,0.030021,0.031452,-0.046546,0.017271,0.107470,-0.000683,0.053986,0.011645
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,-0.037873,0.005519,0.068155,0.046334,-0.137009,-0.023870,0.162592,-0.018227,0.136886,-0.038709,...,0.073145,0.020276,-0.010256,-0.081920,-0.014543,0.038579,0.170776,0.086327,0.022679,0.047811
496,0.021879,0.143314,-0.032774,0.045223,0.043429,-0.035142,0.061510,0.030242,0.015146,0.001659,...,0.053672,-0.007902,-0.043233,0.097561,-0.097235,0.080353,0.011610,0.001136,-0.090239,-0.030895
497,-0.050080,0.084578,-0.062254,-0.035472,0.001884,-0.006641,0.122387,0.036016,0.029189,0.003632,...,0.130582,-0.087504,-0.034845,0.055108,-0.059068,0.037318,0.183346,0.036601,0.021120,-0.024886
498,0.023218,0.065675,-0.049189,-0.032083,-0.075113,-0.054973,0.032310,0.007272,0.071210,0.000134,...,0.093823,-0.057369,-0.045225,0.071365,-0.070099,-0.042509,0.147010,0.007760,0.048046,-0.025889


###Using cosine similarity to normalize the data and find the most similar sentences based on vectors of an inner product space.

In [None]:
df_cs = pd.DataFrame(cosine_similarity(X=embeddings))

In [None]:
df_cs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,1.000000,0.230578,0.292304,0.301875,0.292143,0.204113,0.182456,0.235894,0.230578,0.201514,...,0.217749,0.301875,0.322502,0.279350,0.230578,0.251100,0.175826,0.322502,0.229102,0.356499
1,0.230578,1.000000,0.328268,0.640684,0.646493,0.638058,0.407267,0.641915,1.000000,0.600439,...,0.456252,0.640684,0.365389,0.619409,1.000000,0.557298,0.150124,0.365389,0.626579,0.327191
2,0.292304,0.328268,1.000000,0.416659,0.350917,0.499912,0.348369,0.492883,0.328268,0.770148,...,0.223500,0.416659,0.691755,0.403472,0.328268,0.270508,0.210554,0.691755,0.429003,0.537318
3,0.301875,0.640684,0.416659,1.000000,0.558221,0.351065,0.272592,0.350677,0.640684,0.600464,...,0.230866,1.000000,0.437234,0.770321,0.640684,0.546939,0.214890,0.437234,0.341334,0.447735
4,0.292143,0.646493,0.350917,0.558221,1.000000,0.626725,0.484693,0.620456,0.646493,0.366460,...,0.483807,0.558221,0.373787,0.485214,0.646493,0.230189,0.197831,0.373787,0.603476,0.378563
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.251100,0.557298,0.270508,0.546939,0.230189,0.216966,0.144690,0.211334,0.557298,0.492624,...,0.138499,0.546939,0.294854,0.531529,0.557298,1.000000,0.262886,0.294854,0.250367,0.280411
496,0.175826,0.150124,0.210554,0.214890,0.197831,0.186914,0.149883,0.179821,0.150124,0.178771,...,0.137595,0.214890,0.243909,0.175489,0.150124,0.262886,1.000000,0.243909,0.245247,0.272327
497,0.322502,0.365389,0.691755,0.437234,0.373787,0.560768,0.605859,0.619451,0.365389,0.452757,...,0.205085,0.437234,1.000000,0.416028,0.365389,0.294854,0.243909,1.000000,0.577205,0.562396
498,0.229102,0.626579,0.429003,0.341334,0.603476,0.663754,0.654717,0.758618,0.626579,0.384653,...,0.467285,0.341334,0.577205,0.353054,0.626579,0.250367,0.245247,0.577205,1.000000,0.392673


####To explain the dataframe above in more detail, it is displaying values (from 0 to 1) of how similar each index is to each column. For instance, row 0 and column 0 are the same sentence and will result in 1. Where as combinations of row 0 and column 1 and vice versa will have the same comparison.

###Now, we will  enumerate over the data to create lists that present


1.   The row number
2.   The index position
3.   The cosine similarity value

###Then create another list showing the maximum value for each row in the dataset

In [None]:
t=[]

# Part 01: Create List of lists that shows 1. row number, 2. index position, 3. similarity value
for j,k in enumerate(df_cs.values):
    for n in range(len(k)):
        t.append([j,n,k[n]])

# Part 02: Removing the 1 for a 0 for when index and row are same
qq=[]
for i in range(len(t)):
    if t[i][0]==t[i][1]:
        qq.append([t[i][0],t[i][1],0])
    else:
        qq.append(t[i])
qq[:5]

[[0, 0, 0],
 [0, 1, 0.23057798],
 [0, 2, 0.29230407],
 [0, 3, 0.30187476],
 [0, 4, 0.29214323]]

In [None]:
from collections import defaultdict
u=defaultdict(list)

# Part 01:

for i in range(len(qq)):
    u[qq[i][0]].append(qq[i][2])

updated_df=pd.DataFrame(u)

# updated_df.max(axis=1)
# max(updated_df[0])
# np.argmax(updated_df[3])
# updated_df[3]

# Part 02:

position_maxVal=[]
for i in range(len(updated_df)):
    position_maxVal.append(np.argmax(updated_df[i]))

In [None]:
position_maxVal

[27,
 8,
 92,
 46,
 220,
 124,
 465,
 72,
 494,
 466,
 170,
 489,
 74,
 37,
 31,
 485,
 336,
 413,
 98,
 455,
 313,
 108,
 51,
 90,
 112,
 88,
 372,
 42,
 231,
 156,
 67,
 14,
 265,
 451,
 305,
 262,
 114,
 13,
 144,
 126,
 86,
 57,
 27,
 139,
 403,
 483,
 491,
 48,
 371,
 130,
 13,
 22,
 284,
 125,
 127,
 10,
 226,
 382,
 39,
 196,
 25,
 263,
 27,
 38,
 308,
 367,
 194,
 30,
 126,
 490,
 117,
 347,
 7,
 116,
 12,
 460,
 159,
 183,
 165,
 22,
 379,
 292,
 188,
 433,
 193,
 365,
 481,
 490,
 25,
 438,
 228,
 479,
 2,
 494,
 20,
 158,
 388,
 61,
 420,
 299,
 168,
 339,
 104,
 14,
 36,
 320,
 49,
 315,
 21,
 3,
 13,
 162,
 24,
 20,
 36,
 200,
 73,
 14,
 25,
 393,
 201,
 317,
 413,
 69,
 5,
 53,
 68,
 54,
 65,
 143,
 49,
 463,
 455,
 173,
 3,
 339,
 54,
 150,
 22,
 477,
 137,
 391,
 163,
 371,
 38,
 245,
 425,
 204,
 471,
 441,
 137,
 190,
 25,
 319,
 454,
 126,
 29,
 171,
 95,
 76,
 213,
 486,
 111,
 142,
 226,
 57,
 430,
 54,
 7,
 183,
 226,
 205,
 386,
 133,
 13,
 494,
 449,
 172,
 54,


####In the position_maxVal output, it displays the index of the maximum value for each row. So for row 0, the maximum cosine value is column 27.

In [None]:
updated_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0.000000,0.230578,0.292304,0.301875,0.292143,0.204113,0.182456,0.235894,0.230578,0.201514,...,0.217749,0.301875,0.322502,0.279350,0.230578,0.251100,0.175826,0.322502,0.229102,0.356499
1,0.230578,0.000000,0.328268,0.640684,0.646493,0.638058,0.407267,0.641915,1.000000,0.600439,...,0.456252,0.640684,0.365389,0.619409,1.000000,0.557298,0.150124,0.365389,0.626579,0.327191
2,0.292304,0.328268,0.000000,0.416659,0.350917,0.499912,0.348369,0.492883,0.328268,0.770148,...,0.223500,0.416659,0.691755,0.403472,0.328268,0.270508,0.210554,0.691755,0.429003,0.537318
3,0.301875,0.640684,0.416659,0.000000,0.558221,0.351065,0.272592,0.350677,0.640684,0.600464,...,0.230866,1.000000,0.437234,0.770321,0.640684,0.546939,0.214890,0.437234,0.341334,0.447735
4,0.292143,0.646493,0.350917,0.558221,0.000000,0.626725,0.484693,0.620456,0.646493,0.366460,...,0.483807,0.558221,0.373787,0.485214,0.646493,0.230189,0.197831,0.373787,0.603476,0.378563
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.251100,0.557298,0.270508,0.546939,0.230189,0.216966,0.144690,0.211334,0.557298,0.492624,...,0.138499,0.546939,0.294854,0.531529,0.557298,0.000000,0.262886,0.294854,0.250367,0.280411
496,0.175826,0.150124,0.210554,0.214890,0.197831,0.186914,0.149883,0.179821,0.150124,0.178771,...,0.137595,0.214890,0.243909,0.175489,0.150124,0.262886,0.000000,0.243909,0.245247,0.272327
497,0.322502,0.365389,0.691755,0.437234,0.373787,0.560768,0.605859,0.619451,0.365389,0.452757,...,0.205085,0.437234,1.000000,0.416028,0.365389,0.294854,0.243909,0.000000,0.577205,0.562396
498,0.229102,0.626579,0.429003,0.341334,0.603476,0.663754,0.654717,0.758618,0.626579,0.384653,...,0.467285,0.341334,0.577205,0.353054,0.626579,0.250367,0.245247,0.577205,0.000000,0.392673


###Using brute force to find the most similar sentences for comparison
The idea is to iterate through the data row by row, and find the 5 highest cosine values. Once found, they are stored in the answers_all list, and that is done over the entire dataset. Print(answers_all) will show the entire output of the list.

In [None]:
used_index = set()
answers_all = []
for index, row in updated_df.iterrows():
  if index in used_index:
    continue
  sorted_index = np.argsort(row.values)[::-1]
  current_answer = [index]
  used_index.add(index)
  count = 0
  for i in sorted_index:
      if i not in used_index:
          used_index.add(i)
          current_answer.append(i)
          count += 1
          if count == 4:
              break
  answers_all.append(current_answer)

Once retrieved, we can then match our original DATA (not dataframe) to the results of the for loop and separate each entry with a dotted line for ease of use.

In [None]:
for answers_indexes in answers_all:
  answer_sentence = np.take(data, answers_indexes)
  print(answer_sentence)
  print('------------------------------------------------')

['<start> A man rows a boat against a night sky . <end>'
 '<start> A man stands in the middle of two woman dressed in brightly colored costumes . <end>'
 '<start> A man stands with her arms folded outside a travel agency . <end>'
 '<start> A man is doing a jump on bicycle in an indoor skate and bike center . <end>'
 '<start> A man sitting on a bench with his bicycle leaning against a railing . <end>']
------------------------------------------------
['<start> Two dogs running through a field . <end>'
 '<start> Two dogs are fighting and playing with each other while running through some grass . <end>'
 '<start> Two dogs are playing outside in a field . <end>'
 '<start> Two dogs are playing in the grass . <end>'
 '<start> Two dogs swimming in a pool . <end>']
------------------------------------------------
['<start> A small boy wearing a red helmet rides his bicycle down a patterned path . <end>'
 '<start> a small boy plays in the sprinkler . <end>'
 '<start> The small boy is running th

###Problems with this method
####The problem with using brute force to iterate over the data and find the highest values is that it iterates sequentially, so it starts with row 0 and works its way through the dataframe that way. The problem arises when we get further and further into the data. By the end of the rows, we have very few sentences to choose from, and therefore the sentences are not as simiar to the first groups