In [15]:
# !conda create --name faiss-tutorial python=3.12
# !activate faiss-tutorial
# !pip install jupyter
!python --version
print('---')
# !pip list

Python 3.12.3
---


In [16]:
# !pip install pandas

In [17]:
# !pip show pandas

In [8]:
import requests
from io import StringIO
import pandas as pd

In [9]:
res = 'SICK_train.txt'
# create dataframe
data = pd.read_csv((res), sep='\t')
data.head()

Unnamed: 0,pair_ID,sentence_A,sentence_B,relatedness_score,entailment_judgment
0,1,A group of kids is playing in a yard and an ol...,A group of boys in a yard is playing and a man...,4.5,NEUTRAL
1,2,A group of children is playing in the house an...,A group of kids is playing in a yard and an ol...,3.2,NEUTRAL
2,3,The young boys are playing outdoors and the ma...,The kids are playing outdoors near a man with ...,4.7,ENTAILMENT
3,5,The kids are playing outdoors near a man with ...,A group of kids is playing in a yard and an ol...,3.4,NEUTRAL
4,9,The young boys are playing outdoors and the ma...,A group of kids is playing in a yard and an ol...,3.7,NEUTRAL


In [13]:
# pandas.core.frame.DataFrame
type(data)

pandas.core.frame.DataFrame

In [14]:
# pandas.core.series.Series
type(data['sentence_A'])

pandas.core.series.Series

In [None]:
data['sentence_A']

In [18]:
# we take all samples from both sentence A and B
sentences = data['sentence_A'].tolist()
sentences[:5]

['A group of kids is playing in a yard and an old man is standing in the background',
 'A group of children is playing in the house and there is no man standing in the background',
 'The young boys are playing outdoors and the man is smiling nearby',
 'The kids are playing outdoors near a man with a smile',
 'The young boys are playing outdoors and the man is smiling nearby']

In [19]:
# we take all samples from both sentence A and B
sentences = data['sentence_A'].tolist()
sentence_b = data['sentence_B'].tolist()
sentences.extend(sentence_b)  # merge them
len(set(sentences))  # together we have ~4.5K unique sentences

4802

In [20]:
data_file_names = [
    'MSRpar.train.tsv',
    'MSRpar.test.tsv',
    'OnWN_2012.test.tsv',
    'OnWN_2013.test.tsv',
    'OnWN_2014.test.tsv',
    'images_2014.test.tsv',
    'images_2015.test.tsv'
]

In [21]:
# each of these dataset have the same structure, so we loop through each creating our sentences data
for file_name in data_file_names:
    # extract to dataframe
    data = pd.read_csv(file_name, sep='\t', header=None, on_bad_lines='skip')
    # add to columns 1 and 2 to sentences list
    sentences.extend(data[1].tolist())
    sentences.extend(data[2].tolist())

In [22]:
len(set(sentences))

14505

In [23]:
# remove duplicates and NaN
sentences = [word for word in list(set(sentences)) if type(word) is str]

In [27]:
# !pip install sentence_transformers

In [31]:
from sentence_transformers import SentenceTransformer
# initialize sentence transformer model
# model = SentenceTransformer('bert-base-nli-mean-tokens')
model = SentenceTransformer('D:/models/bert-base-nli-mean-tokens')

In [32]:
%%time
# create sentence embeddings
sentence_embeddings = model.encode(sentences)
sentence_embeddings.shape

CPU times: total: 15min 45s
Wall time: 3min 1s


(14504, 768)

In [35]:
# !pip install faiss-cpu

In [36]:
import faiss

In [37]:
d = sentence_embeddings.shape[1]
d

768

In [38]:
index = faiss.IndexFlatL2(d)

In [39]:
index.is_trained

True

In [40]:
index.add(sentence_embeddings)

In [41]:
index.ntotal

14504

In [42]:
k = 4
xq = model.encode(["Someone sprints with a football"])

In [43]:
%%time
D, I = index.search(xq, k)  # search
print(I)

[[  289  1762 10124  7291]]
CPU times: total: 0 ns
Wall time: 8.68 ms


In [44]:
print(D)

[[54.623802 54.853516 57.356255 57.905872]]


In [67]:
print(type(I[0]))
print(I[0])

<class 'numpy.ndarray'>
[  289  1762 10124  7291]


In [58]:
results = [sentences[i] for i in I[0]]
results

['A group of football players is running in the field',
 'A group of people playing football is running in the field',
 'Two groups of people are playing football',
 'A person playing football is running past an official carrying a football']

In [66]:
sentence_embeddings[I[0]][:]

array([[ 0.01627037,  0.22325917, -0.15037376, ...,  0.18459873,
        -0.89362496,  0.24901016],
       [-0.0358859 ,  0.07184523, -0.12571451, ...,  0.24695738,
        -0.8649769 ,  0.3482986 ],
       [ 0.22949532,  0.21166386, -0.1031185 , ...,  0.5247168 ,
        -0.5153686 ,  0.44770485],
       [-0.09046145,  0.3252644 , -0.21553473, ...,  0.10242786,
        -0.7339727 ,  0.43178767]], dtype=float32)

# Speed