# Import Libraries and Data

In [1]:
import numpy as np
import pandas as pd
# Let pandas show up to 300 columns and 300 rows before it truncates a
# displayed frame. (The former pd.get_option(...) reads were dropped: their
# return values were discarded, so they had no effect.)
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)

import matplotlib.pyplot as plt
%matplotlib inline

import os
from os.path import join as opj
import gc

# Directory (relative to the notebook) from which train.csv, test.csv and
# sample_submission.csv are read below.
INPUT_PATH = '../../input/feedback-prize-english-language-learning/'

In [2]:
# Read the three competition tables that live under INPUT_PATH.
train_df, test_df, sub_df = (
    pd.read_csv(opj(INPUT_PATH, file_name))
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Echo each frame's (rows, columns) as a quick sanity check on the load.
for label, frame in (
    ('train_df.shape = ', train_df),
    ('test_df.shape = ', test_df),
    ('sub_df.shape = ', sub_df),
):
    print(label, frame.shape)

train_df.shape =  (3911, 8)
test_df.shape =  (3, 2)
sub_df.shape =  (3, 7)


# Search for Similar Texts with Sentence-BERT

In [25]:
from sentence_transformers import SentenceTransformer

# Name of the pretrained checkpoint handed to SentenceTransformer; the
# resulting `model` is what later cells call `.encode(text)` on.
SBERT_CHECKPOINT = 'sentence-transformers/bert-base-nli-mean-tokens'
model = SentenceTransformer(SBERT_CHECKPOINT)

In [11]:
# Load the feedback-prize-2021 ("fb1") training table and list its distinct ids.
fb1_train_df = pd.read_csv('../../input/feedback-prize-2021/train.csv')
fb1_id_col = fb1_train_df['id']
unique_ids_fb1 = sorted(fb1_id_col.unique())
print('len(unique_ids_fb1) = ', len(unique_ids_fb1))

# Membership masks: which rows of one dataset carry an id that also occurs
# in the other dataset (train_df['text_id'] vs. fb1_train_df['id']).
fb3_row_in_fb1 = train_df['text_id'].isin(fb1_id_col)
fb1_row_in_fb3 = fb1_id_col.isin(train_df['text_id'])

# Keep only the rows that are NOT shared, with a fresh 0..n-1 index.
fb3_only_df = train_df.loc[~fb3_row_in_fb1].reset_index(drop=True)
fb1_only_df = fb1_train_df.loc[~fb1_row_in_fb3].reset_index(drop=True)
print('fb3_only_df.shape = ', fb3_only_df.shape)
print('fb1_only_df.shape = ', fb1_only_df.shape)

len(unique_ids_fb1) =  15594
fb3_only_df.shape =  (3459, 8)
fb1_only_df.shape =  (139860, 8)


In [33]:
from tqdm import tqdm

# Distinct ids of the fb1-only rows, in sorted order. fb1_embed_list[i] is
# the embedding of the essay whose id is fb1_unique_ids[i].
fb1_unique_ids = sorted(fb1_only_df['id'].unique())
fb1_text_base_path = '../../input/feedback-prize-2021/train/'

# Embed every essay file <id>.txt under fb1_text_base_path, one at a time.
fb1_embed_list = []
for text_id in tqdm(fb1_unique_ids):
    text_path = opj(fb1_text_base_path, text_id+'.txt')
    # Use a context manager so each file handle is closed as soon as it has
    # been read, instead of leaving thousands of handles to the garbage collector.
    with open(text_path) as text_file:
        text = text_file.read()
    embed = model.encode(text)
    fb1_embed_list.append(embed)

100%|██████████| 15142/15142 [02:29<00:00, 101.62it/s]


In [37]:
# Stack the per-essay vectors row-wise into a single 2-D matrix
# (this is np.vstack spelled out: promote each item to 2-D, then join on axis 0).
fb1_embeds = np.concatenate([np.atleast_2d(vec) for vec in fb1_embed_list], axis=0)
fb1_embeds.shape

(15142, 768)

In [35]:
# One embedding per row of fb3_only_df['full_text'], in row order.
fb3_embed_list = [
    model.encode(essay)
    for essay in tqdm(fb3_only_df['full_text'].values)
]

100%|██████████| 3459/3459 [00:33<00:00, 102.63it/s]


In [36]:
# Stack the per-essay vectors row-wise into a single 2-D matrix
# (this is np.vstack spelled out: promote each item to 2-D, then join on axis 0).
fb3_embeds = np.concatenate([np.atleast_2d(vec) for vec in fb3_embed_list], axis=0)
fb3_embeds.shape

(3459, 768)

In [43]:
def _unit_rows(mat):
    """Return `mat` with every row divided by its Euclidean (L2) norm."""
    return mat / np.linalg.norm(mat, axis=1, keepdims=True)


# Dot products between unit-length rows are cosine similarities:
# sim_matrix[i, j] compares row i of fb1_embeds with row j of fb3_embeds.
sim_matrix = _unit_rows(fb1_embeds) @ _unit_rows(fb3_embeds).T

In [44]:
# Display the matrix dimensions: (rows of fb1_embeds, rows of fb3_embeds).
sim_matrix.shape

(15142, 3459)

In [53]:
# Highest similarity each sim_matrix row reaches against any column. It does
# not depend on the threshold, so reduce the full matrix once instead of
# once per threshold.
best_sim_per_row = sim_matrix.max(axis=1)

# For each candidate threshold, count the rows whose best match exceeds it.
for th in [0.5,0.6,0.7,0.8,0.9,0.95,0.99]:
    print(th, (best_sim_per_row>th).sum())

0.5 15142
0.6 15130
0.7 14646
0.8 10621
0.9 3351
0.95 55
0.99 1


In [61]:
# True for each sim_matrix row whose largest entry exceeds 0.9.
has_close_match = sim_matrix.max(axis=1) > 0.9

# Select the ids at those row positions (result is a NumPy array, not a list).
similar_data_ids_list = np.array(fb1_unique_ids)[has_close_match]

In [64]:
# Collect [id, essay text] pairs for every selected id by reading
# <id>.txt from fb1_text_base_path.
res = []
for text_id in tqdm(similar_data_ids_list):
    text_path = opj(fb1_text_base_path, text_id+'.txt')
    # Context manager guarantees the handle is closed right after the read.
    with open(text_path) as text_file:
        text = text_file.read()
    res.append([text_id, text])

100%|██████████| 3351/3351 [00:00<00:00, 34786.44it/s]


In [68]:
# Tabulate the collected [id, text] pairs.
similar_df = pd.DataFrame(res, columns=['id','full_text'])

# exist_ok=True keeps this cell re-runnable: without it, os.makedirs raises
# FileExistsError once 'result' already exists from a previous run.
# (`os` is already imported in the notebook's first cell.)
os.makedirs('result', exist_ok=True)
similar_df.to_csv('result/similar_df.csv', index=False)

In [69]:
# Preview the first rows of the saved frame (columns: id, full_text).
similar_df.head()

Unnamed: 0,id,full_text
0,000E6DE9E817,Dear: Principal\n\nI am arguing against the po...
1,0056F3D261D5,Millions of students from various cultures and...
2,005D28D3FEC2,"Dear principal,\n\nI think that it is wrong fo..."
3,007812CC14B2,School isn't the funneist things that kids wan...
4,008015604AA0,Distance learning provides an opportunity for ...
