In [1]:
# !pip install faiss-gpu
# # !pip install faiss-cpu
# !pip install sentence_transformers

In [2]:
import os
import re
import gc
import sys
import multiprocessing
import numpy as np
import pandas as pd
import faiss
pd.set_option('display.max_columns', None)
from tqdm.auto import tqdm
from copy import deepcopy
import torch
import blingfire as bf
from sentence_transformers import SentenceTransformer
from sentence_transformers import util
import tokenizers
import transformers

print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from pathlib import Path
from glob import glob
import warnings

# Silence every Python warning for the remainder of the session.
warnings.simplefilter('ignore')

# Path/identifier handed to `AutoTokenizer.from_pretrained` below.
# NOTE: a later cell rebinds `model` to a `CustomModel` instance.
model = "model/gte-small"
tokenizer = AutoTokenizer.from_pretrained(model)

# Device selection: use the second GPU when more than one is visible, the first
# GPU when exactly one is usable, and fall back to the CPU otherwise so that a
# later `.to(device)` does not fail on a machine without CUDA.
if torch.cuda.device_count() > 1:
    device = torch.device('cuda:1')
elif torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

tokenizers.__version__: 0.12.1
transformers.__version__: 4.18.0


In [3]:
class MeanPooling(nn.Module):
    """Average token vectors along dim 1, counting only positions kept by the mask."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean of ``last_hidden_state`` over dim 1.

        Args:
            last_hidden_state: Tensor whose dim 1 is the one being averaged.
            attention_mask: Mask with one entry per position; it is broadcast
                over the trailing feature dimension and cast to float.

        Returns:
            Tensor equal to ``sum(hidden * mask, dim=1) / sum(mask, dim=1)``.
        """
        # Give the mask a trailing feature axis so it lines up with the hidden states.
        mask = attention_mask[..., None].expand_as(last_hidden_state).float()
        masked_total = (last_hidden_state * mask).sum(dim=1)
        # Clamp the denominator so a fully masked row yields zeros instead of NaN.
        kept_count = mask.sum(dim=1).clamp(min=1e-9)
        return masked_total / kept_count

class WeightedLayerPooling(nn.Module):
    """Weighted average of the hidden states of the layers from ``layer_start`` on.

    Args:
        num_hidden_layers: Number of transformer layers; together with
            ``layer_start`` it sizes the default weight vector
            (``num_hidden_layers + 1 - layer_start`` entries).
        layer_start: Index of the first layer included in the average.
        layer_weights: Optional 1-D tensor with one weight per included layer.
            When None, a learnable ``nn.Parameter`` of ones is created.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights=None):
        super(WeightedLayerPooling, self).__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        self.layer_weights = layer_weights if layer_weights is not None \
            else nn.Parameter(
            torch.tensor([1] * (num_hidden_layers + 1 - layer_start), dtype=torch.float)
        )

    def forward(self, all_hidden_states):
        """Combine per-layer hidden states into a single tensor.

        Args:
            all_hidden_states: Either a 4-D tensor with the layer axis first
                (presumably ``(layer, batch, seq_len, hidden)`` -- TODO confirm
                with callers), or a tuple/list of equally shaped per-layer
                tensors, which is stacked along a new leading axis first.

        Returns:
            The weighted sum over the selected layers divided by the sum of
            the layer weights; the layer axis is reduced away.
        """
        # Accept a per-layer sequence directly instead of failing on tuple indexing.
        if isinstance(all_hidden_states, (tuple, list)):
            all_hidden_states = torch.stack(tuple(all_hidden_states))
        all_layer_embedding = all_hidden_states[self.layer_start:, :, :, :]
        # Shape the weights as (layers, 1, 1, 1) so they broadcast over each layer.
        weight_factor = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(all_layer_embedding.size())
        weighted_average = (weight_factor * all_layer_embedding).sum(dim=0) / self.layer_weights.sum()
        return weighted_average

class CustomModel(nn.Module):
    """Transformer backbone that returns one mean-pooled vector per input sequence.

    Args:
        cfg: Settings object. Only ``cfg.model`` is read, and only when
            ``config_path`` is None or ``pretrained`` is True.
        config_path: Optional path to a backbone config stored with
            ``torch.save``. When None, the config comes from
            ``AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)``.
        pretrained: When truthy, the backbone is created with
            ``AutoModel.from_pretrained``; otherwise it is built from the
            config alone via ``AutoModel.from_config``.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg

        if config_path is not None:
            # NOTE(review): torch.load unpickles the file, so only point this at
            # trusted config files.
            self.config = torch.load(config_path)
        else:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Dropout overrides that were tried and left disabled:
            # self.config.hidden_dropout = 0.
            # self.config.hidden_dropout_prob = 0.
            # self.config.attention_dropout = 0.
            # self.config.attention_probs_dropout_prob = 0.

        if not pretrained:
            self.model = AutoModel.from_config(self.config)
        else:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        # Gradient checkpointing is left disabled:
        # if self.cfg.gradient_checkpointing:
        #     self.model.gradient_checkpointing_enable

        self.pool = MeanPooling()
        # The dropout and linear head below are created and initialised but are
        # not used by forward(); presumably they are kept so checkpoints that
        # contain these parameters still load -- TODO confirm.
        self.fc_dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Re-initialise ``module`` in place if it is a Linear, Embedding or LayerNorm.

        Linear and Embedding weights are drawn from a normal distribution with
        std ``self.config.initializer_range``; a Linear bias and an Embedding
        padding row are zeroed. LayerNorm is reset to bias 0 and weight 1.
        Any other module type is left untouched.
        """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if isinstance(module, nn.Linear):
                if module.bias is not None:
                    module.bias.data.zero_()
            elif module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, inputs):
        """Run the backbone on ``inputs`` and mean-pool its first output.

        Args:
            inputs: Mapping unpacked as keyword arguments into the backbone;
                it must contain ``'attention_mask'``, which is used for pooling.

        Returns:
            The pooled tensor produced by ``self.pool``. No L2 normalisation
            is applied here.
        """
        token_states = self.model(**inputs)[0]
        return self.pool(token_states, inputs['attention_mask'])


In [None]:
# Folder that contains `config.pth` and the checkpoint file loaded below.
MODEL_NAME = "output_simcse_model"

# NOTE: this rebinds `model`, which the setup cell defined as the tokenizer path
# string; re-running that cell afterwards turns `model` back into a string.
# NOTE(review): torch.load unpickles the files, so only load trusted checkpoints.
model = CustomModel(cfg=None, config_path=MODEL_NAME + '/config.pth', pretrained=False)

# Load on the CPU first so the checkpoint can be read regardless of the device it
# was saved from; the weights are moved to `device` afterwards.
state = torch.load(MODEL_NAME + '/model-gte-small_fold0_best.pth', map_location=torch.device('cpu'))
model.load_state_dict(state['model'])
model.to(device)
# Switch to inference mode so dropout layers are disabled when embeddings are
# computed; call `model.train()` explicitly if the model is to be trained further.
model.eval()

In [4]:
model_name = "model/e5-base-v2/"
sentence_transformer = SentenceTransformer(model_name)
# NOTE(review): the E5 model card asks for a "passage: " / "query: " prefix on
# every input text -- confirm whether the texts below should be prefixed. Not
# changed here because queries elsewhere would have to be prefixed to match.

# Folder containing the per-file parquet shards ("" = current directory).
parquet_folder = ""
# Folder the FAISS index files are written to.
index_folder = "/kaggle/working"
# 1-based id of the first file to process; raise it to resume an interrupted run
# without re-encoding the files that were already indexed.
start_file_id = 1

file_names = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'number', 'o', 'other', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

# Build one inner-product FAISS index per parquet file.
for idx, filename in enumerate(file_names):
    file_id = idx + 1
    if file_id < start_file_id:
        continue

    print(f"Processing file_id: {file_id} - file_name: {filename}.parquet ......")

    parquet_path = os.path.join(parquet_folder, f"{filename}.parquet")
    df = pd.read_parquet(parquet_path)

    print(df.columns)
    print("Sample text: ", df.iloc[0]["text"])

    sentences = df["text"].tolist()
    # Only the text column is needed from here on; release the frame before encoding.
    del df

    # Embeddings are L2-normalised, so the inner product used by IndexFlatIP
    # equals cosine similarity.
    embeddings = sentence_transformer.encode(sentences, normalize_embeddings=True)
    # FAISS needs a C-contiguous float32 matrix; this is a no-op when `encode`
    # already returned one, avoiding the extra copy of a list round-trip.
    document_embeddings = np.ascontiguousarray(embeddings, dtype="float32")

    index = faiss.IndexFlatIP(document_embeddings.shape[1])
    index.add(document_embeddings)
    faiss_index_path = f"{index_folder}/wikipedia_embeddings_collection_{file_id}_{filename}.index"
    faiss.write_index(index, faiss_index_path)

    print(f"Faiss index saved to '{faiss_index_path}'")

Downloading (…)2e6d8/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)9e0ce2e6d8/README.md:   0%|          | 0.00/68.1k [00:00<?, ?B/s]

Downloading (…)0ce2e6d8/config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/66.8M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)2e6d8/tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

Downloading (…)9e0ce2e6d8/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)ce2e6d8/modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Processing file_id: 1 - file_name: a.parquet ......
Index(['id', 'title', 'text', 'categories'], dtype='object')
Sample text:  A & B High Performance Firearms was a competition pistol manufacturer. Products included the "Limited Class" and "Open Class" semi-automatic pistols, both available in .40 S&W; and .45 ACP. A & B sold directly to consumers. ==References== ==External links== Category:Defunct firearms manufacturers Category:Defunct manufacturing companies based in California


Batches:   0%|          | 0/13836 [00:00<?, ?it/s]

Faiss index saved to '/kaggle/working/wikipedia_embeddings_collection_1_a.index'
Processing file_id: 2 - file_name: b.parquet ......
Index(['id', 'title', 'text', 'categories'], dtype='object')
Sample text:  B & B Hospital (Baidya and Banskota Hospital) is a private hospital with the goal to provide health services to the community of Nepal founded in 1997. The hospital was established in 1977 in order to provide an extensive and affordable service to the community. B&B; was established by Dr. Jagdish Lal Baidya and Dr. Ashok K. Banskota. It is located over 2.26 acres and includes an educational wing called B&B; Medical Institute. B&B; Hospital's goal is to provide efficient healthcare in the country with many departments such as orthopedics, general surgery and urology, general medicine, plastic/cosmetic & maxillofacial surgery, gynecology and obstetrics, neuroscience, pediatrics, otorhinolaryngology, cardiology, oncology, cardiothoracic & vascular surgery, dental, psychiatry, dermato

Batches:   0%|          | 0/10206 [00:00<?, ?it/s]

Faiss index saved to '/kaggle/working/wikipedia_embeddings_collection_2_b.index'
Processing file_id: 3 - file_name: c.parquet ......


In [None]:
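# NOTE: disabled scratch code, kept for reference only. It merges the per-file
# FAISS indexes into four shard files (merged_1 .. merged_4) by reconstructing
# the stored vectors one at a time and adding them to a new index.
# Check before re-enabling: the merged index is `faiss.IndexFlatL2(384)`, whereas
# the index-building cell writes `faiss.IndexFlatIP` indexes, and the dimension
# 384 is hard-coded -- confirm both against the embedding model actually used.
# index_folder1 = "/kaggle/input/wikipedia-embeddings"
# index_folder2 = "/kaggle/input/wikipedia-faiss-index"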

# file_names = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'number', 'o', 'other', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

# merged_index = faiss.IndexFlatL2(384)
# for idx, filename in enumerate(file_names):
#     if (idx + 1) >= 7:
#         break
    
#     if (idx + 1) >= 12 and (idx + 1) <= 20:
#         indexname = f"wikipedia_embeddings_collection_{idx + 1}_{filename}.index"
#         print(f"Merge file {idx + 1} - {indexname}")
#         index = faiss.read_index(os.path.join(index_folder2, indexname))

#         num_vectors = index.ntotal
#         for i in range(num_vectors):
#             vec = index.reconstruct(i).reshape(-1, 384)
#             vec = np.array(vec).astype("float32")
#             merged_index.add(vec)
            
#     else:
#         indexname = f"wikipedia_embeddings_collection_{idx + 1}_{filename}.index"
#         print(f"Merge file {idx + 1} - {indexname}")
#         index = faiss.read_index(os.path.join(index_folder1, indexname))

#         num_vectors = index.ntotal
#         for i in range(num_vectors):
#             vec = index.reconstruct(i).reshape(-1, 384)
#             vec = np.array(vec).astype("float32")
#             merged_index.add(vec)
            
#     if (idx + 1) == 6:
#         merged_index_path = "/kaggle/working/merged_1.index"
#         faiss.write_index(merged_index, merged_index_path)

#         print(f"Merged index saved to '{merged_index_path}'")
        
#         del merged_index

        
# merged_index = faiss.IndexFlatL2(384)
# for idx, filename in enumerate(file_names):
#     if (idx + 1) <= 6:
#         continue
        
#     if (idx + 1) == 13:
#         break
    
#     if (idx + 1) >= 12 and (idx + 1) <= 20:
#         indexname = f"wikipedia_embeddings_collection_{idx + 1}_{filename}.index"
#         print(f"Merge file {idx + 1} - {indexname}")
#         index = faiss.read_index(os.path.join(index_folder2, indexname))

#         num_vectors = index.ntotal
#         for i in range(num_vectors):
#             vec = index.reconstruct(i).reshape(-1, 384)
#             vec = np.array(vec).astype("float32")
#             merged_index.add(vec)
            
#     else:
#         indexname = f"wikipedia_embeddings_collection_{idx + 1}_{filename}.index"
#         print(f"Merge file {idx + 1} - {indexname}")
#         index = faiss.read_index(os.path.join(index_folder1, indexname))

#         num_vectors = index.ntotal
#         for i in range(num_vectors):
#             vec = index.reconstruct(i).reshape(-1, 384)
#             vec = np.array(vec).astype("float32")
#             merged_index.add(vec)
            
#     if (idx + 1) == 12:
#         merged_index_path = "/kaggle/working/merged_2.index"
#         faiss.write_index(merged_index, merged_index_path)

#         print(f"Merged index saved to '{merged_index_path}'")
        
#         del merged_index

        
# merged_index = faiss.IndexFlatL2(384)
# for idx, filename in enumerate(file_names):
#     if (idx + 1) <= 12:
#         continue
        
#     if (idx + 1) == 20:
#         break
    
#     if (idx + 1) >= 12 and (idx + 1) <= 20:
#         indexname = f"wikipedia_embeddings_collection_{idx + 1}_{filename}.index"
#         print(f"Merge file {idx + 1} - {indexname}")
#         index = faiss.read_index(os.path.join(index_folder2, indexname))

#         num_vectors = index.ntotal
#         for i in range(num_vectors):
#             vec = index.reconstruct(i).reshape(-1, 384)
#             vec = np.array(vec).astype("float32")
#             merged_index.add(vec)
            
#     else:
#         indexname = f"wikipedia_embeddings_collection_{idx + 1}_{filename}.index"
#         print(f"Merge file {idx + 1} - {indexname}")
#         index = faiss.read_index(os.path.join(index_folder1, indexname))

#         num_vectors = index.ntotal
#         for i in range(num_vectors):
#             vec = index.reconstruct(i).reshape(-1, 384)
#             vec = np.array(vec).astype("float32")
#             merged_index.add(vec)
            
#     if (idx + 1) == 19:
#         merged_index_path = "/kaggle/working/merged_3.index"
#         faiss.write_index(merged_index, merged_index_path)

#         print(f"Merged index saved to '{merged_index_path}'")
        
#         del merged_index
        
# merged_index = faiss.IndexFlatL2(384)
# for idx, filename in enumerate(file_names):
#     if (idx + 1) <= 19:
#         continue
    
#     if (idx + 1) >= 12 and (idx + 1) <= 20:
#         indexname = f"wikipedia_embeddings_collection_{idx + 1}_{filename}.index"
#         print(f"Merge file {idx + 1} - {indexname}")
#         index = faiss.read_index(os.path.join(index_folder2, indexname))

#         num_vectors = index.ntotal
#         for i in range(num_vectors):
#             vec = index.reconstruct(i).reshape(-1, 384)
#             vec = np.array(vec).astype("float32")
#             merged_index.add(vec)
            
#     else:
#         indexname = f"wikipedia_embeddings_collection_{idx + 1}_{filename}.index"
#         print(f"Merge file {idx + 1} - {indexname}")
#         index = faiss.read_index(os.path.join(index_folder1, indexname))

#         num_vectors = index.ntotal
#         for i in range(num_vectors):
#             vec = index.reconstruct(i).reshape(-1, 384)
#             vec = np.array(vec).astype("float32")
#             merged_index.add(vec)
            
#     if (idx + 1) == 28:
#         merged_index_path = "/kaggle/working/merged_4.index"
#         faiss.write_index(merged_index, merged_index_path)

#         print(f"Merged index saved to '{merged_index_path}'")
        
#         del merged_index

In [None]:
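# NOTE: disabled scratch code, kept for reference only. It merges every index
# file found in `index_folder` into one index and writes it to merged.index.
# The hard-coded `IndexFlatL2(384)` must match the metric and dimension of the
# source indexes -- confirm before re-enabling.
# merged_index = faiss.IndexFlatL2(384)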
# # merged_index = faiss.read_index("/kaggle/input/wikipedia-embeddings/merged_1.index")
# index_folder = "/kaggle/input/wikipedia-embeddings"

# for idx, indexname in enumerate(os.listdir(index_folder)):
#     print(f"Merge file {idx + 1} - {indexname}")
#     index = faiss.read_index(os.path.join(index_folder, indexname))

#     num_vectors = index.ntotal
#     for i in range(num_vectors):
#         vec = index.reconstruct(i).reshape(-1, 384)
#         vec = np.array(vec).astype("float32")
#         merged_index.add(vec)

#     del index

# merged_index_path = "/kaggle/working/merged.index"
# faiss.write_index(merged_index, merged_index_path)

# print(f"Merged index saved to '{merged_index_path}'")