In [None]:
!wget https://ai-data.obs.ru-moscow-1.hc.sbercloud.ru/%D0%98%D0%98_%D1%84%D0%BE%D1%82%D0%BE%D0%B1%D0%B0%D0%BD%D0%BA_%D0%B4%D0%BB%D1%8F_%D0%BF%D0%BE%D0%B8%D1%81%D0%BA%D0%BE%D0%B2%D0%BE%D0%B3%D0%BE_%D0%BE%D1%82%D1%80%D1%8F%D0%B4%D0%B0.zip

In [None]:
%%capture
!unzip ИИ_фотобанк_для_поискового_отряда.zip

In [None]:
%%capture
!pip install ruclip==0.0.1

In [None]:
import torch
import ruclip

from PIL import Image
import os
from tqdm.notebook import tqdm
from collections import Counter
import shutil
from zipfile import ZipFile

import numpy as np
import pandas as pd
from humanize import naturalsize
from scipy.spatial.distance import cosine

import pickle
import numpy as np

In [None]:
# Checkpoint identifier passed to ruclip.load; it is also reused further down
# to build the names of the saved parquet/feather embedding files.
model_name = "ruclip-vit-base-patch16-384"
# Load the model together with its preprocessing object on the CUDA device.
model, processor = ruclip.load(model_name, device="cuda")

In [None]:
# Directories whose files are listed and embedded by the extraction loop below.
# Presumably the folder produced by unzipping the archive — TODO confirm.
paths = ["Итоговый датасет/"]

In [None]:
def gen_batch(inputs, batch_size):
    """Yield consecutive slices of ``inputs`` with at most ``batch_size`` items.

    Args:
        inputs: A sliceable sequence that supports ``len()`` (list, tuple, str, ...).
        batch_size: Positive integer, the maximum number of items per batch.

    Yields:
        Slices of ``inputs`` in their original order; only the last slice may
        be shorter than ``batch_size``. Nothing is yielded for empty input.

    Raises:
        ValueError: If ``batch_size`` is not positive. Without this guard a
            zero or negative size would never advance past the end of
            ``inputs`` and the generator would loop forever. Because this is
            a generator, the error surfaces on the first iteration.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    for batch_start in range(0, len(inputs), batch_size):
        yield inputs[batch_start: batch_start + batch_size]

In [None]:
# Compute an image embedding for every file in every directory of `paths`.
# `results` collects [path, embedding tensor] pairs in sorted path order.
results = []
batch_size = 128

with torch.no_grad():  # inference only, no autograd bookkeeping needed
    for image_dir in tqdm(paths):
        data = sorted(os.path.join(image_dir, filename)
                      for filename in os.listdir(image_dir))
        batches = list(gen_batch(inputs=data, batch_size=batch_size))
        for batch in tqdm(batches):
            # Build the list of successfully opened paths alongside the images
            # instead of deleting from `batch` while iterating over it: the
            # deletion skipped the following element and left paths and images
            # misaligned, so embeddings were attributed to the wrong files.
            opened_paths = []
            pil_images = []
            for path2img in batch:
                try:
                    image = Image.open(path2img)
                except Exception as exc:  # unreadable / non-image file: skip, but say so
                    print(f"Skipping {path2img}: {exc}")
                    continue
                opened_paths.append(path2img)
                pil_images.append(image)
            if not pil_images:
                # Nothing readable in this batch; do not call the model on empty input.
                continue
            batch_torch = processor(images=pil_images,
                                    return_tensors='pt',
                                    padding=True)["pixel_values"].to("cuda")
            preds = model.encode_image(batch_torch)
            for path2img, embedding in zip(opened_paths, preds):
                results.append([path2img, embedding])

In [None]:
# Convert every embedding to a CPU numpy array, keeping the (path, vector) pairing.
# The tensor check makes the cell safe to re-run: on a second execution the
# vectors are already numpy arrays, which have no `.detach()`.
results = [
    (path2img,
     vector.detach().cpu().numpy() if torch.is_tensor(vector) else np.asarray(vector))
    for path2img, vector in results
]

In [None]:
# One row per image: the file path followed by one column per embedding component.
# The "embeddins_" prefix (sic) is kept as-is because later cells select columns by it.
embedding_dim = results[0][1].shape[-1]
embedding_columns = [f"embeddins_{position}" for position in range(embedding_dim)]
rows = [(image_path, *vector) for image_path, vector in results]
df = pd.DataFrame(rows, columns=["path"] + embedding_columns)

In [None]:
df.head()

In [None]:
# Persist the embedding table as Parquet; the file name is keyed by model_name.
df.to_parquet(f"data_{model_name}.parquet")

In [None]:
# Persist the same table as Feather (a later cell reads a data_*.feather file back).
df.to_feather(f"data_{model_name}.feather")

In [None]:
naturalsize(os.path.getsize(f"data_{model_name}.parquet"))

In [None]:
naturalsize(os.path.getsize(f"data_{model_name}.feather"))

In [None]:
!pip install faiss-gpu

In [None]:
# Keep only the vectors, dropping the paths; order matches `results`.
embeddings = [vector for _, vector in results]

In [None]:
# NOTE(review): "embeddings.pkl" is not written anywhere in the visible notebook, so this
# cell relies on a file produced elsewhere, and it overwrites the `embeddings` list that
# was just derived from `results`. Confirm which source is intended.
# pickle.load can execute arbitrary code while unpickling: only load files you created.
with open("embeddings.pkl", "rb") as f:
    embeddings = pickle.load(f)

In [None]:
def _embedding_to_numpy(item):
    """Return the embedding held by ``item`` as a numpy array.

    ``item`` may be a ``[path, vector]`` / ``(path, vector)`` pair (the layout
    of ``results``) or a bare vector; the vector may be a torch tensor or
    already array-like.
    """
    vector = item[1] if isinstance(item, (list, tuple)) else item
    if torch.is_tensor(vector):
        vector = vector.detach().cpu().numpy()
    return np.asarray(vector)


# Stack all vectors into one float32 matrix. The helper accepts both the
# pair-of-(path, tensor) layout and plain numpy vectors, so this cell works
# whichever of the two preceding cells produced `embeddings`, and can be re-run.
embeddings = np.array([_embedding_to_numpy(item) for item in embeddings]).astype('float32')

In [None]:
embeddings.shape

In [None]:
# Embedding dimensionality; used below to size the query vector and the FAISS index.
d = embeddings.shape[-1]

In [None]:
np.random.seed(1234)             # make reproducible

In [None]:
# A single random query of shape (1, d), cast to float32 like the indexed embeddings.
xq = np.random.random((1, d)).astype('float32')

In [None]:
import faiss

# Build ONE exact (brute-force) L2 index over the embeddings.
# Previously a GPU index was built and filled, then `index` was immediately
# rebound to a second CPU index, so the vectors were added twice and the GPU
# copy was never used. Now the GPU is used when this faiss build supports it
# and a device is present; otherwise the plain CPU index is used.
index_flat = faiss.IndexFlatL2(d)
if hasattr(faiss, "StandardGpuResources") and faiss.get_num_gpus() > 0:
    res = faiss.StandardGpuResources()  # must stay referenced while the GPU index is alive
    index = faiss.index_cpu_to_gpu(res, 0, index_flat)
else:
    index = index_flat
index.add(embeddings)

In [None]:
# Number of nearest neighbours requested from index.search below.
k = 5

In [None]:
%%time
D, I = index.search(xq, k)  

In [None]:
print(I)

In [None]:
# Candidate row indices for the brute-force cosine ranking in the next cell.
t = range(len(embeddings))

In [None]:
%%time
# Brute-force baseline: the 5 row indices with the highest cosine similarity to
# the query, in ascending order of similarity. `xq` has shape (1, d) but
# scipy's cosine() requires 1-D vectors, so the single query row is passed.
sorted(t, key=lambda x: 1 - cosine(xq[0], embeddings[x]))[-5:]

In [None]:
# Cosine similarity between the query and one specific row.
# NOTE(review): 3357 is presumably an index taken from an earlier search output — confirm.
# xq is (1, d); scipy's cosine() needs 1-D input, hence xq[0].
1 - cosine(xq[0], embeddings[3357])

In [None]:
# Cosine similarity between the query and row 8.
# NOTE(review): the original referenced `xb`, which is not defined anywhere in this
# notebook (NameError); `embeddings` is the indexed matrix here, so it is assumed
# to be what was meant — confirm. xq is (1, d); cosine() needs 1-D input.
1 - cosine(xq[0], embeddings[8])

In [None]:
!pip install openpyxl catboost

In [None]:
from catboost import CatBoostClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Label table; its "Названия" column is the join key used below.
target = pd.read_excel("./razm.xlsx")

# NOTE(review): this reads embeddings for "ruclip-vit-base-patch32-224", while the
# cells above save data_{model_name}.feather for a different model_name — confirm
# that this file exists and is the one intended.
data = pd.read_feather("./data_ruclip-vit-base-patch32-224.feather")
# Every column except "path" is an embedding component.
emb_shape = data.shape[-1] - 1
# Reduce the stored path to the bare file name so it can be matched with the labels.
# os.path.basename replaces x.split("/")[1], which returned the wrong part for
# nested directories and failed when the path had no "/" separator.
data["path"] = data["path"].apply(os.path.basename)
data = data.rename({"path": "Названия"}, axis=1)

# Left join: every embedded image is kept; images without a label row get NaN targets.
data = data.merge(target, on="Названия", how="left")

In [None]:
# Train one CatBoost classifier per label column on the embedding features and
# record its validation accuracy as [column, metric name, value].
# The feature column list is the same for every target, so it is built once.
feature_columns = [f"embeddins_{i}" for i in range(emb_shape)]

metrics = []
# The first column of `target` is skipped — presumably the file-name key; TODO confirm.
for column in target.columns[1:]:
    # The left merge that built `data` can leave rows without a label for this
    # column; exclude them so no missing targets are passed to the classifier.
    labelled = data[data[column].notna()]

    X_train, X_val, y_train, y_val = train_test_split(labelled[feature_columns],
                                                      labelled[column],
                                                      test_size=0.1,
                                                      random_state=42,
                                                      shuffle=True)

    catboost_model = CatBoostClassifier(task_type='GPU',
                                        verbose=100,
                                        iterations=100,
                                        random_seed=42)

    catboost_model.fit(X_train, y_train, eval_set=(X_val, y_val))

    metrics.append([column, accuracy_score.__name__,
                    accuracy_score(y_val, catboost_model.predict(X_val))])

    print(*metrics[-1])

print()
print(metrics)

In [None]:
metrics

In [None]:
metrics