In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV and image paths used later
# in this notebook are located under this mount point.
drive.mount('/content/drive')

In [None]:

pip install faiss-cpu


In [None]:
import pandas as pd
import torch
import torchvision.models as models
import torch.nn as nn
import torchvision.transforms as transforms
from PIL import Image
import requests
from io import BytesIO
import os
import numpy as np

import faiss

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


from tqdm import tqdm
# Register tqdm with pandas so that pandas objects gain `progress_apply`,
# which is used later when computing the image embeddings.
tqdm.pandas()


In [None]:
# Dimensionality of the embedding vectors: output width of the model's final
# linear layer and the dimension of the FAISS indexes built later.
EMBEDDING_SHAPE = 512

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Load the product table from Google Drive. Later cells read its 'price' and
# 'img_idx' columns.
df = pd.read_csv('/content/drive/MyDrive/AAA/PriceAlchemists/amazon_product_description.csv')

In [None]:
# Display the loaded table for a quick visual check.
df

In [None]:
def parse_price(price_str):
    """Convert a raw price value into a float.

    Accepted inputs:
      * a single price string such as ``"$12.99"`` or ``"$1,234.50"``
        (``$``, spaces and commas are stripped; a comma is treated as a
        thousands separator);
      * a range such as ``"$10.00 - $20.00"``, which is reduced to the
        midpoint of its two bounds;
      * a value that is already numeric, which is returned as a float, so
        applying this function twice to the same column does not turn
        previously parsed prices into NaN.

    Parameters:
        price_str: raw value taken from the price column.

    Returns:
        float: the parsed price, or ``np.nan`` when the value is missing or
        cannot be parsed.
    """
    # Already-numeric values (including NaN) pass through unchanged.
    is_number = isinstance(price_str, (int, float, np.integer, np.floating))
    if is_number and not isinstance(price_str, bool):
        return float(price_str)
    if not isinstance(price_str, str):
        return np.nan

    cleaned = price_str.replace('$', '').replace(',', '').replace(' ', '')
    try:
        if '-' in cleaned:
            # Unpacking raises ValueError unless there are exactly two bounds,
            # so malformed ranges such as "1-2-3" fall through to NaN.
            low, high = cleaned.split('-')
            return (float(low) + float(high)) / 2
        return float(cleaned)
    except ValueError:
        return np.nan

In [None]:
# Replace the raw 'price' column with the values returned by parse_price
# (floats, with NaN where parse_price could not produce a number).
# The column is overwritten in place, so the raw strings are no longer
# available after this cell has run.
df['price'] = df['price'].apply(parse_price)

In [None]:
# Seed the RNG so that the randomly initialised projection layer below, and
# therefore every embedding, is identical across kernel restarts.
torch.manual_seed(42)

# Pretrained ResNet-50 with its last two child modules dropped (for
# torchvision's ResNet these are the global average pool and the classifier),
# leaving a convolutional feature extractor.
backbone = nn.Sequential(*list(models.resnet50(pretrained=True).children())[:-2])

model = nn.Sequential(
    backbone,
    nn.AdaptiveAvgPool2d((1, 1)),
    nn.Flatten(),
    # NOTE: no training step for this layer is visible in this notebook, so it
    # acts as a fixed random projection from 2048 to EMBEDDING_SHAPE features.
    nn.Linear(2048, EMBEDDING_SHAPE),
)

# Move to the target device and switch the fully assembled model (not just the
# backbone) to inference mode.
model = model.to(device).eval()

In [None]:
# Preprocessing applied to every image before it is passed to the model:
# resize, centre-crop to 224, convert to a tensor, then normalise each channel
# with the mean / std values given below.
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

In [None]:
def get_img_embedding(
    img_idx,
    model,
    transform,
    img_dir='/content/drive/MyDrive/AAA/PriceAlchemists/bprateek.amazon_product_description',
):
    """Compute the embedding of one product image.

    Parameters:
        img_idx: identifier of the image; the file ``<img_dir>/<img_idx>.jpg``
            is loaded.
        model: callable mapping a preprocessed image batch to embeddings.
        transform: preprocessing callable applied to the PIL image; its result
            must support ``.unsqueeze(0)`` (i.e. be a tensor).
        img_dir (str): directory that contains the image files.

    Returns:
        numpy.ndarray: the model output for this image with all singleton
        dimensions removed.

    Raises:
        OSError: if the image file is missing or cannot be read.
    """
    img_path = os.path.join(img_dir, f'{img_idx}.jpg')
    # The context manager releases the file handle as soon as the pixels have
    # been decoded by convert().
    with Image.open(img_path) as img_file:
        img = img_file.convert('RGB')

    # Add a leading batch dimension and move the input to the model's device.
    batch = transform(img).unsqueeze(0).to(device)
    with torch.no_grad():
        embedding = model(batch).squeeze().cpu().numpy()
    # Return the embedding itself (not the preprocessed input tensor).
    return embedding

In [None]:
# Compute one embedding per row from its 'img_idx' value, showing a progress
# bar while the images are processed.
df['Embedding'] = df['img_idx'].progress_apply(
    lambda image_id: get_img_embedding(image_id, model, transform)
)

# Базовая регрессия стоимости с проверкой на валидации

In [None]:
# parse_price returns NaN for prices it cannot parse. Such rows have no usable
# target: they would make the neighbour-price means NaN and the metric
# computation fail, so they are excluded from the regression experiment.
priced_df = df.dropna(subset=['price'])

X_train, X_test, y_train, y_test = train_test_split(
    priced_df['Embedding'].values,
    priced_df['price'].values,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Stack the per-sample embeddings of each split into a single array
# (one row per sample, assuming every embedding is a 1-D vector).
embedding_train, embedding_test = np.vstack(X_train), np.vstack(X_test)

In [None]:
# Build a FAISS IndexFlatL2 of dimension EMBEDDING_SHAPE and add only the
# training embeddings, so test items are searched against training data only.
# NOTE(review): the name `index` is rebound later in the notebook to an index
# over all rows; re-running the evaluation cell after that point would search
# the wrong index.
index = faiss.IndexFlatL2(EMBEDDING_SHAPE)
index.add(embedding_train)

In [None]:
# For every test embedding, look up its 10 nearest training embeddings.
distances, indices = index.search(embedding_test, k=10)

# Predicted price of a test item = mean price of its retrieved neighbours.
y_pred = np.array([y_train[row].mean() for row in indices])

# Absolute / squared error metrics.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Relative error metrics, expressed in percent of the true price.
abs_pct_error = np.abs((y_test - y_pred) / y_test)
mdape = np.median(abs_pct_error) * 100
mape = np.mean(abs_pct_error) * 100

print(
    f"MAE:  {mae:.2f}",
    f"RMSE: {rmse:.2f}",
    f"R²:   {r2:.4f}",
    f"MdAPE: {mdape:.2f}%",
    f"MAPE: {mape:.2f}%",
    sep="\n",
)

# Просмотр изображений на сходство

In [None]:
# Index over the embeddings of ALL rows of df, used for visual similarity
# search. Note that this rebinds `index`, replacing the train-only index
# created earlier in the notebook.
index = faiss.IndexFlatL2(EMBEDDING_SHAPE)
embedding_data = np.vstack(df['Embedding'].values)
index.add(embedding_data)

In [None]:
def show_nearest_neighbors(
    query_idx,
    df,
    index,
    k=5,
    img_dir='/content/drive/MyDrive/AAA/PriceAlchemists/bprateek.amazon_product_description',
):
    """
    Plot the image at position `query_idx` together with its k nearest images.

    The distance to each neighbour is shown in the panel title. The first
    result is labelled as the query, which assumes the query embedding itself
    is stored in `index` and is returned as its own nearest neighbour.

    Parameters:
        query_idx (int): positional index of the query row in `df`.
        df (pd.DataFrame): dataframe with the columns 'img_idx' and 'Embedding'.
        index (faiss.Index): FAISS index whose entries are in the same order
            as the rows of `df`.
        k (int): number of nearest neighbours to show (including the query).
        img_dir (str): directory that contains the `<img_idx>.jpg` files.
    """
    # Query vector as a single-row 2-D array, as expected by index.search.
    query_embedding = df.iloc[query_idx]['Embedding'].reshape(1, -1)

    # Find the k nearest entries.
    distances, indices = index.search(query_embedding, k)

    # One panel per result; squeeze=False keeps `axes` 2-D even when k == 1.
    fig, axes = plt.subplots(1, k, figsize=(3 * k, 4), squeeze=False)
    for i, (ax, idx, dist) in enumerate(zip(axes[0], indices[0], distances[0])):
        ax.axis('off')
        if idx < 0:
            # FAISS pads the result with -1 when the index holds fewer than k
            # vectors; such a label must not be used as a row position.
            continue
        img_id = df.iloc[idx]['img_idx']
        img_path = os.path.join(img_dir, f'{img_id}.jpg')
        try:
            with Image.open(img_path) as img_file:
                img = img_file.convert("RGB")
        except OSError:
            # Missing or unreadable image: report it and leave the panel empty.
            print(f"Could not open image: {img_path}")
            continue
        ax.imshow(img)
        if i == 0:
            ax.set_title(f'Запрос\nidx={idx}')
        else:
            ax.set_title(f'#{i}\nD={dist:.2f}')
    fig.tight_layout()
    plt.show()


In [None]:
# Show the row at position 34 together with its nearest neighbours
# (6 panels in total).
show_nearest_neighbors(query_idx=34, df=df, index=index, k=6)