In [None]:
# Сходство картинки с текстом

# 0.49416
import os
import pandas as pd
from PIL import Image
import torch
from torch.nn import functional as F
from torchvision import transforms, models
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")

# ----------------------------
# Path configuration
# ----------------------------
# Input tables, image folders and the submission file written at the end.
TRAIN_TSV = "train_df.tsv"
TEST_TSV = "test_df.tsv"
TRAIN_IMG_DIR = "train"
TEST_IMG_DIR = "test"
OUTPUT_FILE = "submissionx2.tsv"

# ----------------------------
# 1. Load the data
# ----------------------------
# Both tables are tab-separated.
train_df = pd.read_csv(TRAIN_TSV, sep='\t')
test_df = pd.read_csv(TEST_TSV, sep='\t')

# ----------------------------
# 2. Image preprocessing
# ----------------------------
# Run on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Pretrained ResNet-50 used as a fixed feature extractor. The `weights=`
# argument replaces the deprecated `pretrained=True` flag and matches the way
# the other models in this file are created.
resnet = models.resnet50(weights="IMAGENET1K_V1")
resnet = torch.nn.Sequential(*list(resnet.children())[:-1])  # drop the last child module (the classification head)
resnet.eval()
resnet.to(device)

# Resize to 224x224, convert to a tensor and normalise per channel.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def extract_img_features(img_path):
    """Return the `resnet` feature vector for one image, or None if it is unreadable.

    Args:
        img_path: Path of the image file to open.

    Returns:
        The `resnet` output for the preprocessed image, moved to the CPU,
        squeezed and converted to a NumPy array. None when the file is missing
        or cannot be decoded, so the caller can substitute a placeholder.
    """
    try:
        # convert() returns a fully loaded copy, so the file handle can be
        # released as soon as the with-block ends.
        with Image.open(img_path) as raw:
            img = raw.convert("RGB")
    except (OSError, ValueError, SyntaxError) as e:
        # Only unreadable or corrupt files are tolerated. Failures in the model
        # or on the device are real errors and must not be turned into
        # placeholder features, so they are left to propagate.
        print(f"Ошибка при обработке {img_path}: {e}")
        return None

    img_tensor = preprocess(img).unsqueeze(0).to(device)
    with torch.no_grad():
        features = resnet(img_tensor)
    return features.cpu().squeeze().numpy()

# ----------------------------
# 3. Text preprocessing
# ----------------------------
# Sentence encoder for the text column. A multilingual alternative is
# 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'.
text_model = SentenceTransformer('cointegrated/rubert-tiny2')
text_model.to(device)


def encode_texts(texts):
    """Encode `texts` with `text_model` and return the embeddings as a NumPy array."""
    embeddings = text_model.encode(
        texts,
        show_progress_bar=False,
        convert_to_numpy=True,
    )
    return embeddings

# ----------------------------
# 4. Извлечение признаков
# ----------------------------
def build_features(df, img_dir, img_feat_dim=2048):
    """Build one feature row per record: image features followed by text embeddings.

    Args:
        df: Table with a 'filename' column and a 'text' column.
        img_dir: Directory that contains the image files.
        img_feat_dim: Length of the zero placeholder used when an image cannot
            be loaded. It must equal the length of the vectors returned by
            `extract_img_features`.

    Returns:
        A 2-D NumPy array whose rows are the image features concatenated with
        the text embedding of the same record.
    """
    # Kept function-local, as in the original: numpy is not imported at the top of this cell.
    import numpy as np

    img_feats = []
    for fname in df['filename']:
        # The image name is the segment after the first ':'. Names without a
        # colon are used as-is instead of raising IndexError.
        parts = fname.split(":")
        rel_name = parts[1] if len(parts) > 1 else parts[0]
        feat = extract_img_features(os.path.join(img_dir, rel_name))
        if feat is None:
            # A float32 placeholder keeps the matrix dtype independent of
            # whether any image failed to load.
            feat = np.zeros(img_feat_dim, dtype=np.float32)
        img_feats.append(feat)

    text_feats = encode_texts(df['text'].tolist())
    # Join image and text features column-wise.
    return np.concatenate([np.stack(img_feats), text_feats], axis=1)

# from tqdm import tqdm
# def build_features(df, img_dir):
#     img_paths = [os.path.join(img_dir, fname.split(":")[1]) for fname in df['filename']]
#     img_feats = []
#     for path in tqdm(img_paths, desc="Извлечение признаков изображений"):
#         feat = extract_img_features(path)
#         if feat is None:
#             feat = [0] * 2048  # заглушка
#         img_feats.append(feat)
#     text_feats = encode_texts(df['text'].tolist())
#     import numpy as np
#     combined = np.concatenate([img_feats, text_feats], axis=1)
#     return combined

# Build the combined image + text feature matrices for both splits.
print("Извлечение признаков для train...")
X_train = build_features(train_df, TRAIN_IMG_DIR)
y_train = train_df['mark'].values

print("Извлечение признаков для test...")
X_test = build_features(test_df, TEST_IMG_DIR)

# ----------------------------
# 5. Train the model
# ----------------------------
# Encode the 'mark' labels as integers so predictions can be mapped back later.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)

clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf.fit(X_train, y_train_enc)

# ----------------------------
# 6. Predict
# ----------------------------
y_pred_enc = clf.predict(X_test)
y_pred = le.inverse_transform(y_pred_enc)  # back to the original label values

# ----------------------------
# 7. Save the result
# ----------------------------
# The submission repeats the test rows and adds the predicted 'mark'.
result = pd.DataFrame({
    'filename': test_df['filename'],
    'text': test_df['text'],
    'mark': y_pred
})
result.to_csv(OUTPUT_FILE, index=False, sep='\t')
print(f"Результат сохранён в {OUTPUT_FILE}")

In [None]:
# Классификация изображений

# 0.9692
import torch
from torchvision import transforms
# from torchvision.datasets import CIFAR10
# # train_dataset = CIFAR10(root="./data", train=True, download=True, transform=transform)
# # val_dataset = CIFAR10(root="./data", train=False, download=True, transform=transform)
from torch import nn
from torchvision.models import resnet50, ResNet50_Weights
from transformers import Trainer, TrainingArguments
from transformers.modeling_outputs import ImageClassifierOutput
from sklearn.metrics import accuracy_score



# 1. Data preparation (variant where every class lives in its own folder)
# Resize to 224x224, convert to a tensor and normalise per channel.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

from torchvision.datasets import ImageFolder

# The "cifar10/test" folder doubles as the validation set passed to the Trainer.
train_dataset = ImageFolder(root="cifar10/train", transform=transform)
val_dataset = ImageFolder(root="cifar10/test", transform=transform)



# 1. Подготовка данных (Когда все в одной папке + file.csv)
# import os
# import pandas as pd
# from PIL import Image
# from torch.utils.data import Dataset

# class CSVDataset(Dataset):
#     def __init__(self, csv_file, img_root, transform=None):
#         self.df = pd.read_csv(csv_file)
#         self.img_root = img_root
#         self.transform = transform

#     def __len__(self):
#         return len(self.df)

#     def __getitem__(self, idx):
#         row = self.df.iloc[idx]
#         img_path = os.path.join(self.img_root, row['filename'])
#         image = Image.open(img_path).convert("RGB")
#         label = int(row['label'])

#         if self.transform:
#             image = self.transform(image)
#         return image, label

# # Использование:
# train_dataset = CSVDataset(
#     csv_file="train_labels.csv",
#     img_root="images/",
#     transform=transform
# )



# 2. Model
# ImageNet-pretrained ResNet-50 whose final fully connected layer is replaced
# by a new 10-output linear layer.
model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
model.fc = nn.Linear(model.fc.in_features, 10)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

class WrappedResNet50(nn.Module):
    """Adapter that exposes a classifier through `pixel_values` / `labels` keywords.

    The wrapped network receives `pixel_values` and its output is returned as
    the logits of an `ImageClassifierOutput`. When `labels` are given, the
    cross-entropy loss is included as well.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, pixel_values=None, labels=None, **kwargs):
        """Run the wrapped model; extra keyword arguments are accepted and ignored."""
        logits = self.model(pixel_values)
        if labels is None:
            # Inference call: no loss to report.
            return ImageClassifierOutput(loss=None, logits=logits)
        criterion = nn.CrossEntropyLoss()
        return ImageClassifierOutput(loss=criterion(logits, labels), logits=logits)


model_wrapped = WrappedResNet50(model)

# 3. Collate function
def collate_fn(examples):
    """Collate `(image_tensor, label)` samples into a batch dictionary.

    Returns:
        A dict with "pixel_values" (the image tensors stacked along a new
        first dimension) and "labels" (a `torch.long` tensor of the labels).
    """
    images = []
    targets = []
    for sample in examples:
        images.append(sample[0])
        targets.append(sample[1])
    batch = {
        "pixel_values": torch.stack(images),
        "labels": torch.tensor(targets, dtype=torch.long),
    }
    return batch

# 4. Метрика — обычный способ
def compute_metrics(eval_pred):
    """Compute accuracy from a `(logits, labels)` pair.

    The predicted class is the argmax over the last axis of the logits.

    Returns:
        A dict with the single key "accuracy".
    """
    scores, references = eval_pred
    predicted = scores.argmax(axis=-1)
    accuracy = accuracy_score(references, predicted)
    return {"accuracy": accuracy}

# 5. Training arguments and Trainer
# Per-epoch evaluation/saving, fp16 and best-model loading are left disabled
# (commented out) below.
training_args = TrainingArguments(
    output_dir="./cifar10-resnet50",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluation_strategy="epoch",
    # save_strategy="epoch",
    logging_steps=1000,
    num_train_epochs=3,
    # fp16=torch.cuda.is_available(),
    save_total_limit=2,
    # load_best_model_at_end=True,
    # metric_for_best_model="accuracy",
    # greater_is_better=True,
    report_to="none",
)

# The wrapped model, custom collate function and accuracy metric are plugged in here.
trainer = Trainer(
    model=model_wrapped,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
)

trainer.train()


# Model evaluation

from torchvision.datasets import ImageFolder

# Transforms: exactly the same as the ones used for training.
test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load the test data from the "test" folder
test_dataset = ImageFolder(root="cifar10/test", transform=test_transform)

# Intended to make the class order match CIFAR-10:
# ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
# NOTE(review): this only rebinds the `class_to_idx` attribute after the dataset
# has been constructed. Labels already attached to the samples are presumably
# unaffected, and the inference loop below never reads labels. Confirm whether
# this assignment is needed at all.
cifar10_classes = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
test_dataset.class_to_idx = {cls: idx for idx, cls in enumerate(cifar10_classes)}

# DataLoader: shuffle=False keeps batches in dataset order, which the filename
# list built from `test_dataset.imgs` later relies on.
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=2
)

# # Accuracy
# model_wrapped.eval()
# all_preds = []
# all_labels = []

# with torch.no_grad():
#     for batch in test_loader:
#         pixel_values = batch[0].to(model_wrapped.model.fc.weight.device)
#         labels = batch[1]
#         outputs = model_wrapped(pixel_values=pixel_values)
#         preds = outputs.logits.argmax(dim=-1).cpu()
#         all_preds.append(preds)
#         all_labels.append(labels)

# all_preds = torch.cat(all_preds)
# all_labels = torch.cat(all_labels)

# accuracy = (all_preds == all_labels).float().mean().item()
# print(f"Accuracy на папке test: {accuracy:.4f}")

# from pathlib import Path
# import pandas as pd

# filenames = [Path(p).name for p, _ in test_dataset.imgs]
# # filenames = [Path(p).relative_to("cifar10/test") for p, _ in test_dataset.imgs]

# df = pd.DataFrame({"filename": filenames, "label": all_preds})
# df.to_csv("predictions.csv", index=False)

from pathlib import Path
import pandas as pd
import torch

# Inference over the test loader: collect the argmax class index of every batch.
model_wrapped.eval()
all_preds = []

with torch.no_grad():
    for batch in test_loader:
        # Move the images to the device that holds the model's weights.
        pixel_values = batch[0].to(model_wrapped.model.fc.weight.device)
        outputs = model_wrapped(pixel_values=pixel_values)
        preds = outputs.logits.argmax(dim=-1).cpu()
        all_preds.append(preds)

all_preds = torch.cat(all_preds)

# File names taken from the dataset's image list (`.name` drops the directory part).
# NOTE(review): if two class folders contain files with the same name, the rows
# become indistinguishable in the CSV. Verify against the data layout.
filenames = [Path(p).name for p, _ in test_dataset.imgs]

# Save to CSV
df = pd.DataFrame({"filename": filenames, "label": all_preds.tolist()})
df.to_csv("predictions.csv", index=False)

In [None]:
# RAG система

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import time

# Model identifiers: the causal LM that writes answers and the sentence encoder
# used for retrieval.
name_model = "Qwen/Qwen3-4B"
name_encoder = 'all-MiniLM-L6-v2'

print("Загружаем модели...")
encoder = SentenceTransformer(name_encoder)
tokenizer = AutoTokenizer.from_pretrained(name_model, trust_remote_code=True)
# dtype and device placement are left to the library ("auto"). The commented
# expression is the explicit fp16-on-GPU alternative.
model = AutoModelForCausalLM.from_pretrained(
    name_model,
    torch_dtype="auto",#torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    trust_remote_code=True
)
model.eval()


# === 2. База знаний ===
# One fact per line. Trailing newlines are stripped and blank lines skipped, so
# empty strings are never embedded and the joined context has no empty rows.
with open('facts.txt', 'r', encoding='utf-8') as file:
    documents = [line.strip() for line in file if line.strip()]

if not documents:
    # Fail early with a clear message instead of an IndexError on the
    # embedding shape below.
    raise ValueError("facts.txt contains no non-empty lines; nothing to index")


# === 3. FAISS index ===
# Exact L2 search over the document embeddings. Row i of the index corresponds
# to documents[i].
doc_embeddings = encoder.encode(documents, convert_to_numpy=True)
index = faiss.IndexFlatL2(doc_embeddings.shape[1])
index.add(doc_embeddings)

# === 4. Основная функция ===
def rag_answer_clean(query: str, top_k: int = 3, sim_threshold: float = 0.4) -> str:
    """Answer `query` from the indexed documents: retrieve context, then generate.

    Args:
        query: The user question.
        top_k: Number of nearest documents requested from the index.
        sim_threshold: Minimum similarity of the best hit, where similarity is
            `1 / (1 + distance)`. Below it no generation is attempted.

    Returns:
        The generated answer, cut at a following "Вопрос:" line if the model
        starts a new question, or a fixed "not found" message when the best
        hit is not similar enough.
    """
    start = time.time()
    # Retrieval
    query_emb = encoder.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_emb, top_k)
    # Map the best (smallest) distance to a similarity in (0, 1].
    # NOTE(review): IndexFlatL2 presumably reports squared L2 distances. Keep
    # that in mind when tuning sim_threshold.
    min_dist = distances[0][0]
    similarity = 1 / (1 + min_dist)
    if similarity < sim_threshold:
        return "Информация по запросу не найдена."
    # The index pads with -1 when it holds fewer than top_k vectors. Without
    # this filter documents[-1] would silently be used as context.
    retrieved_docs = [documents[i] for i in indices[0] if i >= 0]
    context = "\n".join(retrieved_docs)
    # Build an explicit prompt
    prompt = f"""Ответь кратко.
Контекст:
{context}
Вопрос: {query}
Ответ:"""
    # Generation
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():  # no gradients are needed for inference
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            # Greedy decoding. No temperature is passed because it has no
            # effect without sampling and 0.0 is not a valid sampling temperature.
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )
    # Decode only the newly generated tokens. Searching the decoded prompt for
    # "Ответ:" breaks as soon as a retrieved document contains that marker.
    prompt_len = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
    # Drop a follow-up "Вопрос:" the model may have started.
    if "\nВопрос:" in answer:
        answer = answer.split("\nВопрос:")[0].strip()
    print(f"Время: {time.time() - start:.2f} сек")
    return answer

# Demo query; only the first line of the generated answer is printed.
q = "FX_company какие занимает места?"#"Капитализация компании Яндекс"
otvet = rag_answer_clean(q)
print("Вопрос:", q)
print("Ответ:", otvet.split("\n")[0])

In [None]:
# Предсказание целевой метки с помощью признаков и картинок:
# 0.2848454177769526 - 0.778

# 0.2848454177769526 - 0.778

import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from catboost import CatBoostRegressor

# -----------------------------
# 1. Feature lists used throughout the pipeline
# -----------------------------
# Numeric features (NOT categorical!)
NUM_FEATURES = ['mileage', 'latitude', 'longitude', 'crashes_count', 'doors_number']

# Categorical features (only the genuinely categorical ones)
CAT_FEATURES = ['equipment', 'body_type', 'drive_type', 'engine_type', 'color', 'pts',
                'audiosistema', 'diski', 'electropodemniki', 'fary', 'salon',
                'upravlenie_klimatom', 'usilitel_rul', 'steering_wheel', 'owners_count']

# Multi-valued features, handled separately
MULTI_FEATURES = [
    'aktivnaya_bezopasnost_mult', 'audiosistema_mult', 'shini_i_diski_mult',
    'electroprivod_mult', 'fary_mult', 'multimedia_navigacia_mult',
    'obogrev_mult', 'pamyat_nastroek_mult', 'podushki_bezopasnosti_mult',
    'pomosh_pri_vozhdenii_mult', 'protivoygonnaya_sistema_mult', 'salon_mult',
    'upravlenie_klimatom_mult'
]

# -----------------------------
# 2. Обработка числовых признаков
# -----------------------------
def preprocess_numeric(df, num_features=None):
    """Return a copy of `df` with numeric columns coerced to numbers.

    Args:
        df: Frame with 'crashes_count', 'doors_number', 'owners_count' and the
            columns listed in `num_features`.
        num_features: Names of the numeric columns. Defaults to the
            module-level `NUM_FEATURES`.

    Returns:
        A new DataFrame; the input is not modified.
        * 'crashes_count' / 'doors_number': non-numeric values become 0, dtype int.
        * 'owners_count': "> 3" is replaced by "4" and the column is stringified
          because it is treated as categorical.
        * every other numeric column: non-numeric values become the median of
          the successfully parsed values.
    """
    if num_features is None:
        num_features = NUM_FEATURES

    df = df.copy()
    # These two may arrive as strings; anything unparsable counts as 0.
    for col in ['crashes_count', 'doors_number']:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

    # owners_count: "> 3" -> "4", kept as text
    df['owners_count'] = df['owners_count'].replace({'> 3': '4'}).astype(str)

    for col in num_features:
        if col == 'owners_count':  # categorical, handled above
            continue
        # Coerce first and take the median of the parsed values. The median of
        # the raw column is undefined when the column holds strings.
        numeric = pd.to_numeric(df[col], errors='coerce')
        df[col] = numeric.fillna(numeric.median())
    return df

# Load both splits. The trailing slices are a disabled option for working on a subset.
train_df = pd.read_parquet("train_dataset.parquet")#[0:7000]
test_df = pd.read_parquet("test_dataset.parquet")#[0:2500]

# Add the image path of each record, derived from its ID
train_df["img_path"] = train_df["ID"].apply(lambda x: f"train_images/{x}_0.jpg")
test_df["img_path"] = test_df["ID"].apply(lambda x: f"test_images/{x}_0.jpg")

train_df = preprocess_numeric(train_df)
test_df = preprocess_numeric(test_df)

# -----------------------------
# 3. Обработка мультипризнаков: создадим бинарные флаги для частых значений
# -----------------------------
def _cell_items(cell):
    """Return the values stored in a multi-label cell as a list ([] if none)."""
    # Parquet list columns are commonly materialised as NumPy arrays rather
    # than Python lists, so all three container types are accepted.
    if isinstance(cell, (list, tuple, np.ndarray)):
        return list(cell)
    return []


def extract_top_multilabels(train_df, test_df, multi_cols, top_n=10):
    """Replace multi-valued columns by binary flags for their most frequent values.

    For every column in `multi_cols` the `top_n` most frequent values are
    determined on `train_df` only (None and the string 'None' are ignored).
    Each of them becomes a 0/1 column named "<column>_<value>" in both frames.

    Args:
        train_df: Training frame; defines which values get a flag.
        test_df: Test frame; receives the same flag columns.
        multi_cols: Names of the multi-valued columns.
        top_n: Number of most frequent values kept per column.

    Returns:
        `(train_df, test_df)` as new frames: the original multi-valued columns
        are removed and the flag columns appended. The inputs are not modified.
    """
    train_flags = {}
    test_flags = {}
    for col in multi_cols:
        train_items = [_cell_items(cell) for cell in train_df[col]]
        test_items = [_cell_items(cell) for cell in test_df[col]]

        # Every non-empty value seen in the training split
        values = [
            v
            for items in train_items
            for v in items
            if v is not None and v != 'None'
        ]
        # The top_n most frequent values
        top_vals = pd.Series(values, dtype=object).value_counts().head(top_n).index.tolist()

        for feat in top_vals:
            name = f"{col}_{feat}"
            train_flags[name] = [int(feat in items) for items in train_items]
            test_flags[name] = [int(feat in items) for items in test_items]

    # Drop the source columns and append all flags in one concat. This avoids
    # both mutating the caller's frames and adding columns one by one.
    train_out = pd.concat(
        [train_df.drop(columns=multi_cols), pd.DataFrame(train_flags, index=train_df.index)],
        axis=1,
    )
    test_out = pd.concat(
        [test_df.drop(columns=multi_cols), pd.DataFrame(test_flags, index=test_df.index)],
        axis=1,
    )
    return train_out, test_out

# Keep flags for the 8 most frequent values of every multi-valued column.
train_df, test_df = extract_top_multilabels(train_df, test_df, MULTI_FEATURES, top_n=8)

# Refresh the categorical list (the new multi-label flags are numeric!).
# owners_count is categorical as well (kept as a string). This statement only
# moves it to the end of the list, where it already is.
CAT_FEATURES = [col for col in CAT_FEATURES if col != 'owners_count'] + ['owners_count']

# -----------------------------
# 4. Обработка категориальных признаков: замена None на "MISSING"
# -----------------------------
def fill_cat_missing(df, cat_cols):
    """Return a copy of `df` whose categorical columns are strings without gaps.

    In every column of `cat_cols`, missing values and the literal strings
    "None" and "nan" all become "MISSING". Other columns are left unchanged.
    """
    placeholders = {"None": "MISSING", "nan": "MISSING"}
    result = df.copy()
    for name in cat_cols:
        as_text = result[name].fillna("MISSING").astype(str)
        # Stringified missing markers are normalised as well.
        result[name] = as_text.replace(placeholders)
    return result

train_df = fill_cat_missing(train_df, CAT_FEATURES)
test_df = fill_cat_missing(test_df, CAT_FEATURES)

# -----------------------------
# 5. Loading the embeddings (unchanged)
# -----------------------------
# Earlier variant that only loaded precomputed embeddings (disabled):
# EMBEDDINGS_DIR = "embeddings"
# train_emb_path = os.path.join(EMBEDDINGS_DIR, "train_embeddings.npy")
# test_emb_path = os.path.join(EMBEDDINGS_DIR, "test_embeddings.npy")

# train_features = np.load(train_emb_path)
# test_features = np.load(test_emb_path)

import torch
import torchvision.models as models
import torchvision.transforms as T
from PIL import Image
import numpy as np

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load ResNet18 without its last fully connected layer
model = models.resnet18(weights="IMAGENET1K_V1")
model = torch.nn.Sequential(*list(model.children())[:-1])  # drop the last layer (the 1000-class head)
model = model.to(device)
model.eval()

# Transforms (same as for ImageNet training)
transform = T.Compose([
    T.Resize(256),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def extract_features(img_path):
    """Return the `model` output for the image at `img_path` as a NumPy array.

    The image is opened, converted to RGB, passed through `transform` and fed
    to `model` as a batch of one, without gradient tracking.
    """
    rgb = Image.open(img_path).convert("RGB")
    batch = transform(rgb).unsqueeze(0).to(device)
    with torch.no_grad():
        embedding = model(batch)
    # Squeeze the singleton dimensions (presumably leaving a 512-element vector).
    squeezed = embedding.squeeze()
    return squeezed.cpu().numpy()

# # Для train
# print("Extracting train features...")
# train_features = np.array([extract_features(p) for p in train_df["img_path"]])

# # Для test
# print("Extracting test features...")
# test_features = np.array([extract_features(p) for p in test_df["img_path"]])

import os
import numpy as np
from tqdm import tqdm

EMBEDDINGS_DIR = "embeddings"
os.makedirs(EMBEDDINGS_DIR, exist_ok=True)

train_emb_path = os.path.join(EMBEDDINGS_DIR, "train_embeddings.npy")
test_emb_path = os.path.join(EMBEDDINGS_DIR, "test_embeddings.npy")


def _load_or_extract_embeddings(cache_path, img_paths, split_name):
    """Return image embeddings for `img_paths`, reusing the .npy cache when valid.

    Args:
        cache_path: Location of the cached array.
        img_paths: Image paths, one per data row.
        split_name: "train" or "test"; only used in the progress messages.

    Returns:
        An array with one embedding row per entry of `img_paths`.
    """
    if os.path.exists(cache_path):
        print(f"Loading cached {split_name} embeddings...")
        cached = np.load(cache_path)
        # A cache written for a different number of rows (e.g. after the data
        # was subsampled) would silently misalign features and targets.
        if len(cached) == len(img_paths):
            return cached
        print(
            f"Cached {split_name} embeddings have {len(cached)} rows, "
            f"expected {len(img_paths)}; re-extracting."
        )
    print(f"Extracting {split_name} features...")
    features = np.array([extract_features(p) for p in tqdm(img_paths)])
    np.save(cache_path, features)
    print(f"{split_name.capitalize()} embeddings saved to {cache_path}")
    return features


# --- Train embeddings ---
train_features = _load_or_extract_embeddings(train_emb_path, train_df["img_path"], "train")

# --- Test embeddings ---
test_features = _load_or_extract_embeddings(test_emb_path, test_df["img_path"], "test")

# -----------------------------
# 6. Assemble the final dataset
# -----------------------------
# Image embeddings as named numeric columns
num_feature_names = [f'img_feat_{i}' for i in range(train_features.shape[1])]
train_num_df = pd.DataFrame(train_features, columns=num_feature_names)
test_num_df = pd.DataFrame(test_features, columns=num_feature_names)

# The embedding frames carry a fresh 0..n-1 index, so every other part is
# re-indexed the same way. pd.concat(axis=1) aligns on the index and would
# otherwise misalign rows whenever the source frames are not 0..n-1.

# Numeric features from the tabular data
train_num_meta = train_df[NUM_FEATURES].reset_index(drop=True)
test_num_meta = test_df[NUM_FEATURES].reset_index(drop=True)

# Binary multi-label flags, named "<multi column>_<value>". They were created
# earlier but never added to the feature matrix.
multi_flag_prefixes = tuple(f"{col}_" for col in MULTI_FEATURES)
MULTI_FLAG_FEATURES = [
    col for col in train_df.columns
    if col.startswith(multi_flag_prefixes) and col in test_df.columns
]
train_flags = train_df[MULTI_FLAG_FEATURES].reset_index(drop=True)
test_flags = test_df[MULTI_FLAG_FEATURES].reset_index(drop=True)

# Categorical features
train_cat = train_df[CAT_FEATURES].reset_index(drop=True)
test_cat = test_df[CAT_FEATURES].reset_index(drop=True)

# All features: embeddings + numeric metadata + multi-label flags + categorical
X_train = pd.concat([train_num_df, train_num_meta, train_flags, train_cat], axis=1)
X_test = pd.concat([test_num_df, test_num_meta, test_flags, test_cat], axis=1)

# Sanity checks: numeric features are numbers, categorical features are strings
assert X_train[NUM_FEATURES].dtypes.apply(lambda x: np.issubdtype(x, np.number)).all()
assert X_train[CAT_FEATURES].dtypes.apply(lambda x: x == 'object').all()

# -----------------------------
# 7. Train CatBoost with a validation split
# -----------------------------
y_train = train_df["price_TARGET"].values

# Hold out 10% of train as a validation set for early stopping
from sklearn.model_selection import train_test_split
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

# MAE is both the training loss and the metric monitored for early stopping.
cat_model = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.03,
    depth=6,
    loss_function='MAE',
    eval_metric='MAE',
    cat_features=CAT_FEATURES,
    early_stopping_rounds=100,
    verbose=100,
    random_seed=42,
    use_best_model=True
)

cat_model.fit(
    X_train_split, y_train_split,
    eval_set=(X_val, y_val),
    plot=False
)

# -----------------------------
# 8. Predict and save
# -----------------------------
preds = cat_model.predict(X_test)

# Submission: one predicted target per test ID.
submission = pd.DataFrame({
    "ID": test_df["ID"],
    "target": preds
})
submission.to_csv("submissionV17_mape.csv", index=False)

In [None]:
# Предсказание целевой метки с помощью признаков и картинок (худшая версия):
# 0.307417972160942 - 0.765

import pandas as pd
import os

# Load both splits. The trailing slices are a disabled option for working on a subset.
train_df = pd.read_parquet("train_dataset.parquet")#[0:7000]
test_df = pd.read_parquet("test_dataset.parquet")#[0:2500]

# Add the image path of each record, derived from its ID
train_df["img_path"] = train_df["ID"].apply(lambda x: f"train_images/{x}_0.jpg")
test_df["img_path"] = test_df["ID"].apply(lambda x: f"test_images/{x}_0.jpg")
import torch
import torchvision.models as models
import torchvision.transforms as T
from PIL import Image
import numpy as np

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load ResNet18 without its last fully connected layer
model = models.resnet18(weights="IMAGENET1K_V1")
model = torch.nn.Sequential(*list(model.children())[:-1])  # drop the last layer (the 1000-class head)
model = model.to(device)
model.eval()

# Transforms (same as for ImageNet training)
transform = T.Compose([
    T.Resize(256),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def extract_features(img_path):
    """Return the `model` output for the image at `img_path` as a NumPy array.

    The image is opened, converted to RGB, passed through `transform` and fed
    to `model` as a batch of one, without gradient tracking.
    """
    rgb = Image.open(img_path).convert("RGB")
    batch = transform(rgb).unsqueeze(0).to(device)
    with torch.no_grad():
        embedding = model(batch)
    # Squeeze the singleton dimensions (presumably leaving a 512-element vector).
    squeezed = embedding.squeeze()
    return squeezed.cpu().numpy()
# # Для train
# print("Extracting train features...")
# train_features = np.array([extract_features(p) for p in train_df["img_path"]])

# # Для test
# print("Extracting test features...")
# test_features = np.array([extract_features(p) for p in test_df["img_path"]])

import os
import numpy as np
from tqdm import tqdm

EMBEDDINGS_DIR = "embeddings"
os.makedirs(EMBEDDINGS_DIR, exist_ok=True)

train_emb_path = os.path.join(EMBEDDINGS_DIR, "train_embeddings.npy")
test_emb_path = os.path.join(EMBEDDINGS_DIR, "test_embeddings.npy")


def _load_or_extract_embeddings(cache_path, img_paths, split_name):
    """Return image embeddings for `img_paths`, reusing the .npy cache when valid.

    Args:
        cache_path: Location of the cached array.
        img_paths: Image paths, one per data row.
        split_name: "train" or "test"; only used in the progress messages.

    Returns:
        An array with one embedding row per entry of `img_paths`.
    """
    if os.path.exists(cache_path):
        print(f"Loading cached {split_name} embeddings...")
        cached = np.load(cache_path)
        # A cache written for a different number of rows (e.g. after the data
        # was subsampled) would silently misalign features and targets.
        if len(cached) == len(img_paths):
            return cached
        print(
            f"Cached {split_name} embeddings have {len(cached)} rows, "
            f"expected {len(img_paths)}; re-extracting."
        )
    print(f"Extracting {split_name} features...")
    features = np.array([extract_features(p) for p in tqdm(img_paths)])
    np.save(cache_path, features)
    print(f"{split_name.capitalize()} embeddings saved to {cache_path}")
    return features


# --- Train embeddings ---
train_features = _load_or_extract_embeddings(train_emb_path, train_df["img_path"], "train")

# --- Test embeddings ---
test_features = _load_or_extract_embeddings(test_emb_path, test_df["img_path"], "test")
# Every tabular column used by this variant. All of them are later passed to
# CatBoost as categorical features.
meta_features = ['equipment', 'body_type', 'drive_type', 'engine_type',
       'doors_number', 'color', 'pts', 'audiosistema', 'diski',
       'electropodemniki', 'fary', 'salon', 'upravlenie_klimatom',
       'usilitel_rul', 'steering_wheel', 'crashes_count', 'owners_count',
       'mileage', 'latitude', 'longitude', 'aktivnaya_bezopasnost_mult',
       'audiosistema_mult', 'shini_i_diski_mult', 'electroprivod_mult',
       'fary_mult', 'multimedia_navigacia_mult', 'obogrev_mult',
       'pamyat_nastroek_mult', 'podushki_bezopasnosti_mult',
       'pomosh_pri_vozhdenii_mult', 'protivoygonnaya_sistema_mult',
       'salon_mult', 'upravlenie_klimatom_mult']
# The next two bare expressions are notebook inspection leftovers. They have no
# effect when this code runs as a script.
train_features
train_df[meta_features].head(3)
# X_train = train_features
y_train = train_df["price_TARGET"].values

# Fill gaps with "0" and stringify every meta column, numeric ones included.
for col in tqdm(meta_features):
    # if train_df[col].dtype == 'object':
    train_df[col] = train_df[col].fillna("0").astype(str)
    test_df[col] = test_df[col].fillna("0").astype(str) 

import pandas as pd
import numpy as np

# 1. Turn the image embeddings into a DataFrame with named columns
num_feature_names = [f'feat_{i}' for i in range(train_features.shape[1])]
train_num_df = pd.DataFrame(train_features, columns=num_feature_names)

# 2. Process the meta features (lists -> comma-joined strings)
# NOTE(review): the columns were already stringified above, so the list and
# missing-value branches of this lambda look unreachable here. Confirm.
meta_df = train_df[meta_features].copy()
for col in meta_df.columns:
    meta_df[col] = meta_df[col].apply(lambda x: ', '.join(str(i) for i in x) if isinstance(x, list) else str(x) if pd.notna(x) else '')

# 3. Combine embeddings and meta features
X_train = pd.concat([train_num_df, meta_df], axis=1)
from catboost import CatBoostRegressor

# Train the regressor on the full training set (no validation split)
cat_model = CatBoostRegressor(
    iterations=1000,
    # learning_rate=0.01,
    # depth=6,
    loss_function='MAE',
    verbose=100,
    random_seed=42,
    cat_features=meta_features,
    eval_metric='MAE'
    
)

cat_model.fit(X_train, y_train)
# 1. Turn the test image embeddings into a DataFrame with named columns
num_feature_names = [f'feat_{i}' for i in range(test_features.shape[1])]
test_num_df = pd.DataFrame(test_features, columns=num_feature_names)

# 2. Process the meta features the same way as for train (lists -> comma-joined strings)
meta_df = test_df[meta_features].copy()
for col in meta_df.columns:
    meta_df[col] = meta_df[col].apply(lambda x: ', '.join(str(i) for i in x) if isinstance(x, list) else str(x) if pd.notna(x) else '')

# 3. Combine embeddings and meta features
X_test = pd.concat([test_num_df, meta_df], axis=1)
preds = cat_model.predict(X_test)

# Build the submission: one predicted target per test ID
submission = pd.DataFrame({
    "ID": test_df["ID"],
    "target": preds
})

# A single variable holds the file name, so the confirmation message always
# names the file that was actually written (it used to say "submission.csv").
submission_path = "submissionV15.csv"
submission.to_csv(submission_path, index=False)
print(f"✅ Submission saved to {submission_path}")