In [1]:
# !pip install torch
# qwen-vl-utils transformers
# !pip install -U git+https://github.com/huggingface/transformers

In [1]:
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from vision_process import process_vision_info

  from .autonotebook import tqdm as notebook_tqdm


## Загрузка модели и процессора

In [3]:
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", 
    # "models--Qwen--Qwen2-VL-2B-Instruct/snapshots/aca78372505e6cb469c4fa6a35c60265b00ff5a4/",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    # device_map="auto"
)

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

Loading checkpoint shards: 100%|██████████████████| 2/2 [00:09<00:00,  4.68s/it]


## Функции для обработки видео и ответов на вопросы

In [4]:
def process_video_and_answer(image_path, question, max_new_tokens=128):
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image_path,
                    "max_pixels": 360 * 420,
                    "fps": 1.0,
                },
                {"type": "text", "text": question},
            ],
        }
    ]
    
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt"
    )
    # inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    
    return output_text[0]

def answer_multiple_choice(image_path, question, choices):
    base_question = f"{question}\nChoices:\n"
    for i, choice in enumerate(choices, 1):
        base_question += f"{i}. {choice}\n"
    base_question += "Please select the most appropriate answer by number."
    
    answer = process_video_and_answer(image_path, base_question)
    try:
        selected_number = int(answer.strip())
        return choices[selected_number - 1]
    except:
        return answer  # Возвращаем полный ответ, если не удалось извлечь номер

def generate_video_description(image_path):
    return process_video_and_answer(image_path, "Describe this video in detail.")

## Обработка датасета

In [5]:
def process_dataset(dataset):
    results = []
    for item in dataset:
        task_id = item['task_id']
        task_type = item['task_type']
        image_path = item['image']
        
        if task_type == 'qa':
            question = item['question']
            choices = [choice['choice'] for choice in item['choices']]
            answer = answer_multiple_choice(image_path, question, choices)
        elif task_type == 'captioning':
            answer = generate_video_description(image_path)
        
        results.append({
            'task_id': task_id,
            'answer': answer
        })
    
    return results

## Пример использования

In [6]:
# Пример датасета (замените на реальные данные)
sample_dataset = [
    {
        'task_id': 1,
        'task_type': 'qa',
        'image': '0b9649438a916859.jpg',
        'question': 'What on the image?',
        'choices': [
            {'choice_id': 1, 'choice': 'Accordion'},
            {'choice_id': 2, 'choice': 'Brokkoli'},
            {'choice_id': 3, 'choice': 'Hat'},
        ]
    },
    {
        'task_id': 2,
        'task_type': 'captioning',
        'image': '0b9649438a916859.jpg',
    }
]

results = process_dataset(sample_dataset)
for result in results:
    print(f"Task ID: {result['task_id']}")
    print(f"Answer: {result['answer']}")
    print()

Task ID: 1
Answer: Accordion

Task ID: 2
Answer: The video depicts a street scene in Berlin, featuring a man playing an accordion. The man is positioned in the foreground, smiling and looking towards the camera. He is wearing a flat cap and a suit, and he is holding an accordion with both hands. The accordion has a traditional design with a large, rectangular body and a series of buttons along the top edge.

In the background, there are several people sitting on benches, engaged in conversation or enjoying the surroundings. The setting appears to be a public square or a pedestrian area, with trees lining the street and a prominent structure in the background that resembles the Brandenburg Gate, a famous landmark



In [7]:
import os

In [8]:
os.listdir('dataset/')

['Insect',
 'Hippopotamus',
 'Spatula',
 'Apple',
 'Helmet',
 'Skull',
 'Lipstick',
 'Sparrow',
 'Scarf',
 'Jet ski',
 'Magpie',
 'Cat',
 'Rhinoceros',
 'Pancake',
 'Limousine',
 'Ant',
 'Jacket',
 'Seahorse',
 'Pear',
 'Piano',
 'Cello',
 '.DS_Store',
 'Frying pan',
 'Aircraft',
 'Belt',
 'Bow and arrow',
 'Wrench',
 'Alarm clock',
 'Wok',
 'Microwave oven',
 'Goldfish',
 'Whiteboard',
 'Wine rack',
 'Harp',
 'Accordion',
 'Zebra',
 'Camera',
 'Cucumber',
 'Alpaca',
 'Wheel',
 'Cosmetics',
 'Honeycomb',
 'Ambulance',
 'Fedora',
 'Goat',
 'Lily',
 'Toilet paper',
 'Parking meter',
 'Tap',
 'Earrings',
 'Vase',
 'Glasses',
 'Submarine',
 'Snowboard',
 'Christmas tree',
 'Cassette deck',
 'Tea',
 'Glove',
 'Coin',
 'Woodpecker',
 'Airplane',
 'Ipod',
 'Worm',
 'Animal',
 'Binoculars',
 'Isopod',
 'Invertebrate',
 'Monkey',
 'Whisk',
 'Flashlight',
 'Broccoli',
 'Sombrero',
 'Spoon',
 'Plastic bag',
 'Adhesive tape',
 'Bread',
 'Lighthouse',
 'Hat',
 'Rabbit',
 'Artichoke',
 'Bathtub',
 '

In [None]:
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from pathlib import Path
import numpy as np
from annoy import AnnoyIndex
import pickle
import os
from PIL import Image
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

class VLMImageSearch:
    def __init__(self):
        print("Loading Qwen-VL model...")
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2-VL-2B-Instruct"
        )
        self.processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)
        self.model.eval()
        print(f"Model loaded successfully on {self.device}!")

    def get_embedding(self, image_path):
        try:
            # Загрузка и предобработка изображения
            image = Image.open(image_path).convert('RGB')
            
            # Подготовка входных данных для модели
            inputs = self.processor(
                text=["Describe this image."],  # Используем простой промпт
                images=[image],
                return_tensors="pt",
                padding=True
            ).to(self.device)

            with torch.no_grad():
                # Получаем эмбеддинги из последнего hidden state
                outputs = self.model(**inputs, output_hidden_states=True)
                # Берем hidden states последнего слоя vision encoder
                vision_hidden_states = outputs.hidden_states[-1]
                # Используем среднее значение по токенам как эмбеддинг изображения
                embedding = vision_hidden_states.mean(dim=1).squeeze().cpu().numpy()
                
            # Нормализация эмбеддинга
            embedding = embedding / np.linalg.norm(embedding)
            return embedding
            
        except Exception as e:
            print(f"Error processing {image_path}: {str(e)}")
            return None

def process_dataset(dataset_path, encoder, save_dir="./data"):
    dataset_path = Path(dataset_path)
    
    embeddings_dict = {}
    file_mapping = {}
    class_mapping = {}
    reverse_class_mapping = {}
    class_stats = defaultdict(int)
    
    idx = 0
    print("Processing dataset...")
    
    for class_dir in tqdm(list(dataset_path.iterdir())):
        if class_dir.is_dir() and not class_dir.name.startswith('.'):
            class_name = class_dir.name
            for image_file in tqdm(class_dir.glob("*.*")):
                if image_file.suffix.lower() in ['.jpg', '.jpeg', '.png']:
                    embedding = encoder.get_embedding(str(image_file))
                    if embedding is not None:
                        embeddings_dict[idx] = embedding
                        file_mapping[idx] = image_file.stem
                        class_mapping[idx] = class_name
                        reverse_class_mapping[image_file.stem] = class_name
                        class_stats[class_name] += 1
                        idx += 1
    
    print(f"\nTotal images processed: {idx}")
    print("\nClass distribution:")
    for class_name, count in class_stats.items():
        print(f"{class_name}: {count} images")
    
    os.makedirs(save_dir, exist_ok=True)
    with open(f"{save_dir}/processed_data.pkl", "wb") as f:
        pickle.dump({
            'embeddings': embeddings_dict,
            'file_mapping': file_mapping,
            'class_mapping': class_mapping,
            'reverse_class_mapping': reverse_class_mapping,
            'class_stats': dict(class_stats)
        }, f)
    
    return embeddings_dict, file_mapping, class_mapping, reverse_class_mapping

def build_index(embeddings_dict, save_dir="./data"):
    first_embedding = next(iter(embeddings_dict.values()))
    embedding_dim = len(first_embedding)
    
    index = AnnoyIndex(embedding_dim, 'angular')
    
    print("Building index...")
    for idx, embedding in embeddings_dict.items():
        index.add_item(idx, embedding)
    
    print("Building index with 100 trees...")
    index.build(100)
    index.save(f"{save_dir}/image_index.ann")
    
    return index

def find_similar(query_image_path, index, encoder, file_mapping, class_mapping, n_results=10):
    query_embedding = encoder.get_embedding(query_image_path)
    if query_embedding is None:
        return []
    
    n_candidates = min(n_results * 3, len(file_mapping))
    similar_idx, distances = index.get_nns_by_vector(
        query_embedding, n_candidates, include_distances=True)
    
    filtered_results = []
    seen_classes = set()
    
    for idx, dist in zip(similar_idx, distances):
        class_name = class_mapping[idx]
        if len(filtered_results) < n_results:
            if class_name not in seen_classes:
                filtered_results.append(file_mapping[idx])
                seen_classes.add(class_name)
    
    while len(filtered_results) < n_results and similar_idx:
        idx = similar_idx[len(filtered_results)]
        filtered_results.append(file_mapping[idx])
    
    return filtered_results

def calculate_map10(predictions, true_classes):
    ap_scores = []
    
    for query_image, recommended_images in predictions.items():
        if query_image not in true_classes:
            continue
            
        true_class = true_classes[query_image]
        
        relevance = []
        for rec_image in recommended_images[:10]:
            rec_class = true_classes.get(rec_image)
            relevance.append(1 if rec_class == true_class else 0)
            
        precision_at_k = []
        relevant_count = 0
        
        for k, rel in enumerate(relevance, 1):
            if rel == 1:
                relevant_count += 1
                precision_at_k.append(relevant_count / k)
                
        ap = sum(precision_at_k) / min(10, sum(relevance)) if sum(relevance) > 0 else 0
        ap_scores.append(ap)
    
    map10 = sum(ap_scores) / len(ap_scores) if ap_scores else 0
    return map10

def evaluate_recommendations(test_dir, index, encoder, file_mapping, class_mapping, 
                           reverse_class_mapping, output_file="submission.csv"):
    test_path = Path(test_dir)
    results = []
    predictions = {}
    
    print("Generating recommendations for test images...")
    
    for image_file in tqdm(list(test_path.glob("*.*"))):
        if image_file.suffix.lower() in ['.jpg', '.jpeg', '.png']:
            similar_images = find_similar(
                str(image_file), index, encoder, file_mapping, 
                class_mapping, n_results=10
            )
            
            if similar_images:
                recs = ",".join(similar_images)
                results.append({
                    'image': image_file.stem,
                    'recs': f'"{recs}"'
                })
                predictions[image_file.stem] = similar_images
    
    df = pd.DataFrame(results)
    df.to_csv(output_file, index=False)
    print(f"\nSubmission saved to {output_file}")
    
    test_classes = {}
    for image_stem in predictions.keys():
        class_dir = Path(str(test_path / image_stem)).parent.name
        test_classes[image_stem] = class_dir
    
    if test_classes:
        map10 = calculate_map10(predictions, {**reverse_class_mapping, **test_classes})
        print(f"\nMAP@10: {map10:.4f}")
    
    return predictions, map10

if __name__ == "__main__":
    # Инициализация поисковой системы
    encoder = VLMImageSearch()
    
    # Обработка датасета
    dataset_path = "dataset"
    embeddings_dict, file_mapping, class_mapping, reverse_class_mapping = process_dataset(
        dataset_path, encoder
    )
    
    # Создание индекса
    index = build_index(embeddings_dict)
    
    # Оценка на тестовом наборе
    test_dir = "test"
    predictions, map10 = evaluate_recommendations(
        test_dir, index, encoder, file_mapping, 
        class_mapping, reverse_class_mapping
    )

Loading Qwen-VL model...


Loading checkpoint shards: 100%|██████████████████| 2/2 [00:10<00:00,  5.49s/it]


Model loaded successfully on cpu!
Processing dataset...


  0%|                                                   | 0/106 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1it [00:17, 17.71s/it][A
2it [00:32, 15.92s/it][A
3it [00:46, 15.01s/it][A
4it [00:58, 13.82s/it][A
5it [01:14, 14.61s/it][A
6it [01:29, 14.90s/it][A
7it [01:42, 14.22s/it][A
8it [01:57, 14.29s/it][A
9it [02:14, 15.34s/it][A
10it [02:25, 13.96s/it][A
11it [02:54, 18.54s/it][A
12it [03:17, 19.95s/it][A
13it [03:28, 17.30s/it][A
14it [03:45, 17.01s/it][A
15it [04:01, 16.73s/it][A
16it [04:17, 16.61s/it][A
17it [04:36, 17.22s/it][A
18it [04:47, 15.36s/it][A
19it [05:03, 15.67s/it][A
20it [05:18, 15.42s/it][A
21it [05:35, 15.94s/it][A
22it [05:57, 17.67s/it][A
23it [06:03, 14.06s/it][A
24it [06:19, 14.76s/it][A
25it [06:31, 13.82s/it][A
26it [06:45, 14.02s/it][A
27it [06:59, 13.90s/it][A
28it [07:27, 18.28s/it][A
29it [07:41, 16.97s/it][A
30it [07:52, 15.16s/it][A
31it [08:21, 19.24s/it][A
32it [08:32, 16.83s/it][A
33it [08:49, 17.04s/it][A
34it [09:01, 15

In [12]:
print("Loading Qwen-VL model...")
model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()
print(f"Model loaded successfully on {device}!")

Loading Qwen-VL model...


Loading checkpoint shards: 100%|██████████████████| 2/2 [00:04<00:00,  2.08s/it]


Model loaded successfully on cpu!


In [14]:
# Загрузка изображения
image = Image.open('0b9649438a916859.jpg').convert('RGB')

# Предобработка
inputs = processor(
    text=["Describe this image."],
    images=[image],
    return_tensors="pt",
    padding=True
).to(device)

print("Input shape:", {k: v.shape for k, v in inputs.items()})

Input shape: {'input_ids': torch.Size([1, 4]), 'attention_mask': torch.Size([1, 4]), 'pixel_values': torch.Size([3552, 1176]), 'image_grid_thw': torch.Size([1, 3])}


In [18]:
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)
    
print("\nДоступные атрибуты outputs:")
for attr in dir(outputs):
    if not attr.startswith('_'):
        print(attr)
        
print("\nТипы и размеры hidden states:")
for i, hidden_states in enumerate(outputs.hidden_states):
    print(f"Layer {i}: shape = {hidden_states.shape}")


Доступные атрибуты outputs:
attentions
clear
copy
fromkeys
get
hidden_states
items
keys
logits
loss
move_to_end
past_key_values
pop
popitem
rope_deltas
setdefault
to_tuple
update
values

Типы и размеры hidden states:
Layer 0: shape = torch.Size([1, 4, 1536])
Layer 1: shape = torch.Size([1, 4, 1536])
Layer 2: shape = torch.Size([1, 4, 1536])
Layer 3: shape = torch.Size([1, 4, 1536])
Layer 4: shape = torch.Size([1, 4, 1536])
Layer 5: shape = torch.Size([1, 4, 1536])
Layer 6: shape = torch.Size([1, 4, 1536])
Layer 7: shape = torch.Size([1, 4, 1536])
Layer 8: shape = torch.Size([1, 4, 1536])
Layer 9: shape = torch.Size([1, 4, 1536])
Layer 10: shape = torch.Size([1, 4, 1536])
Layer 11: shape = torch.Size([1, 4, 1536])
Layer 12: shape = torch.Size([1, 4, 1536])
Layer 13: shape = torch.Size([1, 4, 1536])
Layer 14: shape = torch.Size([1, 4, 1536])
Layer 15: shape = torch.Size([1, 4, 1536])
Layer 16: shape = torch.Size([1, 4, 1536])
Layer 17: shape = torch.Size([1, 4, 1536])
Layer 18: shape = 

In [None]:
# Ячейка 1: Импорты и настройка
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from PIL import Image
import numpy as np

# Ячейка 2: Загрузка модели и процессора
def load_model_and_processor():
    print("Loading Qwen-VL model...")
    model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()
    print(f"Model loaded successfully on {device}!")
    return model, processor, device

# Ячейка 3: Тестирование на одном изображении
def process_single_image(model, processor, device, image_path):
    # Загрузка изображения
    image = Image.open(image_path).convert('RGB')
    
    # Предобработка
    inputs = processor(
        text=["Describe this image."],
        images=[image],
        return_tensors="pt",
        padding=True
    ).to(device)
    
    print("Input shape:", {k: v.shape for k, v in inputs.items()})
    return inputs

# Ячейка 4: Получение и анализ hidden states
def get_hidden_states(model, inputs):
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
        
    print("\nДоступные атрибуты outputs:")
    for attr in dir(outputs):
        if not attr.startswith('_'):
            print(attr)
            
    print("\nТипы и размеры hidden states:")
    for i, hidden_states in enumerate(outputs.encoder_hidden_states):
        print(f"Layer {i}: shape = {hidden_states.shape}")
    
    return outputs

# Ячейка 5: Извлечение эмбеддинга
def extract_embedding(outputs):
    # Берем последний слой hidden states
    last_hidden_states = outputs.encoder_hidden_states[-1]
    print("\nРазмерность последнего слоя:", last_hidden_states.shape)
    
    # Усредняем по токенам
    embedding = last_hidden_states.mean(dim=1).squeeze().cpu().numpy()
    print("Размерность эмбеддинга:", embedding.shape)
    
    # Нормализация
    embedding = embedding / np.linalg.norm(embedding)
    print("Норма эмбеддинга:", np.linalg.norm(embedding))
    
    return embedding

# Ячейка 6: Полный пайплайн для тестирования
def test_embedding_pipeline(image_path):
    # Загрузка модели
    model, processor, device = load_model_and_processor()
    
    # Обработка изображения
    inputs = process_single_image(model, processor, device, image_path)
    
    # Получение hidden states
    outputs = get_hidden_states(model, inputs)
    
    # Извлечение эмбеддинга
    embedding = extract_embedding(outputs)
    
    return embedding

# Ячейка 7: Запуск тестирования
image_path = "dataset/Accordion/01cc22eb34653a82.jpg"  # Укажите путь к тестовому изображению
embedding = test_embedding_pipeline(image_path)

# Ячейка 8: Визуализация эмбеддинга
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.plot(embedding)
plt.title('Embedding Values')
plt.xlabel('Dimension')
plt.ylabel('Value')

plt.subplot(1, 2, 2)
plt.hist(embedding, bins=50)
plt.title('Embedding Distribution')
plt.xlabel('Value')
plt.ylabel('Count')
plt.tight_layout()
plt.show()