# **Face Recognition Project**

### Загрузка данных

In [43]:
import requests
from urllib.parse import urlencode
import numpy as np

In [2]:
base_url = 'https://cloud-api.yandex.net/v1/disk/public/resources/download?'
public_key = 'https://disk.yandex.ru/d/S8f03spLIA1wrw'

# Ask the Yandex Disk public API for a direct download link to the shared resource.
final_url = base_url + urlencode(dict(public_key=public_key))
response = requests.get(final_url, timeout=30)
# Fail on an HTTP error here instead of with a confusing KeyError on 'href' below.
response.raise_for_status()
download_url = response.json()['href']

# Stream the archive to disk chunk by chunk so the whole file is never held
# in memory; the context manager releases the connection when done.
with requests.get(download_url, stream=True, timeout=30) as download_response:
    download_response.raise_for_status()
    with open('downloaded_file.zip', 'wb') as f:
        for chunk in download_response.iter_content(chunk_size=1 << 20):
            f.write(chunk)

Разархивируем downloaded_file.zip

In [3]:
import zipfile

# Unpack the downloaded archive into ./extracted_files (relative to the
# current working directory). ZipFile opens in read mode by default.
with zipfile.ZipFile('downloaded_file.zip') as zip_ref:
    zip_ref.extractall(path='extracted_files')

Удалим ненужные файлы

In [4]:
import os

filename = 'downloaded_file.zip'

# Remove the archive if it is still on disk and report the deletion.
if os.path.exists(filename):
    os.remove(filename)
    print(f"{filename} deleted")

downloaded_file.zip deleted


In [5]:
import shutil

filename = '/content/extracted_files/__MACOSX'

# Remove the __MACOSX directory tree from the extraction folder if it exists
# and report the deletion.
# NOTE(review): this path is absolute while the archive is extracted to a
# relative 'extracted_files' directory - presumably the working directory is
# /content; confirm.
if os.path.exists(filename):
    shutil.rmtree(filename)
    print(f"{filename} deleted")

/content/extracted_files/__MACOSX deleted


Внутри распакованного архива (папка `celebA_train_500`):

- `celebA_imgs` — папка с выровненными картинками;
- `celebA_anno.txt` — файл с аннотацией: каждой картинке из `celebA_imgs` поставлен в соответствие её id;
- `celebA_train_split.txt` — файл со сплитом на train/val/test.

In [6]:
import pandas as pd

In [None]:
# Load the annotation file as a two-column table (image file name, label).
# The separator is a raw-string regex matching one or more whitespace
# characters: the previous '\s' was an invalid escape sequence in a normal
# string and split on exactly one whitespace character, so any run of
# spaces/tabs would have produced extra empty fields.
df_attrs = pd.read_csv(
    "/content/extracted_files/celebA_train_500/celebA_anno.txt",
    sep=r'\s+',
    names=['images', 'labels'],
    engine='python',
)
df_attrs

Unnamed: 0,images,labels
0,000001.jpg,0
1,000404.jpg,0
2,003415.jpg,0
3,004390.jpg,0
4,018062.jpg,0
...,...,...
12006,126297.jpg,499
12007,129725.jpg,499
12008,132679.jpg,499
12009,151415.jpg,499


In [7]:
from PIL import Image
import torch
from torch.utils.data import Dataset

In [22]:
# Names of the dataset splits accepted as the `mode` of CelebADataset.
data_modes = [
    "train",
    "val",
    "test",
]

In [23]:
# Read the annotation and split files as header-less, whitespace-separated
# tables (columns are labelled 0 and 1). The separator is written as a raw
# string because '\s' in a normal string literal is an invalid escape sequence.
df_anno = pd.read_csv(
    "/content/extracted_files/celebA_train_500/celebA_anno.txt",
    sep=r'\s+',
    header=None,
)
df_split = pd.read_csv(
    "/content/extracted_files/celebA_train_500/celebA_train_split.txt",
    sep=r'\s+',
    header=None,
)

In [77]:
# Small hand-picked sample of image file names.
ids = np.array([
    '000001.jpg',
    '000404.jpg',
    '101501.jpg',
])

In [80]:
# File names (column 0) of the rows whose split code (column 1) is 0.
df_split.loc[df_split[1] == 0, 0].values

array(['000001.jpg', '000404.jpg', '003415.jpg', ..., '087286.jpg',
       '089001.jpg', '101501.jpg'], dtype=object)

In [114]:
class CelebADataset(Dataset):
    """Dataset over one split (train / val / test) of a CelebA image folder.

    Both text files are read as header-less, whitespace-separated tables:
    column 0 is the image file name; column 1 is the label (annotation file)
    or the split code (split file).

    Args:
        img_dir: Directory that contains the image files.
        anno_file: Path to the annotation file (file name, label).
        split_file: Path to the split file (file name, split code).
        mode: One of "train", "val" or "test".
        transform: Optional callable applied to the cropped PIL image before
            it is returned (for example a conversion to a tensor). When it is
            None the cropped PIL image is returned unchanged.

    Raises:
        NameError: If ``mode`` is not one of the supported modes. The
            exception type is kept for backward compatibility.

    Attributes set on the instance: ``mode``, ``img_dir``, ``transform``,
    ``anno``, ``train_split``, ``img_ids`` (file names of the split),
    ``indices`` (file name -> position in ``img_ids``) and ``labels``
    (file name -> label).
    """

    # Split code used in column 1 of the split file for every supported mode.
    _SPLIT_CODES = {"train": 0, "val": 1, "test": 2}

    # Fixed crop window (rows, columns) applied to every loaded image.
    # NOTE(review): presumably chosen to keep the face region of the aligned
    # images - confirm against the image size of the dataset.
    _CROP_ROWS = slice(77, -41)
    _CROP_COLS = slice(45, -50)

    def __init__(self, img_dir, anno_file, split_file, mode, transform=None):
        valid_modes = list(self._SPLIT_CODES)
        if mode not in valid_modes:
            raise NameError(f"{mode} is not correct; correct modes: {valid_modes}")

        self.mode = mode
        self.img_dir = img_dir
        self.transform = transform
        # Raw strings: '\s' in a normal string literal is an invalid escape.
        self.anno = pd.read_csv(anno_file, sep=r'\s+', header=None)
        self.train_split = pd.read_csv(split_file, sep=r'\s+', header=None)

        # All three modes share the same logic and differ only in the code.
        in_split = self.train_split[1] == self._SPLIT_CODES[mode]
        self.img_ids = self.train_split.loc[in_split, 0].values
        self.indices = {img_id: idx for idx, img_id in enumerate(self.img_ids)}

        selected = self.anno[self.anno[0].isin(self.img_ids)]
        self.labels = dict(zip(selected[0], selected[1]))

    def __len__(self):
        """Return the number of images in the selected split."""
        return len(self.img_ids)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the image at position ``idx``.

        The image is loaded from ``img_dir``, cropped with the fixed window
        and, if a ``transform`` was supplied, passed through it.
        """
        img_id = self.img_ids[idx]
        img_path = os.path.join(self.img_dir, img_id)

        # Copy the pixels while the file is open so the handle is closed
        # deterministically instead of whenever the garbage collector runs.
        with Image.open(img_path) as img:
            pixels = np.array(img)

        image = Image.fromarray(pixels[self._CROP_ROWS, self._CROP_COLS])
        label = self.labels[img_id]
        if self.transform is not None:
            image = self.transform(image)
        return image, label

In [115]:
# Training split of the extracted CelebA-500 data.
tr_dataset = CelebADataset(
    mode="train",
    split_file="/content/extracted_files/celebA_train_500/celebA_train_split.txt",
    anno_file="/content/extracted_files/celebA_train_500/celebA_anno.txt",
    img_dir="/content/extracted_files/celebA_train_500/celebA_imgs",
)

In [116]:
from torch.utils.data import DataLoader

In [117]:
# Batch the training dataset 32 samples at a time (no shuffling requested).
# NOTE(review): tr_dataset yields PIL images; the default collate function
# cannot batch those, so iterating this loader presumably fails unless the
# images are converted to tensors or a custom collate_fn is supplied - confirm.
dataloader = DataLoader(dataset=tr_dataset, batch_size=32)