# データセットの作成

### ライブラリ等の読み込み

In [None]:
import pandas as pd
from PIL import Image
import numpy as np
import glob
from matplotlib import pylab as plt
from multiprocessing import Pool 

**念の為path確認**

In [None]:
# Print the current working directory (shell command run through IPython's `!`
# escape); the relative '../data/...' paths used below are resolved against it.
!pwd

### データセットの読み込み関数

type_dict : タイプ<->数値の対応表

`# load images` の部分で画像を読み込み，NumPy 配列に変換

`# load type` の部分でタイプを読み込み，`type_dict` を用いて数値に変換

return :

- X : 画像（データ）, X.shape -> (1602, 215, 215, 4)
- Y : タイプ(ラベル), Y.shape -> (1602,)


ポケモン1匹につき同じ画像を2枚割り当てている．これは，タイプが1つのポケモンと2つのポケモンがいる為である．ラベルの付け方は以下の通り．
- １タイプのポケモン : 2つの画像に同じタイプ
- ２タイプのポケモン : ２つの画像にそれぞれ別タイプ

In [None]:
def load_data():
    """Build the image/label dataset for Pokemon type classification.

    Every image is used twice: the first copy is labelled with the
    Pokemon's ``Type1`` and the second copy with its ``Type2``. Pokemon
    without a ``Type2`` reuse ``Type1`` for the second copy.

    Returns:
        tuple: ``(X, Y)``. ``X`` is the image list returned by
        ``_read_images()`` stacked on top of itself with ``np.vstack``.
        ``Y`` is a NumPy array holding the ``Type1`` labels followed by the
        ``Type2`` labels, encoded through ``type_dict`` (type names missing
        from ``type_dict`` are mapped to NaN by ``Series.map``).
    """
    # Type name -> numeric label.
    type_dict = {
        "Normal": 0,
        "Fire": 1,
        "Water": 2,
        "Electric": 3,
        "Grass": 4,
        "Ice": 5,
        "Fighting": 6,
        "Poison": 7,
        "Ground": 8,
        "Flying": 9,
        "Psychic": 10,
        "Bug": 11,
        "Rock": 12,
        "Ghost": 13,
        "Dragon": 14,
        "Dark": 15,
        "Steel": 16,
        "Fairy": 17,
    }

    # load images: stack the image set on itself so each image appears twice.
    images = _read_images()
    X = np.vstack((images, images))

    # load type: keep only the first row for each Pokemon number.
    df = pd.read_csv('../data/Pokemon.csv', sep=',')
    df = df.drop_duplicates(subset='Number').reset_index(drop=True)

    # Select the columns by name rather than by position so that a change in
    # the CSV column order cannot silently mix up the labels.
    primary = df['Type1']
    secondary = df['Type2'].fillna(primary)

    # Only the first 801 rows are used.
    # NOTE(review): presumably 801 is the number of image files (the notes
    # above expect 1602 = 2 * 801 samples) -- confirm against ../data/images.
    labels = pd.concat([primary[:801], secondary[:801]], ignore_index=True)
    Y = labels.map(type_dict).values

    return X, Y

def _read_image(path):
    """Open the image file at ``path`` and return its pixels as a NumPy array.

    Args:
        path: Location of the image file, passed to ``PIL.Image.open``.

    Returns:
        numpy.ndarray: The result of ``np.array`` applied to the opened image.
    """
    # The context manager guarantees the underlying file handle is closed
    # once the pixel data has been copied into the array.
    with Image.open(path) as img:
        return np.array(img)

def _read_images():
    """Load every file matching ``../data/images/*`` as a NumPy array.

    The paths are processed in sorted order and the files are read in
    parallel by a ``multiprocessing.Pool``.

    Returns:
        list: One array per file, in sorted-path order.
    """
    # NOTE(review): sorted() orders the paths lexicographically ("10.png"
    # comes before "2.png"). This presumably relies on zero-padded file
    # names so the image order matches the CSV row order -- confirm.
    image_paths = sorted(glob.glob("../data/images/*"))

    with Pool() as p:
        # The worker is `_read_image`; the previously used name `read_image`
        # is not defined in this file and raised NameError.
        arr = p.map(_read_image, image_paths)

    return arr

#### データセットの読み込み

In [None]:
# Build the dataset; the %time line magic reports how long load_data() takes.
%time X,Y = load_data()

In [None]:
# Inspect the dimensions of the loaded image array
# (the notes above expect (1602, 215, 215, 4)).
X.shape

タイプ一覧を用意する

In [None]:
# Type names in label order: the name at position i belongs to label i,
# following the same numbering as the type_dict inside load_data().
types = [
    "Normal", "Fire", "Water", "Electric", "Grass", "Ice",
    "Fighting", "Poison", "Ground", "Flying", "Psychic", "Bug",
    "Rock", "Ghost", "Dragon", "Dark", "Steel", "Fairy",
]

#### データの確認

In [None]:
# Store 10 images for each label (this cell is currently disabled)
# img_list = []
# for for_1 in range(18):
#     choice_idx = np.random.choice(np.where(Y == for_1)[0], 10, replace=False)
#     img_list.append(X[choice_idx])

In [None]:
# Visualize the data (this cell is currently disabled)
# for for_1 in range(18):
#     fig, ax = plt.subplots(1, 10, figsize=(18, 8))
#     for for_2 in range(10):
#         ax[for_2].imshow(img_list[for_1][for_2].reshape(215, 215, 4))
#         ax[for_2].set_title(types[for_1])
#         ax[for_2].axis('off')