In [56]:
#Image reading, writing, and plotting
import matplotlib.image as mpimg
import matplotlib.pyplot as plt

#Numpy
import numpy as np

#File management
import pandas as pd
import os
import json

#Extract Colors
import colorgram

#Random
import random

%matplotlib inline

In [57]:
# Load the Pokedex lookup table; a later cell reads pokedex[name]["num"] from it.
# JSON text is UTF-8, so the encoding is pinned instead of relying on the
# platform default (which is not UTF-8 on every operating system).
with open("pokedex.json", encoding="utf-8") as pokedex_file:
    pokedex = json.load(pokedex_file)

# Select which folders to convert (or all of them) and number of colors

In [58]:
data_folder = os.path.join(os.getcwd(), "data")

#Every visible (non-dot) sub-directory of the data folder, in sorted order
discovered_gen_folders = sorted(
    f for f in os.listdir(data_folder)
    if not f.startswith(".") and os.path.isdir(os.path.join(data_folder, f))
)

# !! ALTERNATIVE !!:
# List the folders to convert here, or set this to None to automatically
# convert every discovered folder.
selected_gen_folders = ["rby", "gsc"]

if selected_gen_folders is None:
    all_gen_folders = discovered_gen_folders
else:
    all_gen_folders = list(selected_gen_folders)

#Number of colors stored per image in the generated CSV files
num_colors = 5

print("folders to convert :: ", all_gen_folders)
print("number of colors in 1-hot Vector :: ", num_colors)

folders to convert ::  ['rby', 'gsc']
number of colors in 1-hot Vector ::  5


In [59]:
#Establish what the columns of each CSV row are going to be, and their defaults.
#NOTE: the column order (red, blue, green) is kept unchanged so the layout of
#the generated CSV files stays the same; only the values written are corrected.
row_keys = ["pokemon", "dex_number"]
for i in range(num_colors):
    row_keys.append("red"   + str(i))
    row_keys.append("blue"  + str(i))
    row_keys.append("green" + str(i))
for i in range(num_colors):
    row_keys.append("proportion" + str(i))
row_vals = [""] + [0] + ([0] * (4 * num_colors))


def _build_feature_row(image_path, image_name, dex_number):
    """Return one CSV row (a dict keyed by row_keys) for a single image.

    image_path: full path of the .png file handed to colorgram.
    image_name: value stored in the "pokemon" column.
    dex_number: value stored in the "dex_number" column.

    Color slots that colorgram cannot fill keep their default value of 0.
    """
    row = dict(zip(row_keys, row_vals))
    row["pokemon"] = image_name
    row["dex_number"] = dex_number

    #Ask for one extra color and drop the first (most prominent) one,
    #since it's the background
    colors = colorgram.extract(image_path, num_colors + 1)[1:]

    #An image with few distinct colors can yield fewer entries than requested,
    #so iterate over what was actually returned instead of indexing blindly.
    for i, color in enumerate(colors[:num_colors]):
        #colorgram reports the channels in (r, g, b) order
        red, green, blue = color.rgb
        row["red" + str(i)] = red
        row["green" + str(i)] = green
        row["blue" + str(i)] = blue
        row["proportion" + str(i)] = np.round(color.proportion, 4)
    return row


for gen_folder in all_gen_folders:
    print("***********************************************")
    print(gen_folder)
    print("***********************************************")
    data_gen_folder = os.path.join(data_folder, gen_folder)

    #Gather the pokemon images stored in this folder
    pkm_img_list = [pkm for pkm in os.listdir(data_gen_folder) if pkm.endswith(".png")]

    #Rows of the CSV file: one dictionary (name, dex number, colors) per image
    csv_list = []
    for pkm_img_str in pkm_img_list:
        #Pokemon's name without .png; this is the key used in the pokedex
        pkm_string = os.path.splitext(pkm_img_str)[0]
        csv_list.append(_build_feature_row(
            os.path.join(data_gen_folder, pkm_img_str),
            pkm_img_str,
            pokedex[pkm_string]["num"],
        ))

    df = pd.DataFrame(csv_list, columns=row_keys)
    df.sort_values("dex_number", inplace=True, ascending=True)
    #to_csv returns None when given a path, so there is nothing to keep
    df.to_csv(os.path.join(data_gen_folder, "aa_master_feature_list.csv"), encoding="utf-8", index=False)
    print("Completed generating the 1-Hot Vectors for folder :: ", gen_folder)

print("Finishing generating all indicated 1-hot CSV files")

***********************************************
rby
***********************************************
Completed generating the 1-Hot Vectors for folder ::  rby
***********************************************
gsc
***********************************************
Completed generating the 1-Hot Vectors for folder ::  gsc


# Generate Train/Test Set

In [92]:
data_folder = os.path.join(os.getcwd(), "data")


def _has_feature_list(folder_name):
    """True for a visible sub-folder of data_folder that holds aa_master_feature_list.csv."""
    folder_path = os.path.join(data_folder, folder_name)
    return (not folder_name.startswith(".")
            and os.path.isdir(folder_path)
            and os.path.isfile(os.path.join(folder_path, "aa_master_feature_list.csv")))


#Automatically picks up every folder that already has a feature list CSV
all_set_gen_folders = sorted(filter(_has_feature_list, os.listdir(data_folder)))
# !! ALTERNATIVE !!:
# Uncomment the below line to use only the specified folders
# all_set_gen_folders = ["xydex"]

print("Folders to be used :: ",all_set_gen_folders)

Folders to be used ::  ['gsc', 'rby', 'xydex']


In [93]:
#Settings that control how the train/dev/test split is generated
method = {
    #Split criteria; "gen" holds out one whole generation as the test set
    "method": "gen",
    #Generation number whose images form the test set
    "num": 7,
    #Probability (0.0 - 1.0) that a non-test image is put into the dev set
    "dev_prob": 0.0,
}

def putInDev(probability):
    """Randomly decide whether an image goes into the dev set.

    probability: chance of returning True; 0.0 never selects the image and
    1.0 always does, because random.random() draws from [0.0, 1.0).
    """
    draw = random.random()
    return probability > draw

In [97]:
#Inclusive pokedex number bounds of each generation
#(check bulbapedia, ya nerd)
GEN_DEX_BOUNDS = {
    1: (1, 151),
    2: (152, 251),
    3: (252, 386),
    4: (387, 493),
    5: (494, 649),
    6: (650, 721),
    7: (722, 809),
}

train_list = []
test_list = []
dev_list = []

dev_prob = method["dev_prob"]

#Optional: method["seed"] makes the random dev split reproducible.
#Without it the split is drawn exactly as before.
if method.get("seed") is not None:
    random.seed(method["seed"])

#Validate the settings once, before any file is read or written
if not all_set_gen_folders:
    raise ValueError("No folders were selected, so there is nothing to split")

if method["method"] == "gen":
    num = method["num"]
    if num not in GEN_DEX_BOUNDS:
        print("This generation doesn't exist at the time of the creation of this code")
        print("Please manually add the generation index to GEN_DEX_BOUNDS")
        raise ValueError("The Generation doesn't exist")
    #Note: bounds are inclusive
    min_bnd, max_bnd = GEN_DEX_BOUNDS[num]
else:
    #Previously an unknown method silently produced three empty CSV files
    raise ValueError("Unknown split method: " + str(method["method"]))

#Columns of the output files: the columns of every folder, in first-seen order
output_columns = []

for folder in all_set_gen_folders:
    gen_folder = os.path.join(data_folder, folder)

    #Read the master list and the feature list csv files of this folder
    gen_df = pd.read_csv(os.path.join(gen_folder, "aa_masterlist.csv"))
    hot_df = pd.read_csv(os.path.join(gen_folder, "aa_master_feature_list.csv"))
    #Check that gen_df and hot_df have the same length
    if(len(gen_df) != len(hot_df)):
        raise ValueError(gen_folder+" does not have equal row lengths in aa_master_feature_list.csv and aa_masterlist.csv")

    #Join the two files on the image file name rather than on row position:
    #several images can share one dex number, so two files that were each sorted
    #by dex_number are not guaranteed to list those images in the same order.
    #The dex_number of the master list is the one that is kept.
    new_df = gen_df.merge(hot_df.drop(columns=["dex_number"]),
                          on="pokemon", how="inner", validate="one_to_one")
    if len(new_df) != len(gen_df):
        raise ValueError(gen_folder+" does not list the same images in aa_master_feature_list.csv and aa_masterlist.csv")

    for column in new_df.columns:
        if column not in output_columns:
            output_columns.append(column)

    #Convert to a list of dictionaries for easy manipulation
    df_list = new_df.to_dict(orient="records")

    #Add the row to the respective train, test, or dev list
    for row in df_list:
        row["pokemon"] = os.path.join(gen_folder, row["pokemon"])
        if min_bnd <= row["dex_number"] <= max_bnd:
            test_list.append(row)
        elif putInDev(dev_prob):
            dev_list.append(row)
        else:
            train_list.append(row)


def _write_split(rows, file_name):
    """Sort rows by dex_number, write them to file_name inside data_folder, and return the frame."""
    split_df = pd.DataFrame(rows, columns=output_columns)
    split_df = split_df.sort_values("dex_number", ascending=True)
    split_df.to_csv(os.path.join(data_folder, file_name), encoding="utf-8", index=False)
    return split_df


#Convert the lists into pandas dataframes, and then write to the /data folder
train_df = _write_split(train_list, "Train_hot.csv")
test_df = _write_split(test_list, "Test_hot.csv")
dev_df = _write_split(dev_list, "Dev_hot.csv")

print("Succesfully finished generating Train_hot.csv, Dev_hot.csv and Test_hot.csv")
print("Train_hot.csv has ", len(train_list), "images")
print("Dev_hot.csv has ", len(dev_list), " images")
print("Test_hot.csv has ", len(test_list), " images")

Succesfully finished generating Train_hot.csv, Dev_hot.csv and Test_hot.csv
Train_hot.csv has  1336 images
Dev_hot.csv has  0  images
Test_hot.csv has  124  images


# Test a Dataset

In [88]:
# Sanity check on a single folder: combine its two CSV files side by side and
# report how many rows the result has.
# (Earlier spot checks, now disabled, printed rows 0, 177 and 752 of
#  aa_master_feature_list.csv and looked up the dex_number of "magmar.png".)
some_folder = os.path.join(os.getcwd(), "data", "xydex")

gen_df = pd.read_csv(os.path.join(some_folder, "aa_masterlist.csv"))
hot_df = pd.read_csv(os.path.join(some_folder, "aa_master_feature_list.csv"))

# Everything after the first two columns of the feature list
feature_columns = hot_df.iloc[:, 2:]
new_df = pd.concat([gen_df, feature_columns], axis=1)
print(new_df.shape[0])

1058
