In [1]:
import re
import os
import itertools
import networkx as nx
import time
from IPython.display import clear_output
import numpy as np

In [2]:
episode_dir = "./episodes_files_raw/"
episode_names = sorted(os.listdir("episodes_files_raw/"))

## Create dictionary with Episode tags as keys and list of Pokemon in the Episode as value

In [4]:
episodes_with_pokemon_list = {}

for episode_tag in episode_names:
    episode_path = episode_dir + episode_tag
    episode_file = open(episode_path, encoding='utf-8').read()
    episode_pokemons_text = re.findall(r"(?<=Pokémon==)(.*)(?=Trivia)", episode_file)
    # following episodes: [0294]AG018;[0330]AG054;[0458]AG182;[0477]DP009;[0520]DP052
    # have different text formatting, hence an if workaround
    if len(episode_pokemons_text) == 0:
        episode_pokemons_text = re.findall(r"(?<=Pokémon ==)(.*)(?=Trivia)", episode_file)
    episode_pokemons_text = episode_pokemons_text[0]
    pokemons_in_episode = list(set(re.findall("(?<=\{\{[\w+][|])(.*?)(?=\})", episode_pokemons_text)))
    
    episodes_with_pokemon_list[episode_tag] = pokemons_in_episode

## Generate all possible edges from all the Pokemon who were together in the same episodes

In [5]:
all_edges = []
for episode_tag, pokemons in episodes_with_pokemon_list.items():
    # all pokemon pairs from the episode is a binomial of len(pokemons) and 2
    list_of_nodes_from_episode = list(itertools.combinations(sorted(pokemons), 2))
    all_edges += list_of_nodes_from_episode

### Read Pokemon list file to get all Pokemon names

In [6]:
pokemon_list_file = "./pokemon list.txt"
pokemon_list = []

pokemon_list = open(pokemon_list_file, encoding='utf-8').read().split("\n")[:-1] # last element is empty line

## Create a Graph

In [7]:
G = nx.Graph()
G.add_nodes_from(pokemon_list)

for i, pokemon_edge in enumerate(all_edges):
    pokemon1 = pokemon_edge[0]
    pokemon2 = pokemon_edge[1]
    
    if G.has_edge(pokemon1, pokemon2):
        G[pokemon1][pokemon2]['weight'] += 1
    else:
        G.add_edge(pokemon1, pokemon2, weight=1)
    # print current status    
    if (i+1) % round(0.01 * len(all_edges)) == 0:
        clear_output(wait=True)
        current_percent = (i+1) / round(0.01 * len(all_edges))
        print(f"{current_percent}% done ({i+1} out of {len(all_edges)})")
    
clear_output(wait=True)
print("100% done.")

100% done.


### Browsing Pokemon who met the most of other Pokemon

TODO: there is a double record for Charizard: "Charizard" and "Charizard|Charizard"; However, even with the fix Charizard met less Pokemon than Eevee

In [8]:
sorted(list(G.degree), key=lambda x: x[1], reverse=True)

[('Pikachu', 809),
 ('Meowth', 786),
 ('Wobbuffet', 725),
 ('Eevee', 618),
 ('Charizard', 580),
 ('Marill', 576),
 ('Psyduck', 565),
 ('Bulbasaur', 555),
 ('Oddish', 542),
 ('Growlithe', 532),
 ('Rotom', 528),
 ('Piplup', 526),
 ('Vulpix', 524),
 ('Wooper', 501),
 ('Magikarp', 500),
 ('Smeargle', 499),
 ('Jigglypuff', 497),
 ('Seviper', 497),
 ('Staryu', 495),
 ('Garchomp', 490),
 ('Munchlax', 484),
 ('Poliwag', 483),
 ('Butterfree', 482),
 ('Chansey', 480),
 ('Rattata', 478),
 ('Dedenne', 476),
 ('Caterpie', 475),
 ('Sudowoodo', 472),
 ('Mime Jr.', 468),
 ('Magnemite', 466),
 ('Aipom', 466),
 ('Corphish', 466),
 ('Tauros', 465),
 ('Squirtle', 464),
 ('Marowak', 460),
 ('Wingull', 460),
 ('Skitty', 460),
 ('Umbreon', 453),
 ('Fletchling', 452),
 ('Emolga', 450),
 ('Elekid', 447),
 ('Spearow', 441),
 ('Wigglytuff', 440),
 ('Bellsprout', 440),
 ('Gyarados', 440),
 ('Pidgey', 438),
 ('Persian', 437),
 ('Poliwhirl', 437),
 ('Exeggutor', 437),
 ('Teddiursa', 437),
 ('Makuhita', 436),
 ('Pac

### Average number of met Pokemon for all Pokemon

In [None]:
np.mean([x[1] for x in G.degree])