In [276]:
# Load the necessary packages
import json, re, networkx as nx, numpy as np, os, sys, nltk
from tqdm import tqdm_notebook

### Data Analysis - Cleaning the HTML data

In [267]:
# Functions to use for cleaning the pokemon data. Used in context with the regex.
def my_replace(s):
    """Return the display text for a matched ``{{...}}`` template.

    ``s`` is indexed with ``s[0]`` to get the matched text (a regex match
    object when this is used as the ``re.sub`` callback).

    * Without a ``|`` in the match, the first two and last two characters
      (the surrounding braces) are stripped.
    * With a ``|``, only the text after the last ``|`` is kept, minus its
      final two characters.
    """
    matched = s[0]
    _, separator, tail = matched.rpartition("|")
    if separator:
        return tail[:-2]
    return matched[2:-2]

def my_replace2(s):
    """Return the display text for a matched ``[[...]]`` link.

    ``s`` is indexed with ``s[0]`` to get the matched text (a regex match
    object when this is used as the ``re.sub`` callback).

    * With a ``|`` in the match, the text after the last ``|`` is returned,
      minus its final two characters.
    * Otherwise the text following the first ``[[`` is returned, minus its
      final two characters.
    """
    matched = s[0]
    if "|" in matched:
        return matched.rsplit("|", 1)[1][:-2]
    return matched.split("[[")[1][:-2]

In [271]:
poke_biology = {}

# Loop through all the pokemon files
for file in tqdm_notebook(os.listdir("pokemon_files_raw/")):
    path = 'pokemon_files_raw/' + file
    # Read through a context manager so the file handle is closed immediately
    with open(path, encoding='utf-8') as handle:
        poke_file = handle.read()
    poke_name = file.split(']')[-1]                  # Save the name of the pokemon

    # The biology text is whatever sits between the "Biology==" heading and the
    # "==In the anime" heading (the section that tends to follow it).
    bio_sections = re.findall(r"(?<=Biology==)(.*)(?===In the anime)", poke_file)
    if not bio_sections:
        # No biology section could be located: report the pokemon and skip it
        print(poke_name)
        continue
    bio_text = bio_sections[0]

    # Replace {{...}} templates with their display text, then turn the literal
    # two-character "\n" sequences into spaces and drop any remaining backslashes
    bio_text = re.sub(r"{{[\wé|\s\( \)',-]+}}", my_replace, bio_text).replace("\\n", " ").replace("\\", "")

    # Same idea for [[...]] links
    bio_text = re.sub(r"\[\[[\w\s|é'\.#:!,-]+\]\]", my_replace2, bio_text)

    # Some pokemon contain <ref>...</ref> elements. The match is non-greedy so
    # that each reference is removed on its own; a greedy ".*" would also delete
    # all the biology text between the first <ref> and the last </ref>.
    bio_text = re.sub(r"<ref>.*?</ref>", "", bio_text)
    poke_biology[poke_name] = bio_text
    

HBox(children=(IntProgress(value=0, max=809), HTML(value='')))

Type



### Preparing the data - Tokenization

In [300]:
# Tokenize the biology description and exclude stopwords
from nltk.corpus import stopwords
poke_biology_tokens = {}
poke_biology_tokens_stemmed = {}
porter = nltk.PorterStemmer()

# Define the stopword set. Include punctuation and other symbols
stop_wrds = set(stopwords.words('english'))
stop_wrds.update(['.',',','!','?',':',';'])

# A token is kept only when it starts with an ASCII letter. The pattern is a raw
# string (no invalid "\A" escape in a normal literal) and is compiled once
# instead of being re-parsed for every token.
starts_with_letter = re.compile(r'\A[a-zA-Z]')

for pokemon in tqdm_notebook(poke_biology):
    bio_text = poke_biology[pokemon]
    poke_tokens = nltk.word_tokenize(bio_text)

    # Filter once: lower-case each token, drop stopwords and tokens that do not
    # start with a letter
    kept_tokens = []
    for word in poke_tokens:
        lowered = word.lower()
        if lowered not in stop_wrds and starts_with_letter.match(word):
            kept_tokens.append(lowered)
    poke_biology_tokens[pokemon] = kept_tokens

    # The stemmed version reuses the already filtered, lower-cased tokens
    poke_biology_tokens_stemmed[pokemon] = [porter.stem(word) for word in kept_tokens]

HBox(children=(IntProgress(value=0, max=808), HTML(value='')))




### Sentiment analysis of the biology description

In [308]:
import urllib, pandas as pd

# Get the LabMT lexicon to use for the sentiment analysis
url = 'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0026752.s001&type=supplementary'
df_sentiment = pd.read_csv(url, skiprows=2, header=None, sep='\t')

# The first parsed row holds the column names: promote it to the header,
# then discard that row and renumber the remaining rows from zero
df_sentiment.columns = df_sentiment.iloc[0]
df_sentiment = df_sentiment.iloc[1:].reset_index(drop=True)

# Convert the three happiness columns to numeric values
happiness_columns = ['happiness_rank', 'happiness_average', 'happiness_standard_deviation']
df_sentiment[happiness_columns] = df_sentiment[happiness_columns].apply(pd.to_numeric)
print(df_sentiment.head())

# Define a function to compute the sentiment of a list of tokens
def calculate_token_list_sentiment(tokens, lexicon=None):
    """Return the mean happiness score of the tokens found in the lexicon.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to score. Tokens missing from the lexicon are ignored; repeated
        tokens count once per occurrence.
    lexicon : mapping of str -> number, optional
        Word-to-happiness lookup. When omitted it is built from the ``word``
        and ``happiness_average`` columns of the global ``df_sentiment``
        (if a word is listed more than once there, the last row wins).

    Returns
    -------
    float or int
        Sum of the matched scores divided by the number of matched tokens,
        or ``-1`` when no token is present in the lexicon.
    """
    if lexicon is None:
        # Build the lookup once per call instead of converting the whole column
        # to a list and scanning it again for every single token.
        lexicon = dict(zip(df_sentiment['word'].tolist(),
                           df_sentiment['happiness_average'].tolist()))

    happines_sum = 0
    word_count = 0
    for word in tokens:
        if word in lexicon:
            word_count += 1
            happines_sum += lexicon[word]

    if word_count == 0:
        # Sentinel for "nothing could be scored"
        return -1
    return happines_sum / word_count

In [322]:
poke_sentiment = {}
poke_sentiment_stemmed = {}
# Score the plain and the stemmed token list of every pokemon in one pass
for pokemon, plain_tokens in tqdm_notebook(poke_biology_tokens.items()):
    poke_sentiment[pokemon] = calculate_token_list_sentiment(plain_tokens)
    poke_sentiment_stemmed[pokemon] = calculate_token_list_sentiment(
        poke_biology_tokens_stemmed[pokemon]
    )
    

HBox(children=(IntProgress(value=0, max=808), HTML(value='')))




In [372]:
# Loop through all the pokemon files
poke_types = {}
count = 0
for file in tqdm_notebook(os.listdir("pokemon_files_raw/")):
    count += 1
    path = 'pokemon_files_raw/' + file
    # Read through a context manager so the file handle is closed immediately
    with open(path, encoding='utf-8') as handle:
        poke_file = handle.read()
    poke_name = file.split(']')[-1]                  # Save the name of the pokemon

    # Find where the type is written in the files with regex
    head_matches = re.findall(r"{{PokémonPrevNext\/Head\|type=[\w+|=\s]+}}", poke_file)
    if not head_matches:
        # No type header in this file: skip the pokemon
        continue
    type_matches = re.findall(r"type=[\w\|=]+", head_matches[0])
    if not type_matches:
        # Header found but no parsable "type=..." part: skip the pokemon
        continue
    types = type_matches[0].split('=')

    # Check if pokemon has 1 or 2 types
    if len(types) == 2:
        # Single type: stored as a plain string (the parentheses in the original
        # code did not create a tuple)
        poke_type = types[1]
    elif len(types) == 3:
        # Two types: the middle piece still carries the next "|<key>" part,
        # so keep only what precedes the "|"
        poke_type = (types[1].split("|")[0], types[2])
    else:
        # Unexpected layout: skip instead of silently reusing the type that was
        # parsed for the previous pokemon
        continue
    poke_types[poke_name] = poke_type

HBox(children=(IntProgress(value=0, max=809), HTML(value='')))




### Extract the statistics of each Pokémon

In [430]:
stats = {'HP': [], 'Attack' : [], 'Defense' : [], 'SpAtk' : [], 'SpDef' : [], 'Speed' : []}

# Order of the capture groups in stat_pattern; group i belongs to stat_names[i]
stat_names = ('HP', 'Attack', 'Defense', 'SpAtk', 'SpDef', 'Speed')

# Use regex to locate the statistics with positive lookbehinds
stat_pattern = (r"((?<=HP=)[\s\d]+)|((?<=Attack=)[\s\d]+)|((?<=Defense=)[\s\d]+)"
                r"|((?<=SpAtk=)[\s\d]+)|((?<=SpDef=)[\s\d]+)|((?<=Speed=)[\s\d]+)")

for file in tqdm_notebook(os.listdir("pokemon_files_raw/")):
    path = 'pokemon_files_raw/' + file
    # Read through a context manager so the file handle is closed immediately
    with open(path, encoding='utf-8') as handle:
        poke_file = handle.read()
    poke_name = file.split(']')[-1]                  # Save the name of the pokemon

    # Each findall result is a 6-tuple with exactly one non-empty group; the
    # first six matches are expected to appear in stat_names order.
    poke_stat = re.findall(stat_pattern, poke_file)[:6]

    # Parse all six values before touching the lists, so that a failure cannot
    # leave the stat lists with different lengths.
    try:
        parsed = [int(poke_stat[position][position]) for position in range(len(stat_names))]
    except (IndexError, ValueError):
        # Fewer than six matches, or the matches are not in the expected order
        print(poke_name)
        continue

    for stat_name, value in zip(stat_names, parsed):
        stats[stat_name].append(value)

HBox(children=(IntProgress(value=0, max=809), HTML(value='')))

Type



### Putting it all together

In [437]:
# Create a pandas dataframe to store all the data for easier access to statistics
pokemon_names = list(poke_biology.keys())
stat_columns = ['HP', 'Attack', 'Defense', 'SpAtk', 'SpDef', 'Speed']

# The stat lists carry no pokemon names, so they can only be joined by position.
# Fail with an explicit message when their length does not match the name list.
# NOTE(review): equal lengths do not prove the rows line up if the loops skipped
# different pokemon - storing the stats per name would remove that risk.
for column in stat_columns:
    if len(stats[column]) != len(pokemon_names):
        raise ValueError(
            "stats[%r] has %d entries but there are %d pokemon with a biology text"
            % (column, len(stats[column]), len(pokemon_names))
        )

# Look the dict-based columns up by pokemon name instead of relying on every
# dict having the same keys in the same order; a missing entry becomes None.
data = {'Pokémon' : pokemon_names,
        'Sentiment' : [poke_sentiment.get(name) for name in pokemon_names],
        'Sentiment_stemmed' : [poke_sentiment_stemmed.get(name) for name in pokemon_names],
        'Type': [poke_types.get(name) for name in pokemon_names],
        'HP' : stats['HP'], 'Attack' : stats['Attack'], 'Defense' : stats['Defense'], 'SpAtk' :stats['SpAtk'],
        'SpDef' : stats['SpDef'], 'Speed' : stats['Speed']}

poke_df = pd.DataFrame(data = data ,columns=['Pokémon','Sentiment','Sentiment_stemmed','Type','HP','Attack','Defense','SpAtk','SpDef','Speed'])

In [439]:
# Write the combined table to disk as CSV (note: the target name has no ".csv" extension)
poke_df.to_csv('pokemon_data')

In [438]:
# List the entries of the current working directory
os.listdir()

['.ipynb_checkpoints',
 'download_data.ipynb',
 'Pokemon processing.ipynb',
 'pokemon_files_raw']