<img src="https://bit.ly/2VnXWr2" width="100" align="left">

# Project | API and Web Data Scraping

## Import Useful Libraries

In [1]:
import os
import numpy as np
import pandas as pd
import re
import math
import json
import requests
from pandas import json_normalize
from bs4 import BeautifulSoup

## Working On The API

### Getting the needed information from the API

In [2]:
url = "https://pikalytics.com/api/p/2020-01/ss-1760"

headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
}

response = requests.get(url, headers=headers)
results = response.json()
flattened_data = json_normalize(results)
pikalyticsDF = flattened_data

TypeError: 'bool' object is not iterable

In [3]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', -1)

  This is separate from the ipykernel package so we can avoid doing imports until


### Drop the columns I don't need and clean the remaining information

In [4]:
pikalyticsDF = pikalyticsDF.drop([
    'abilities', 'raw_count', 'items', 'spreads', 'moves', 'stats.hp',
    'stats.atk', 'stats.def', 'stats.spa', 'stats.spd', 'stats.spe', 'ss'
],
                                 axis=1)

NameError: name 'pikalyticsDF' is not defined

### As I had issues to properly cleaning the column team I will make a copy from it and treat it individually.

In [None]:
teams = pikalyticsDF["team"].tolist()

In [None]:
teamsDF = pd.DataFrame(teams)

In [None]:
teamsDF = teamsDF.drop([3, 4, 5, 6, 7, 8, 9, 10, 11], axis=1)

In [None]:
teamsDF = teamsDF.rename(
    columns={
        0: "Most common teammate",
        1: "2nd most common teammate",
        2: "3rd most common teammate"
    })
teamsDF.head()

In [None]:
teamsDF_First = teamsDF["Most common teammate"].tolist()
teamsDF_First = pd.DataFrame(teamsDF_First)
teamsDF_First["Most common teammate"] = teamsDF_First["pokemon"].astype(
    str) + " " + teamsDF_First["percent"].astype(
        str) + "% " + teamsDF_First["types"].astype(str)
teamsDF_First = teamsDF_First.drop(["pokemon", "percent", "types"], axis=1)

In [None]:
teamsDF_Second = teamsDF["2nd most common teammate"].tolist()
teamsDF_Second = pd.DataFrame(teamsDF_Second)
teamsDF_Second["2nd most common teammate"] = teamsDF_Second["pokemon"].astype(
    str) + " " + teamsDF_Second["percent"].astype(
        str) + "% " + teamsDF_Second["types"].astype(str)
teamsDF_Second = teamsDF_Second.drop(["pokemon", "percent", "types"], axis=1)

In [None]:
teamsDF_Third = teamsDF["3rd most common teammate"].tolist()
teamsDF_Third = pd.DataFrame(teamsDF_Third)
teamsDF_Third["3rd most common teammate"] = teamsDF_Third["pokemon"].astype(
    str) + " " + teamsDF_Third["percent"].astype(
        str) + "% " + teamsDF_Third["types"].astype(str)
teamsDF_Third = teamsDF_Third.drop(["pokemon", "percent", "types"], axis=1)

### Now I'll create new columns in the original dataset to add the information I wanted from the DF I cleaned separately

In [None]:
pikalyticsDF = pikalyticsDF.drop(["team"], axis=1)

In [None]:
pikalyticsDF["Most common teammates"] = teamsDF_First["Most common teammate"]
pikalyticsDF["2nd most common teammates"] = teamsDF_Second[
    "2nd most common teammate"]
pikalyticsDF["3rd most common teammates"] = teamsDF_Third[
    "3rd most common teammate"]

In [None]:
pikalyticsDF.head(5)

In [None]:
pikalyticsDF.loc[
    pikalyticsDF["Most common teammates"].isnull(),
    "Most common teammates"] = teamsDF_First["Most common teammate"]
pikalyticsDF.loc[
    pikalyticsDF["2nd most common teammates"].isnull(),
    "2nd most common teammates"] = teamsDF_Second["2nd most common teammate"]
pikalyticsDF.loc[
    pikalyticsDF["3rd most common teammates"].isnull(),
    "3rd most common teammates"] = teamsDF_Third["3rd most common teammate"]

### Here it is the clean pikalytics dataframe

In [None]:
pikalyticsDF.head()

## Working On The Web

### Getting the needed information from the web

In [None]:
url1 = "https://www.wikidex.net/wiki/Lista_de_Pok%C3%A9mon_con_sus_estad%C3%ADsticas_base"

In [None]:
html = requests.get(url1).content
soup = BeautifulSoup(html, "lxml")

### Cleaning the data we got and generating a dataframe

In [None]:
table = soup.find_all(
    "table", {"class": "tabpokemon sortable mergetable tablemanager"})[0]

In [None]:
rows = table.find_all("tr")
rows = [row.text.strip().split("\n") for row in rows]
rows = [list(filter(None, row)) for row in rows]
wikidexDF = pd.DataFrame(rows)

In [None]:
wikidexDF.head()

In [None]:
wikidexDF = wikidexDF.drop([11], axis=1)

In [None]:
wikidexDF = wikidexDF.drop([0], axis=0)
wikidexDF.head()

In [None]:
colnames = [
    "National Dex #", "Pokémon", "HP", "Attack", "Defense", "Special Attack",
    "Special Defense", "Speed", "Mean", "SD", "Total"
]

wikidexDF = pd.DataFrame(rows[1:], columns=colnames)

### Here it is the clean wikidex dataframe

In [None]:
wikidexDF.head()

## Results

In [None]:
pikalyticsDF.to_csv(
    "/Users/Miguel/Documents/GitHub/Ironhack exercises/Modulo 1/project-web/Your-code/Output/pikalyticsDF_clean.csv"
)
wikidexDF.to_csv(
    "/Users/Miguel/Documents/GitHub/Ironhack exercises/Modulo 1/project-web/Your-code/Output/wikidexDF_clean.csv"
)

# Bonus

## Combining Information

### Merging the resulting dataframes

In [None]:
Combined_DF = pd.merge(pikalyticsDF,
                       wikidexDF,
                       left_on="name",
                       right_on="Pokémon")

In [None]:
Combined_DF.head()

### Drop columns that add no information

In [None]:
Combined_DF.columns

In [None]:
Combined_DF = Combined_DF.drop(['name', 'Total'], axis=1)
Combined_DF.head()

### Reorder columns

In [None]:
column_order = [
    'National Dex #', 'Pokémon', 'types', 'HP', 'Attack', 'Defense',
    'Special Attack', 'Special Defense', 'Speed', 'Mean', 'SD', 'ranking',
    'percent', 'viability', 'Most common teammates',
    '2nd most common teammates', '3rd most common teammates'
]
Combined_DF = Combined_DF[column_order]
Combined_DF.head()

### Rename columns

In [None]:
Combined_DF.columns = [
    'National Dex #', 'Pokémon', 'Type', 'Base HP', 'Base Attack',
    'Base Defense', 'Base Special Attack', 'Base Special Defense',
    'Base Speed', 'Base Stats Mean', 'Base Stats SD', 'VGC 2020 Usage Rank',
    'VGC 2020 Usage %', 'Competitive viability', 'Most frequent teammate',
    '2nd most frequent teammate', '3rd most frequent teammate'
]

In [None]:
Combined_DF["Base HP"] = Combined_DF["Base HP"].astype("float64")
Combined_DF["Base Attack"] = Combined_DF["Base Attack"].astype("float64")
Combined_DF["Base Defense"] = Combined_DF["Base Defense"].astype("float64")
Combined_DF["Base Special Attack"] = Combined_DF["Base Special Attack"].astype(
    "float64")
Combined_DF["Base Special Defense"] = Combined_DF[
    "Base Special Defense"].astype("float64")
Combined_DF["Base Speed"] = Combined_DF["Base Speed"].astype("float64")

Combined_DF["Base Stats Mean"] = Combined_DF["Base Stats Mean"].str.replace(
    ',', '.').astype(float)
Combined_DF["Base Stats SD"] = Combined_DF["Base Stats SD"].str.replace(
    ',', '.').astype(float)

### This is the resulting combined dataframe

In [None]:
Combined_DF.to_csv(
    "/Users/Miguel/Documents/GitHub/Ironhack exercises/Modulo 1/project-web/Your-code/Output/Combined_DF_clean.csv"
)
Combined_Styler = Combined_DF.style.hide_index()

## Analyzing

### Look for outliers

In [None]:
# We will only consider lower outliers as the upper ones are basically VGC 2020 metagame's core
stats = Combined_DF.describe().transpose()
stats["IQR"] = stats["75%"] - stats["25%"]
outliers = pd.DataFrame(columns=Combined_DF.columns)
for col in stats.index:
    iqr = stats.at[col, 'IQR']
    cutoff = iqr * 1.5
    lower = stats.at[col, '25%'] - cutoff
    upper = stats.at[col, '75%'] + cutoff
    results = Combined_DF[(Combined_DF[col] < lower)].copy()
    results['Outlier'] = col
    outliers = outliers.append(results)

In [None]:
stats

### We clean and manipulate this new "outliers" dataframe 

In [None]:
outliers = outliers.drop([
    '2nd most frequent teammate', '3rd most frequent teammate',
    'Competitive viability', 'Most frequent teammate', 'National Dex #',
    'VGC 2020 Usage Rank'
],
                         axis=1)

In [None]:
column_order = [
    'Outlier', 'Pokémon', 'Type', 'Base Attack', 'Base Defense', 'Base HP',
    'Base Special Attack', 'Base Special Defense', 'Base Speed',
    'Base Stats Mean', 'Base Stats SD', 'VGC 2020 Usage %'
]
outliers = outliers[column_order]

### Lower outliers = Pokémon that are very rarely used in VGC and we should be careful with

In [None]:
outliers.to_csv(
    "/Users/Miguel/Documents/GitHub/Ironhack exercises/Modulo 1/project-web/Your-code/Output/outliers_clean.csv"
)
outliers.head()

### Correlation between base stats mean and usage

In [None]:
print(Combined_DF["VGC 2020 Usage %"].corr(Combined_DF["Base Stats Mean"]))
print(outliers["VGC 2020 Usage %"].corr(outliers["Base Stats Mean"]))

It seems that "lower" outliers are more depandant on their base stats mean compared to the population overall but there's little difference so wou shoul try to focus on other variables. I personally think it could be their battle roles, but this information was not in any of the datasets I worked with or that were at least discarded candidates. 