In [1]:
# NumPy is numerical Python.
# Pandas is for data processing and CSV file I/O (e.g. pd.read_csv).
# Matplotlib and Seaborn are for data visualization.
import os 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Import the pokemon dataset from a CSV file into a DataFrame.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError, so
# "Inputs/pokemon.csv" must exist relative to the notebook's working directory
# before any later cell can run.
pokemon = pd.read_csv("Inputs/pokemon.csv")

FileNotFoundError: [Errno 2] File Inputs/pokemon.csv does not exist: 'Inputs/pokemon.csv'

In [None]:
# Display the loaded pokemon DataFrame as the cell output.
pokemon

In [None]:
# Build a small DataFrame from a dict that maps column names to their values.
d = dict(col1=[1, 2], col2=[3, 4])
df = pd.DataFrame(data=d)
df

In [None]:
# Rename the column labelled with a pound sign/hashtag ("#") to "Number".
# A bare "#" starts a comment in Python, which makes the original label awkward
# to refer to in later code.
# Only the column is renamed: the row index is left untouched so it keeps its
# original labels instead of being converted to strings.
pokemon = pokemon.rename(columns={"#": "Number"})
pokemon

In [None]:
# Load the combat (matchup) records from a CSV file; later cells read its
# First_pokemon, Second_pokemon and Winner columns.
combat = pd.read_csv("Inputs/combats.csv")

In [None]:
# Display the loaded combat DataFrame as the cell output.
combat

In [None]:
# Report the shape of both DataFrames.
print("Dimensions of pokemon:", pokemon.shape, sep="")
print("Dimensions of Combat: ", combat.shape, sep="")

In [None]:
# isna() and isnull() are the same check: True wherever a value is missing.
pd.isna(pokemon)

In [None]:
# Count the missing values in each column.
pokemon.isna().sum()

In [None]:
# Show the row(s) whose Name is missing.
pokemon.loc[pokemon['Name'].isna()]

In [None]:
# Print the names stored just before and just after row position 62.
# .iloc selects by position, so this works whatever the row index labels are
# (integer or string) instead of relying on an integer-key lookup.
print("This pokemon is before the missing pokemon: "+ pokemon['Name'].iloc[61])
print("This pokemon is after the missing pokemon: "+ pokemon['Name'].iloc[63])

In [None]:
# Fill in the missing name at row position 62.
# A single .loc assignment on the DataFrame replaces the chained
# pokemon['Name'][62] = ... form, which may write to a temporary copy
# (SettingWithCopyWarning) and depends on how integer keys are resolved
# against the row index.
pokemon.loc[pokemon.index[62], 'Name'] = "Primeape"

In [None]:
# The index is the matchup order; each value is the number of the pokemon that won that matchup.
combat_winner = combat.loc[:, 'Winner']
combat_winner  # a Series

In [None]:
# iloc selects rows by integer position: show the matchup stored at position 1.
combat.iloc[1]

In [None]:
# Count how many times each pokemon number appears in the Winner series,
# i.e. the total number of wins of each pokemon.
total_Wins = combat_winner.value_counts()
total_Wins

In [None]:
# Look up pokemon number 163 by row position 162.
# NOTE(review): assumes the pokemon number equals row position + 1 — confirm against the data.
pokemon.iloc[[162]]

In [None]:
# Group the matchups by winner; on its own this only displays the GroupBy object.
combat.groupby('Winner')

In [None]:
# Number of wins of each pokemon: count the matchup rows in each 'Winner' group.
# The result is indexed by winner and has one count per remaining column.
numberOfWins = combat.groupby('Winner').count()
numberOfWins

In [None]:
# Battles fought per pokemon, split by the slot it occupied in the matchup:
# countbyFirst for the "First" slot, countbySecond for the "Second" slot.
countbyFirst, countbySecond = (
    combat.groupby(slot).count() for slot in ('First_pokemon', 'Second_pokemon')
)

In [None]:
# Display the per-pokemon battle counts for the "First" slot.
countbyFirst

In [None]:
# Display the per-pokemon battle counts for the "Second" slot.
countbySecond

In [None]:
# The index holds the pokemon numbers that appear in the First_pokemon column.
countbyFirst.index

In [None]:
# Build the union of countbyFirst.index and countbySecond.index (an outer join
# of the two indexes): every pokemon number that appears in at least one battle.
union_index = countbyFirst.index.join(countbySecond.index, how = "outer")
union_index

In [None]:
# Find the pokemon that took part in a battle but never won:
# numbers present in union_index (fought at least once) but absent from
# numberOfWins.index (won at least once).
# Subtracting 1 shifts each pokemon number down by one; the result is used as
# row positions with pokemon.iloc in the following cell.
find_losing_pokemon = np.setdiff1d(union_index, numberOfWins.index)-1
find_losing_pokemon

In [None]:
# Select, by row position, the pokemon that fought but never won.
always_losing_pokemon = pokemon.iloc[find_losing_pokemon]
always_losing_pokemon

In [None]:
# Add the total number of fights and the win ratio to the per-pokemon win counts.
# Series.add(..., fill_value=0) keeps a pokemon that only ever fought in one slot
# (first or second): a plain "+" would give NaN for it after index alignment.
numberOfWins['Total Fights'] = countbyFirst['Winner'].add(countbySecond['Winner'], fill_value=0)
# numberOfWins comes from groupby('Winner').count(), so its 'First_pokemon'
# column is the per-winner row count, used here as the number of wins.
# The stored value is a fraction (wins / fights), not a value out of 100.
numberOfWins['Win Percentage'] = numberOfWins['First_pokemon']/numberOfWins['Total Fights']

In [None]:
# Display the win counts together with the new 'Total Fights' and 'Win Percentage' columns.
numberOfWins

In [None]:
# Inspect the row for winner number 154 (label-based lookup on the Winner index).
numberOfWins.loc[[154]]

In [None]:
# Attach the win statistics to the pokemon data: match pokemon['Number'] against
# the index of numberOfWins, keeping every pokemon (left join).
results = pokemon.merge(numberOfWins, how='left', left_on='Number', right_index=True)
results

In [None]:
# The same left join expressed with key names on both sides:
# left key 'Number', right key 'Winner'.
results2 = pokemon.merge(numberOfWins, how='left', left_on='Number', right_on='Winner')
results2

In [None]:
# Inspect the merged row at position 153.
results2.iloc[[153]]

In [None]:
# Pokemon with no win percentage: those without any battle, plus Shuckle,
# which has battle records but zero wins.
No_win_percentage = results.loc[results['Win Percentage'].isna()]
No_win_percentage

In [None]:
# Top 10 pokemon by win percentage: keep rows with a finite value, sort descending, take the first 10.
(
    results2
    .loc[np.isfinite(results2['Win Percentage'])]
    .sort_values(by=['Win Percentage'], ascending=False)
    .head(10)
)

In [None]:
# Bottom 10 pokemon by win percentage: keep rows with a finite value, sort ascending, take the first 10.
(
    results2
    .loc[np.isfinite(results2['Win Percentage'])]
    .sort_values(by=['Win Percentage'])
    .head(10)
)

In [None]:
# Basic summary statistics of each variable (mean, max, std, etc.).
# Gives the range and the average of the values we are working with.
results2.describe()

In [None]:
# Countplot: shows the number of observations in each category.
# With a hue, the count of every category is split by the hue value.
# x: Type 1, hue: Legendary

sns.set_color_codes("pastel")
ax = sns.countplot(data=results2, x="Type 1", hue="Legendary")
plt.xticks(rotation=90)
ax.set_xlabel('Type 1')
ax.set_ylabel('Total')
ax.set_title("Total Pokemon by Type 1")

In [None]:
# Count plot of pokemon per Type 2 category, split by the Legendary flag.
sns.set_color_codes("pastel")
ax = sns.countplot(x="Type 2", hue = "Legendary", data=results2)
plt.xticks(rotation=90)
plt.xlabel('Type 2')
# The counts are on the y-axis: label it with ylabel (a second xlabel call
# would overwrite 'Type 2' and leave the y-axis unlabelled).
plt.ylabel('Total')
plt.title("Total Pokemon by Type 2")

In [None]:
# Average win percentage of the pokemon in each Type 1 category, sorted ascending.
results2.groupby('Type 1')[["Win Percentage"]].mean().sort_values(by="Win Percentage")

In [None]:
# Average win percentage of the pokemon in each Type 2 category, sorted ascending.
results2.groupby('Type 2')[["Win Percentage"]].mean().sort_values(by="Win Percentage")

In [None]:
# distplot: draws a histogram of the values with a kernel density estimate (kde = True) on top.
# Plot the distribution of the win percentages (missing values dropped) using 10 bins.
# NOTE(review): distplot is deprecated in newer seaborn releases — confirm against the installed version.
sns.distplot(results2["Win Percentage"].dropna(), kde = True, bins=10)

In [None]:
# Same win-percentage distribution with a finer histogram (20 bins).
sns.distplot(results2["Win Percentage"].dropna(), kde = True, bins=20)

In [None]:
# pairplot: scatter plots for each pair of different variables, and a
# single-variable distribution plot on the diagonal.
# Draw it for the selected feature columns, after dropping rows with missing
# values, to get a visual impression of how the columns relate to each other.
col = ['Type 1', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'Win Percentage']
sns.pairplot(results2[col].dropna())

In [None]:
# PairGrid is similar to pairplot, but lets us choose the plot used on the
# diagonal, the lower triangle and the upper triangle of the grid separately.
# Build the grid for the columns in col:
#   lower triangle -> kernel density plots ("Blues_d" colormap)
#   upper triangle -> regression plots
#   diagonal       -> kernel density plots with a thicker line (lw = 3)
# NOTE(review): unlike the pairplot cell, rows with missing values are not dropped here — confirm this is intended.
g = sns.PairGrid(results2.loc[:, col],diag_sharey=False)
g.map_lower(sns.kdeplot,cmap="Blues_d")
g.map_upper(sns.regplot)
g.map_diag(sns.kdeplot, lw =3)


In [None]:
# Exact pairwise correlation values for the columns in col.
# Only numeric columns can be correlated, so the text column ('Type 1') is
# dropped explicitly; recent pandas versions raise on it instead of skipping it.
results2.loc[:, col].select_dtypes(include='number').corr()

In [None]:
# Find the correlation matrix cmap.
# https://datascience.stackexchange.com/questions/10459/calculation-and-visualization-of-correlation-matrix-with-pandas
# This function was taken from the link above

def correlation_matrix(df):
    """Draw the correlation matrix of ``df`` as a colour-coded image.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to correlate. Non-numeric columns are ignored.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Correlate numeric columns only, and take the tick labels from the result
    # so that every label names the row/column it actually sits on.
    corr = df.select_dtypes(include='number').corr()
    labels = list(corr.columns)
    positions = list(range(len(labels)))

    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    # pyplot.get_cmap is used instead of matplotlib.cm.get_cmap, which recent
    # matplotlib releases no longer provide.
    cmap = plt.get_cmap('jet', 50)
    cax = ax1.imshow(corr, interpolation = "nearest", cmap=cmap)
    ax1.grid(True)
    plt.title('Pokemon Feature Correlation')
    # Pin one tick per column before labelling: without explicit positions the
    # labels are attached to the automatic ticks and end up shifted.
    ax1.set_xticks(positions)
    ax1.set_yticks(positions)
    ax1.set_xticklabels(labels, fontsize = 7, rotation = 45)
    ax1.set_yticklabels(labels, fontsize = 7)
    # Add colorbar, make sure to specify tick locations to match desired ticklabels
    fig.colorbar(cax, ticks=[0.00, 0.05, 0.10, 0.15, .20, .25, .30, .35, .40, .45, .50, .55, .60, .65, .70, .75, .80, .85, .90, .95, 1])
    plt.show()

# Draw the correlation image for the selected feature columns (taken from results).
correlation_matrix(results.loc[:, col])

In [None]:
# Plot the feature pair with the greatest correlation (Speed, Win Percentage),
# with a logistic fit (logistic = True) over all pokemon.
sns.regplot(x = 'Speed', y = 'Win Percentage', data = results2, logistic = True).set_title("Speed vs Win Percentage")
# Same pair, with a separate logistic fit for each hue value (hue = 'Type 1').
sns.lmplot(x = 'Speed', y = 'Win Percentage', data = results2, hue = 'Type 1', logistic = True)

In [None]:
# Plot the feature pair with the second greatest correlation (Attack, Win Percentage).
sns.regplot(x = "Attack", y = "Win Percentage", data = results).set_title("Attack vs Win Percentage")
# Same pair coloured by hue = 'Type 1', drawn as a scatter plot only (fit_reg = False).
sns.lmplot(x = 'Attack', y = 'Win Percentage', data = results, hue = 'Type 1', fit_reg = False)