# Load Packages

In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from matplotlib.colors import ListedColormap
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
%matplotlib inline

# Load Dataset

In [7]:
pokeman = pd.read_csv('../input/pokemon/Pokemon.csv')
pokeman.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


# Exploratory Data Analysis

In [8]:
# Number of rows and columns in the dataset
pokeman.shape

(800, 13)

In [9]:
# Check is there any null values in the dataset
pokeman.isnull().sum()

#               0
Name            0
Type 1          0
Type 2        386
Total           0
HP              0
Attack          0
Defense         0
Sp. Atk         0
Sp. Def         0
Speed           0
Generation      0
Legendary       0
dtype: int64

In [10]:
# Drop # column, duplicate with index
pokeman = pokeman.drop(['#'], axis = 1)
pokeman.head()

Unnamed: 0,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [56]:
# Check whether Total = HP + Attack + Defense + Sp. Atk + Sp. Def + Speed
tmp = []
for i in range(len(pokeman)):
    if pokeman['Total'][i] == pokeman['HP'][i] + pokeman['Attack'][i] + pokeman['Defense'][i] + pokeman['Sp. Atk'][i] + pokeman['Sp. Def'][i] + pokeman['Speed'][i]:
        tmp.append('True')
    else:
        tmp.append('False')
tmp = pd.DataFrame(tmp)
tmp.columns = ['Result']
tmp['Result'].value_counts()

True    800
Name: Result, dtype: int64

In [63]:
# Correlation Heatmap
plt.subplots(figsize = (25, 25))
ax = plt.axes()
ax.set_title("Pokeman Correlation Heatmap")
corr = pokeman.corr()
sns.heatmap(corr,
            xticklabels = corr.columns.values,
            yticklabels = corr.columns.values)

<matplotlib.axes._subplots.AxesSubplot at 0x7f177e818198>

In [65]:
# Pairplot
sns.pairplot(pokeman, hue = 'Legendary')

<seaborn.axisgrid.PairGrid at 0x7f177c9468d0>

In [14]:
# Number of Pokemans in each Type 1
pokeman['Type 1'].value_counts()

Water       112
Normal       98
Grass        70
Bug          69
Psychic      57
Fire         52
Rock         44
Electric     44
Ground       32
Ghost        32
Dragon       32
Dark         31
Poison       28
Fighting     27
Steel        27
Ice          24
Fairy        17
Flying        4
Name: Type 1, dtype: int64

In [59]:
# Visualize Number of Pokemans in each Type 1
tmp = pokeman.groupby('Type 1')['Name'].count()
tmp = pd.DataFrame(tmp)
tmp = tmp.reset_index()
tmp.columns = ['Type 1', 'Count']
tmp = tmp.sort_values('Count', ascending = False)
sns.set(style = 'whitegrid')
sns.barplot(x = "Type 1", y = "Count", data = tmp)
plt.title("Number of Pokemans in each Type 1")
plt.xticks(rotation = 45)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17]), <a list of 18 Text xticklabel objects>)

In [60]:
# Fill NA in Type 2 column with Type 1 and calculate number of Pokemans in each Type 2
pokeman['Type 2'].fillna(pokeman['Type 1'], inplace=True)
pokeman['Type 2'].value_counts()

Flying      99
Water       73
Psychic     71
Normal      65
Grass       58
Poison      49
Ground      48
Fighting    46
Fire        40
Fairy       38
Electric    33
Dark        30
Dragon      29
Ice         27
Steel       27
Ghost       24
Rock        23
Bug         20
Name: Type 2, dtype: int64

In [61]:
# Visualize Number of Pokemans in each Type 2
tmp = pokeman.groupby('Type 2')['Name'].count()
tmp = pd.DataFrame(tmp)
tmp = tmp.reset_index()
tmp.columns = ['Type 2', 'Count']
tmp = tmp.sort_values('Count', ascending = False)
sns.set(style = 'whitegrid')
sns.barplot(x = "Type 2", y = "Count", data = tmp)
plt.title("Number of Pokemans in each Type 2")
plt.xticks(rotation = 45)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17]), <a list of 18 Text xticklabel objects>)

In [37]:
# Number of Pokemans in each generation
pokeman['Generation'].value_counts()

1    166
5    165
3    160
4    121
2    106
6     82
Name: Generation, dtype: int64

In [50]:
# Visualize the tendency of number of Pokemans in each generation
tmp = pokeman.groupby('Generation')['Name'].count()
tmp = tmp.sort_index()
tmp = pd.DataFrame(tmp)
tmp = tmp.reset_index()
tmp.columns = ['Generation', 'Count']
sns.set(style = 'whitegrid')
plt.plot(tmp['Generation'], tmp['Count'], '-ok')

[<matplotlib.lines.Line2D at 0x7f177e950518>]

In [48]:
# Visualize number of Pokemans in each Legendary
sns.set(style = 'whitegrid')
sns.countplot(x = 'Legendary', data = pokeman)

<matplotlib.axes._subplots.AxesSubplot at 0x7f177f400240>

In [90]:
# Calculate avgerage Total for Pokemans in each Type
tmp1 = pokeman.groupby(['Type 1', 'Legendary'])['Total'].mean()
tmp1 = pd.DataFrame(tmp1)
tmp1 = tmp1.reset_index()
tmp1.columns = ['Type 1', 'Legendary', 'Total1']
tmp2 = pokeman.groupby(['Type 2', 'Legendary'])['Total'].mean()
tmp2 = pd.DataFrame(tmp2)
tmp2 = tmp2.reset_index()
tmp2.columns = ['Type 2', 'Legendary', 'Total2']
tmp = pd.merge(tmp1, tmp2,  how = 'inner', left_on = ['Type 1','Legendary'], right_on = ['Type 2','Legendary'])
tmp = tmp.drop(['Type 2'], axis = 1)
tmp['Total'] = tmp[['Total1', 'Total2']].mean(axis=1)
tmp = tmp.drop(['Total1', 'Total2'], axis = 1)
tmp.columns = ['Type', 'Legendary', 'Total']
sns.barplot(x = 'Type', y = 'Total', hue = 'Legendary', data = tmp)
plt.xticks(rotation = 45)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17]), <a list of 18 Text xticklabel objects>)

# Feature Engineering

In [98]:
# Label Encoder
final_pokeman = pokeman[['Type 1', 'Type 2', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'Generation', 'Legendary']]
labelencoder=LabelEncoder()
final_pokeman['Type 1'] = labelencoder.fit_transform(final_pokeman['Type 1'])
final_pokeman['Type 2'] = labelencoder.fit_transform(final_pokeman['Type 2'])
final_pokeman.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,9,13,45,49,49,65,65,45,1,False
1,9,13,60,62,63,80,80,60,1,False
2,9,13,80,82,83,100,100,80,1,False
3,9,13,80,100,123,122,120,80,1,False
4,6,6,39,52,43,60,50,65,1,False


# Logistic Regression

In [122]:
# Determine independent variables and target variable
X = final_pokeman.iloc[:, 0:9].values
y = final_pokeman.iloc[:, 9].values

In [123]:
# Split the dataset into train data and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [124]:
# Normalize the data
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)



In [125]:
# Reduce the dimensionality to 2
pca = PCA(n_components = 2)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [126]:
# Fitting Logistic Regression to the Training set
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [127]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [128]:
# Model Performance
def print_score(classifier,X_train,y_train,X_test,y_test):
    print("Test results:\n")
    print('Accuracy Score: {0:.4f}\n'.format(accuracy_score(y_test,y_pred)))
    print('Classification Report:\n{}\n'.format(classification_report(y_test,y_pred)))
    print('Confusion Matrix:\n{}\n'.format(confusion_matrix(y_test,y_pred)))

In [129]:
print_score(classifier, X_train, y_train, X_test, y_test)

Test results:

Accuracy Score: 0.9250

Classification Report:
             precision    recall  f1-score   support

      False       0.93      0.99      0.96       181
       True       0.83      0.26      0.40        19

avg / total       0.92      0.93      0.91       200


Confusion Matrix:
[[180   1]
 [ 14   5]]



In [130]:
# ROC and AUC
y_pred_proba = classifier.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
sns.set(style = 'whitegrid')
plt.plot(fpr,tpr,label="Logistic Regression, AUC = "+str(auc))
plt.plot([0, 1], [0, 1],'r--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc=4)
plt.show()

In [113]:
# Performance of Logistic Regression Model
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:,0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:,1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i,j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j,0], X_set[y_set == j,1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Classifier (Test set)')
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend()
plt.show()

'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.


# Decision Tree

In [131]:
X = final_pokeman.drop(['Legendary'], axis = 1)
y = final_pokeman['Legendary']

In [132]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [133]:
classifier = DecisionTreeClassifier()
classifier = classifier.fit(X_train, y_train)

In [134]:
# Visualize Decision Tree
visual = export_graphviz(classifier, 
                         out_file = None, 
                         feature_names = X.columns,  
                         filled = True, 
                         rounded=True,  
                         special_characters=True)  
decision_tree = graphviz.Source(visual)  
decision_tree

In [135]:
features = X.columns.values
feature_importance = classifier.feature_importances_
sorted_idx = np.argsort(feature_importance)
sns.set(style = 'whitegrid')
plt.figure(figsize=(5,7))
plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), features[sorted_idx])
plt.xlabel('Importance')
plt.title('Feature importances')
plt.draw()
plt.show()