<a href="https://colab.research.google.com/github/taufiqbashori/for_references/blob/main/Loading_Pokemon_Dataset_from_Kaggle_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Prepare libraries and mount Google Drive at /content/drive so later cells
# can read the Kaggle token from, and write the dataset to, Drive.
import zipfile
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')
from google.colab import files
# Optional manual alternative (kept disabled): upload the API json file by hand.
# files.upload() # upload json files for the api

Mounted at /content/drive


In [2]:
!mkdir -p ~/.kaggle
!cp /content/drive/MyDrive/path/to/kaggle.json ~/.kaggle/

In [3]:
#!pip install kaggle
# Download the abcsds/pokemon dataset archive into the Drive folder given by -p.
# As the recorded output shows, the CLI skips the download when a more recent
# local copy already exists (use --force to override).
!kaggle datasets download -d abcsds/pokemon -p /content/drive/MyDrive/path/kaggle_datasets

# Unpack the archive into a "pokemon" sub-directory next to the zip file.
with zipfile.ZipFile("/content/drive/MyDrive/path/kaggle_datasets/pokemon.zip", "r") as zip_ref:
    zip_ref.extractall("/content/drive/MyDrive/path/kaggle_datasets/pokemon")

pokemon.zip: Skipping, found more recently modified local copy (use --force to force download)


In [4]:
# Read the extracted CSV into a DataFrame; the bare name on the last line
# makes the frame the cell's displayed output.
pokemon_df = pd.read_csv("/content/drive/MyDrive/path/kaggle_datasets/pokemon/Pokemon.csv")
pokemon_df

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,719,Diancie,Rock,Fairy,600,50,100,150,100,150,50,6,True
796,719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True
797,720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6,True
798,720,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,6,True


KNN for Pokemon Dataset

In [22]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score

def load_pokemon_data(csv_path="/content/drive/MyDrive/path/kaggle_datasets/pokemon/Pokemon.csv"):
    """
    Load the Pokemon CSV and integer-encode its two type columns.

    Parameters
    ----------
    csv_path : str or path-like, optional
        Location of the CSV file. Defaults to the Drive path this notebook
        extracts the Kaggle archive to, so existing zero-argument calls keep
        working.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with 'Type 1' and 'Type 2' replaced by their pandas
        category codes. Missing values (e.g. a Pokemon without a second type)
        receive the code -1, which is how pandas encodes NaN in ``cat.codes``.
    """
    pokemon_data = pd.read_csv(csv_path)
    # Both type columns get the same treatment: string label -> integer code.
    for type_column in ('Type 1', 'Type 2'):
        pokemon_data[type_column] = pokemon_data[type_column].astype('category').cat.codes
    return pokemon_data

def knn_classifier_pipeline(X_train, y_train, X_test, y_test, k):
    """
    Fit a standardised k-nearest-neighbours classifier and report test metrics.

    Features are scaled with StandardScaler before KNeighborsClassifier (with
    ``k`` neighbours) is fitted on the training split. Accuracy and the
    classification report for the test split are printed.

    Returns the accuracy score on the test split.
    """
    steps = [
        ('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier(n_neighbors=k)),
    ]
    # Pipeline.fit returns the fitted pipeline itself, so it can be chained.
    model = Pipeline(steps).fit(X_train, y_train)
    predictions = model.predict(X_test)
    score = accuracy_score(y_test, predictions)

    print(f"KNN Classifier (k={k}):")
    print(f"Accuracy Score: {score}")
    print(classification_report(y_test, predictions))

    return score


def logistic_regression_pipeline(X_train, y_train, X_test, y_test):
    """
    Fit a standardised logistic-regression classifier and report test metrics.

    Features are scaled with StandardScaler before LogisticRegression
    (max_iter=1000) is fitted on the training split. Accuracy and the
    classification report for the test split are printed.

    Returns a tuple ``(fitted LogisticRegression step, accuracy)``.
    """
    scaler = StandardScaler()
    classifier = LogisticRegression(max_iter=1000)
    model = Pipeline([('scaler', scaler), ('logistic_reg', classifier)])

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    score = accuracy_score(y_test, predictions)

    print("Logistic Regression:")
    print(f"Accuracy Score: {score}")
    print(classification_report(y_test, predictions))

    # Hand back the estimator step (not the whole pipeline) so callers can
    # inspect its coefficients.
    return model.named_steps['logistic_reg'], score


def knn_regressor_pipeline(X_train, y_train, X_test, y_test, k):
    """
    Fit a standardised k-nearest-neighbours regressor and report its test MSE.

    Features are scaled with StandardScaler before KNeighborsRegressor (with
    ``k`` neighbours) is fitted on the training split. The mean squared error
    on the test split is printed.

    Returns the mean squared error on the test split.
    """
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('knn', KNeighborsRegressor(n_neighbors=k)),
    ])
    predictions = model.fit(X_train, y_train).predict(X_test)
    error = mean_squared_error(y_test, predictions)

    print(f"KNN Regressor (k={k}):")
    print(f"Mean Squared Error: {error}")

    return error


def linear_regression_pipeline(X_train, y_train, X_test, y_test):
    """
    Fit a standardised ordinary-least-squares model and report its test MSE.

    Features are scaled with StandardScaler before LinearRegression is fitted
    on the training split. The mean squared error on the test split is
    printed.

    Returns a tuple ``(fitted LinearRegression step, mean squared error)``.
    """
    regressor = LinearRegression()
    model = Pipeline([('scaler', StandardScaler()), ('linear_reg', regressor)])

    model.fit(X_train, y_train)
    error = mean_squared_error(y_test, model.predict(X_test))

    print("Linear Regression:")
    print(f"Mean Squared Error: {error}")

    # Return the estimator step so callers can read its coefficients.
    return model.named_steps['linear_reg'], error


def feature_importance(model, feature_names):
    """
    Print one "name: coefficient" line per feature for a fitted linear model.

    Parameters
    ----------
    model : LinearRegression or LogisticRegression
        Fitted estimator whose ``coef_`` is reported. For LogisticRegression
        only the first row of ``coef_`` is used (the single row of a binary
        problem).
    feature_names : iterable of str
        Names paired positionally with the coefficients.

    Raises
    ------
    TypeError
        If ``model`` is neither a LinearRegression nor a LogisticRegression.
        Previously this case failed later with an UnboundLocalError.
    """
    if isinstance(model, LinearRegression):
        coefficients = model.coef_
    elif isinstance(model, LogisticRegression):
        coefficients = model.coef_[0]
    else:
        raise TypeError(
            "feature_importance supports LinearRegression and LogisticRegression "
            f"models, got {type(model).__name__}"
        )

    # Distinct loop name: the original reused `importance` for both the array
    # and the per-feature value, shadowing the array while iterating it.
    for name, coefficient in zip(feature_names, coefficients):
        print(f"{name}: {coefficient}")


def main():
    """
    Compare KNN against a linear baseline on two Pokemon prediction tasks.

    Classification target: 'Legendary' (KNN classifier vs. logistic regression).
    Regression target: 'Total' (KNN regressor vs. linear regression).
    After fitting all four models, prints the coefficients of each linear
    model unless the corresponding KNN model scored strictly better.
    """
    frame = load_pokemon_data()
    features = frame.drop(columns=['Legendary', 'Name', 'Total', '#'])
    legendary = frame['Legendary']
    total = frame['Total']

    # A single call splits every array with the same shuffled indices, which
    # is what two separate calls with the same random_state produce as well.
    # NOTE(review): the recorded Linear Regression MSE is ~1e-26, which
    # suggests 'Total' is an exact linear combination of the stat columns
    # kept in the features -- confirm whether that leakage is intended.
    (X_train, X_test,
     legendary_train, legendary_test,
     total_train, total_test) = train_test_split(features, legendary, total, random_state=0)

    # Classification task
    knn_accuracy = knn_classifier_pipeline(X_train, legendary_train, X_test, legendary_test, k=5)
    logistic_model, logistic_accuracy = logistic_regression_pipeline(
        X_train, legendary_train, X_test, legendary_test)

    # Regression task
    knn_mse = knn_regressor_pipeline(X_train, total_train, X_test, total_test, k=5)
    linear_model, linear_mse = linear_regression_pipeline(
        X_train, total_train, X_test, total_test)

    print("\nFeature Importance Analysis:")

    # Ties go to the linear model, which can expose coefficients.
    knn_wins_classification = knn_accuracy > logistic_accuracy
    if not knn_wins_classification:
        print("Logistic Regression:")
        feature_importance(logistic_model, features.columns)
    else:
        print("KNN Classifier doesn't provide feature importance.")

    knn_wins_regression = knn_mse < linear_mse
    if not knn_wins_regression:
        print("Linear Regression:")
        feature_importance(linear_model, features.columns)
    else:
        print("KNN Regressor doesn't provide feature importance.")


# Run the full comparison when this cell (or the exported script) is executed
# directly rather than imported.
if __name__ == "__main__":
    main()

KNN Classifier (k=5):
Accuracy Score: 0.93
              precision    recall  f1-score   support

       False       0.94      0.98      0.96       182
        True       0.70      0.39      0.50        18

    accuracy                           0.93       200
   macro avg       0.82      0.69      0.73       200
weighted avg       0.92      0.93      0.92       200

Logistic Regression:
Accuracy Score: 0.93
              precision    recall  f1-score   support

       False       0.95      0.97      0.96       182
        True       0.64      0.50      0.56        18

    accuracy                           0.93       200
   macro avg       0.80      0.74      0.76       200
weighted avg       0.92      0.93      0.93       200

KNN Regressor (k=5):
Mean Squared Error: 739.4268000000002
Linear Regression:
Mean Squared Error: 1.0311484882069725e-26

Feature Importance Analysis:
Logistic Regression:
Type 1: 0.18082489054619724
Type 2: -0.3109876668663668
HP: 0.5113139676383843
Attack: 0.