In [4]:
import pandas as pd

# Go up one level to access the data folder
df = pd.read_csv("../data/pokemon_data.csv")

df.head()

Unnamed: 0,id,name,base_experience,height,weight,types,abilities,moves,stats
0,1,bulbasaur,64,7,69,"grass, poison","overgrow, chlorophyll","razor-wind, swords-dance, cut, bind, vine-whip","hp=45, attack=49, defense=49, special-attack=6..."
1,2,ivysaur,142,10,130,"grass, poison","overgrow, chlorophyll","swords-dance, cut, bind, vine-whip, headbutt","hp=60, attack=62, defense=63, special-attack=8..."
2,3,venusaur,236,20,1000,"grass, poison","overgrow, chlorophyll","swords-dance, cut, bind, vine-whip, headbutt","hp=80, attack=82, defense=83, special-attack=1..."
3,4,charmander,62,6,85,fire,"blaze, solar-power","mega-punch, fire-punch, thunder-punch, scratch...","hp=39, attack=52, defense=43, special-attack=6..."
4,5,charmeleon,142,11,190,fire,"blaze, solar-power","mega-punch, fire-punch, thunder-punch, scratch...","hp=58, attack=64, defense=58, special-attack=8..."


In [5]:
# Basic overview
print("Dataset Shape:", df.shape)
print("\nData Types:\n", df.dtypes)
print("\nMissing Values:\n", df.isnull().sum())
print("\nUnique Values:\n", df.nunique())
print("\nStatistical Summary (Numerical):\n", df.describe())

Dataset Shape: (1302, 9)

Data Types:
 id                  int64
name               object
base_experience     int64
height              int64
weight              int64
types              object
abilities          object
moves              object
stats              object
dtype: object

Missing Values:
 id                  0
name                0
base_experience     0
height              0
weight              0
types               0
abilities           0
moves              34
stats               0
dtype: int64

Unique Values:
 id                 1302
name               1302
base_experience     201
height               85
weight              533
types               221
abilities           707
moves               812
stats              1142
dtype: int64

Statistical Summary (Numerical):
                  id  base_experience       height        weight
count   1302.000000      1302.000000  1302.000000   1302.000000
mean    2560.927803       161.943932    20.467742    980.863287
std     394

In [14]:
from collections import Counter

df['primary_type'] = df['types'].apply(lambda x: x.split(',')[0].strip())
type_combinations = df['primary_type'].value_counts(dropna=False)
print("Counts by unique 'types' value:\n", type_combinations)

Counts by unique 'types' value:
 primary_type
water       157
normal      138
grass       118
electric     96
bug          93
psychic      83
rock         81
fire         80
dark         59
fighting     57
dragon       52
ghost        51
poison       51
steel        49
ground       48
ice          43
fairy        33
flying       13
Name: count, dtype: int64


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# 2. Encode target labels
le = LabelEncoder()
df['primary_type_encoded'] = le.fit_transform(df['primary_type'])
df

Unnamed: 0,id,name,base_experience,height,weight,types,abilities,moves,stats,primary_type,primary_type_encoded
0,1,bulbasaur,64,7,69,"grass, poison","overgrow, chlorophyll","razor-wind, swords-dance, cut, bind, vine-whip","hp=45, attack=49, defense=49, special-attack=6...",grass,9
1,2,ivysaur,142,10,130,"grass, poison","overgrow, chlorophyll","swords-dance, cut, bind, vine-whip, headbutt","hp=60, attack=62, defense=63, special-attack=8...",grass,9
2,3,venusaur,236,20,1000,"grass, poison","overgrow, chlorophyll","swords-dance, cut, bind, vine-whip, headbutt","hp=80, attack=82, defense=83, special-attack=1...",grass,9
3,4,charmander,62,6,85,fire,"blaze, solar-power","mega-punch, fire-punch, thunder-punch, scratch...","hp=39, attack=52, defense=43, special-attack=6...",fire,6
4,5,charmeleon,142,11,190,fire,"blaze, solar-power","mega-punch, fire-punch, thunder-punch, scratch...","hp=58, attack=64, defense=58, special-attack=8...",fire,6
...,...,...,...,...,...,...,...,...,...,...,...
1297,10273,ogerpon-wellspring-mask,275,12,398,"grass, water",water-absorb,"swords-dance, slam, vine-whip, double-kick, ta...","hp=80, attack=120, defense=84, special-attack=...",grass,9
1298,10274,ogerpon-hearthflame-mask,275,12,398,"grass, fire",mold-breaker,"swords-dance, slam, vine-whip, double-kick, ta...","hp=80, attack=120, defense=84, special-attack=...",grass,9
1299,10275,ogerpon-cornerstone-mask,275,12,398,"grass, rock",sturdy,"swords-dance, slam, vine-whip, double-kick, ta...","hp=80, attack=120, defense=84, special-attack=...",grass,9
1300,10276,terapagos-terastal,90,3,160,normal,tera-shell,"headbutt, body-slam, take-down, double-edge, roar","hp=95, attack=95, defense=110, special-attack=...",normal,12


In [20]:
# 3. Define features and target
features = ['base_experience', 'height', 'weight']
X = df[features].dropna()
y = df.loc[X.index, 'primary_type_encoded']

# 4. Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# 5. Fit logistic regression (include all parameters, even if default)
model = LogisticRegression(
    penalty='l2',             # Regularization type (default)
    dual=False,               # Use dual formulation (only for l2 + liblinear on small data)
    tol=1e-4,                 # Tolerance for stopping criteria
    C=1.0,                    # Inverse of regularization strength
    fit_intercept=True,       # Whether to add intercept
    intercept_scaling=1,      # Only used if solver='liblinear' and fit_intercept=True
    class_weight=None,        # No class weighting
    random_state=None,        # No fixed random state
    solver='lbfgs',           # Solver for optimization
    max_iter=100,             # Max number of iterations
    multi_class='auto',       # Auto = ovr if binary, multinomial if multiclass + solver supports
    verbose=0,                # No logging
    warm_start=False,         # Do not reuse solutions from previous fits
    n_jobs=None,              # Use 1 CPU core
    l1_ratio=None             # Not used unless penalty='elasticnet'
)

model.fit(X_train, y_train)

# 6. Evaluate model
y_pred = model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

         bug       0.00      0.00      0.00        28
        dark       0.00      0.00      0.00        18
      dragon       0.00      0.00      0.00        16
    electric       0.12      0.48      0.20        29
       fairy       0.00      0.00      0.00        10
    fighting       0.00      0.00      0.00        17
        fire       0.00      0.00      0.00        24
      flying       0.00      0.00      0.00         4
       ghost       0.00      0.00      0.00        15
       grass       0.00      0.00      0.00        35
      ground       0.00      0.00      0.00        14
         ice       0.00      0.00      0.00        13
      normal       0.00      0.00      0.00        42
      poison       0.00      0.00      0.00        15
     psychic       0.00      0.00      0.00        25
        rock       0.25      0.17      0.20        24
       steel       0.17      0.20      0.18        15
    

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
