In [2]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
filename = "regression.csv"

In [4]:
df = pd.read_csv(filename, sep=";", quoting=2, encoding="utf-8")
df.head()

Unnamed: 0,scientificName_y,redlistCategory,region_count,enumRedListCategory
0,Piper amalago,Least Concern,35.0,0.0
1,Piper aduncum,Least Concern,32.0,0.0
2,Piper hispidum,Least Concern,27.0,0.0
3,Piper tuberculatum,Least Concern,26.0,0.0
4,Piper dilatatum,Least Concern,25.0,0.0


In [5]:
X = df[["enumRedListCategory"]]
y = df[["region_count"]]

In [6]:
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
skf.get_n_splits(X, y)

5

In [8]:
mse_scores = []
r2_scores = []
rocs = []

for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f" Fold {fold}:")
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print(f"  X_train shape: {X_train.shape}")
    print(f"  X_test  shape: {X_test.shape}")
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    reg = LinearRegression().fit(X_train, y_train)

    scaler = StandardScaler()
    X_test = scaler.fit_transform(X_test)
    y_pred = reg.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    mse_scores.append(mse)
    r2_scores.append(r2)

    print(f"Fold {fold+1}: MSE = {mse}, R*R = {r2}")

 Fold 0:
  X_train shape: (80, 1)
  X_test  shape: (21, 1)
Fold 1: MSE = 58.35452822503945, R*R = 0.18448640679292705
 Fold 1:
  X_train shape: (81, 1)
  X_test  shape: (20, 1)
Fold 2: MSE = 50.546167319820505, R*R = 0.20786448331263907
 Fold 2:
  X_train shape: (81, 1)
  X_test  shape: (20, 1)
Fold 3: MSE = 35.687719886780734, R*R = 0.28424147840391634
 Fold 3:
  X_train shape: (81, 1)
  X_test  shape: (20, 1)
Fold 4: MSE = 30.49864244395085, R*R = 0.15889016977521098
 Fold 4:
  X_train shape: (81, 1)
  X_test  shape: (20, 1)
Fold 5: MSE = 26.053177215201124, R*R = 0.19089511754033783




In [9]:
print(f"Média MSE: {np.mean(mse_scores)} (+/- {np.std(mse_scores)})")
print(f"Média R²: {np.mean(r2_scores)} (+/- {np.std(r2_scores)})")

Média MSE: 40.228047018158534 (+/- 12.257534379003868)
Média R²: 0.20527553116500624 (+/- 0.0425039039053998)
