## **Logistic Regression Model**

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Before training the model, we'll split the data into training and test sets


In [2]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
import os

# Directory names that indicate the kernel was started somewhere inside the
# repository tree rather than at its root.
path_markers = ('exercises', 'notebooks', 'students', 'research', 'projects')

# Climb one level at a time until the working directory no longer contains any
# marker (substring match on the full path), then enter this project's folder
# so the relative data paths used later resolve.
while True:
    current_dir = os.getcwd()
    if not any(marker in current_dir for marker in path_markers):
        break
    os.chdir("..")
os.chdir("projects/proj_1_team_1")

# **Load the preprocessed DataFrame from the .csv file**

In [4]:
# Read the preprocessed mushroom table; the first CSV column is used as the row index.
preprocessed_csv = "mushrooms_preprocessed.csv"
df = pd.read_csv(preprocessed_csv, index_col=0)

In [None]:
# Quick look at the first five rows to confirm the frame loaded as expected.
df.head(n=5)

In [6]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [7]:
# Separate the label column from the feature columns.
target_col = 'poisonous'
y = df[target_col]
X = df.drop(columns=[target_col])

# Hold out a test split, stratified on the label, with a fixed seed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)


In [None]:
# Single-step pipeline around a logistic-regression classifier; max_iter is
# raised to 1000 iterations.
logreg_estimator = LogisticRegression(max_iter=1000)
logreg_pipeline = make_pipeline(logreg_estimator)
logreg_pipeline


In [9]:
# Fit on the training split, then predict labels for the held-out test split.
# fit() returns the fitted pipeline, so the two calls can be chained.
y_pred = logreg_pipeline.fit(X_train, y_train).predict(X_test)


Let's evaluate the model

In [None]:
# Evaluation: text report of the classification metrics on the held-out test split.
# The heading and the report are joined into ONE string instead of being passed
# as two print() arguments: print() separates its arguments with a space, which
# would land right after the heading's newline and push the report's first
# (header) row one column to the right of the rows beneath it.
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n" + report)

In [None]:
# 10-fold cross-validation on the training split only; the test split stays untouched.
n_folds = 10
scores = cross_val_score(logreg_pipeline, X_train, y_train, cv=n_folds)
mean_score = scores.mean()

# Show the per-fold scores first, then their average.
print("Cross-validation scores:", scores)
print("Mean cross-validation accuracy:", mean_score)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix heatmap: rows are labelled as the actual class, columns as
# the predicted class.
class_names = ['Edible', 'Poisonous']
cm = confusion_matrix(y_test, y_pred)

# sns.heatmap returns the Axes it drew on, so labels are set on it directly.
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

# **Learning curve**

In [13]:
# Score the pipeline with 10-fold CV at ten training-set sizes, given as
# fractions from 10% to 100%.
size_fractions = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
train_sizes, train_scores, valid_scores = learning_curve(
    logreg_pipeline,
    X_train,
    y_train,
    cv=10,
    train_sizes=size_fractions,
    scoring='accuracy',
)

In [14]:
# Reduce along axis 1 so that each training size gets a single mean and a
# single standard deviation (axis 1 is presumably the CV-fold axis; confirm
# against the learning_curve docs).
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
valid_mean, valid_std = valid_scores.mean(axis=1), valid_scores.std(axis=1)

In [None]:
# Express each training size as a percentage of the number of rows in X_train.
# NOTE(review): with 10-fold CV the largest size is presumably below 100% of
# X_train, since one fold is held out for validation; confirm.
train_sizes_percent = (train_sizes / len(X_train)) * 100

# (legend label, colour, mean curve, std band) for each series to draw.
curves = (
    ('Training Accuracy', 'blue', train_mean, train_std),
    ('Validation Accuracy', 'green', valid_mean, valid_std),
)

fig, ax = plt.subplots(figsize=(10, 6))

# Mean accuracy lines first ...
for curve_label, curve_color, curve_mean, curve_std in curves:
    ax.plot(train_sizes_percent, curve_mean, label=curve_label, color=curve_color)

# ... then a translucent band of one standard deviation either side of each mean.
for curve_label, curve_color, curve_mean, curve_std in curves:
    ax.fill_between(
        train_sizes_percent,
        curve_mean - curve_std,
        curve_mean + curve_std,
        color=curve_color,
        alpha=0.2,
    )

ax.set_title('Learning Curve for Logistic Regression')
ax.set_xlabel('Training Size (%)')
ax.set_ylabel('Accuracy')
ax.legend(loc='best')
ax.grid(True)
plt.show()