In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np

In [2]:
import random

In [3]:
# Pin both global RNG streams (NumPy's legacy global generator and the
# standard-library `random` module) so code drawing from them is repeatable.
np.random.seed(42)
random.seed(42)

In [4]:
# Arrhythmia and Dexter: logistic regression with and without L2 regularization

def load_and_prepare_data(filepath, target_col='target'):
    """Load a CSV dataset and return mean-imputed, standardized features and labels.

    Parameters
    ----------
    filepath : str or path-like
        Location of a CSV file readable by ``pandas.read_csv``.
    target_col : str, default 'target'
        Name of the column holding the class labels. Every other column is
        treated as a feature.

    Returns
    -------
    X_scaled
        Feature matrix as returned by ``StandardScaler.fit_transform`` after
        missing values were replaced by the column mean.
    y : pandas.Series
        The ``target_col`` column, unchanged.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of the file.

    Notes
    -----
    The imputer and the scaler are fitted on *all* rows of the file. When the
    result is split into train/test sets afterwards, the column means and
    standard deviations used here were therefore computed with the test rows
    included.
    """
    data = pd.read_csv(filepath)

    # Fail early with a message that names both the column and the file.
    if target_col not in data.columns:
        raise KeyError(f"Column {target_col!r} not found in {filepath!r}")

    X = data.drop(columns=[target_col])
    y = data[target_col]

    # Replace missing feature values by the per-column mean.
    imputer = SimpleImputer(strategy='mean')
    X_imputed = imputer.fit_transform(X)

    # Standardize every feature to zero mean and unit variance.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_imputed)

    return X_scaled, y

# Relative CSV paths, resolved against the current working directory.
arrhythmia_path, dexter_path = 'data/arrhythmia.csv', 'data/dexter.csv'

# Each call returns the prepared feature matrix together with its labels.
X_arrhythmia, y_arrhythmia = load_and_prepare_data(filepath=arrhythmia_path)
X_dexter, y_dexter = load_and_prepare_data(filepath=dexter_path)

def train_and_evaluate(X, y, dataset_name):
    """Fit and report logistic regression with and without L2 regularization.

    The data is split 75/25 (stratified on ``y``, ``random_state=42``). Two
    models are fitted on the training part and scored on the held-out part:

    * an unregularized ``LogisticRegression(penalty=None)``;
    * an L2-regularized one whose ``C`` is selected by 5-fold grid search
      over ``[0.01, 0.1, 1, 10, 100]``.

    Mean imputation and standardization are steps of each model's pipeline, so
    their statistics are learned from the training rows only (and, inside the
    grid search, from each fold's training rows only). This keeps test and
    validation rows from influencing the preprocessing. Passing features that
    were already imputed/standardized is fine: imputation then has nothing to
    fill and the data is simply re-standardized on the training rows.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric feature matrix.
    y : array-like of shape (n_samples,)
        Class labels; also used to stratify the split.
    dataset_name : str
        Name shown in the printed section headers.

    Returns
    -------
    None
        The classification reports, confusion matrices and the selected ``C``
        are printed to stdout.
    """
    # Imported locally because the notebook's import cell does not provide it.
    from sklearn.pipeline import Pipeline

    def build_pipeline(classifier):
        """Chain mean imputation, standardization and ``classifier``."""
        return Pipeline([
            ('impute', SimpleImputer(strategy='mean')),
            ('scale', StandardScaler()),
            ('clf', classifier),
        ])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y
    )

    # Without regularization
    model_no_reg = build_pipeline(LogisticRegression(penalty=None, max_iter=1000))
    model_no_reg.fit(X_train, y_train)
    y_pred_no_reg = model_no_reg.predict(X_test)

    print(f"\n=== Results for {dataset_name} - without regularization ===")
    print(classification_report(y_test, y_pred_no_reg))
    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred_no_reg))

    # With L2 regularization; 'clf__C' addresses the C parameter of the
    # pipeline's final 'clf' step.
    param_grid = {'clf__C': [0.01, 0.1, 1, 10, 100]}
    grid = GridSearchCV(
        build_pipeline(LogisticRegression(max_iter=1000, penalty='l2')),
        param_grid,
        cv=5,
    )
    grid.fit(X_train, y_train)
    # GridSearchCV refits the best configuration on the whole training split.
    best_model = grid.best_estimator_
    y_pred_reg = best_model.predict(X_test)

    print(f"\n=== Results for {dataset_name} - with L2 regularization ===")
    print("Best C parameter:", grid.best_params_['clf__C'])
    print(classification_report(y_test, y_pred_reg))
    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred_reg))

# Run the comparison on each prepared dataset, Arrhythmia first, then Dexter.
for features, labels, title in (
    (X_arrhythmia, y_arrhythmia, "Arrhythmia"),
    (X_dexter, y_dexter, "Dexter"),
):
    train_and_evaluate(features, labels, title)



=== Results for Arrhythmia - without regularization ===
              precision    recall  f1-score   support

           0       0.76      0.72      0.74        61
           1       0.69      0.73      0.71        52

    accuracy                           0.73       113
   macro avg       0.72      0.73      0.72       113
weighted avg       0.73      0.73      0.73       113

Confusion matrix:
[[44 17]
 [14 38]]

=== Results for Arrhythmia - with L2 regularization ===
Best C parameter: 0.1
              precision    recall  f1-score   support

           0       0.81      0.85      0.83        61
           1       0.82      0.77      0.79        52

    accuracy                           0.81       113
   macro avg       0.81      0.81      0.81       113
weighted avg       0.81      0.81      0.81       113

Confusion matrix:
[[52  9]
 [12 40]]

=== Results for Dexter - without regularization ===
              precision    recall  f1-score   support

          -1       0.97     

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Build a small class-balanced subset of the ionosphere data file and save it
# as a CSV whose last column, 'target', holds integer class codes.
SAMPLES_PER_CLASS = 35  # rows drawn from every class
RANDOM_STATE = 42       # seeds both the per-class draw and the final shuffle

# The file is read without a header row, so columns are labelled 0..n-1.
ionosphere = pd.read_csv("data/ionosphere.data", header=None)

print(f"Original data shape: {ionosphere.shape}")

# The last column is used as the class label.
target_col = ionosphere.columns[-1]
class_counts = ionosphere[target_col].value_counts()
print(f"Class distribution:\n{class_counts}")

class_values = class_counts.index.tolist()

# Draw the same number of rows from each class and concatenate once, instead
# of growing a frame from an empty DataFrame inside a loop.
balanced_df = pd.concat(
    [
        ionosphere[ionosphere[target_col] == cls].sample(
            n=SAMPLES_PER_CLASS, random_state=RANDOM_STATE
        )
        for cls in class_values
    ]
)

# Shuffle so the classes are interleaved, then discard the original row index.
balanced_df = balanced_df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

# Keep integer names for the feature columns and name the label column.
column_names = list(range(balanced_df.shape[1] - 1)) + ['target']
balanced_df.columns = column_names

# Encode non-numeric labels as integers. Checking "not numeric" rather than
# `dtype == object` also covers pandas versions/configurations that store
# text columns with a dedicated string dtype.
if not pd.api.types.is_numeric_dtype(balanced_df['target']):
    # Codes follow the order in which labels first appear in the shuffled frame.
    label_mapping = {label: i for i, label in enumerate(balanced_df['target'].unique())}
    balanced_df['target'] = balanced_df['target'].map(label_mapping)
    print(f"Label mapping: {label_mapping}")

balanced_df.to_csv("data/ionosphere_balanced.csv", index=False)

print(f"Saved balanced dataset with shape: {balanced_df.shape}")
print(f"New class distribution:\n{balanced_df['target'].value_counts()}")

Original data shape: (351, 35)
Class distribution:
34
g    225
b    126
Name: count, dtype: int64
Label mapping: {'g': 0, 'b': 1}
Saved balanced dataset with shape: (70, 35)
New class distribution:
target
0    35
1    35
Name: count, dtype: int64
