In [12]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np

In [13]:
import random

In [14]:
# One seed value shared by the stdlib and NumPy global generators.
SEED = 42
for seed_fn in (random.seed, np.random.seed):
    seed_fn(SEED)

In [23]:
import pandas as pd
import numpy as np

# Read the raw comma-separated file; every field is kept as a string.
with open("data/arrhythmia.data", "r") as f:
    lines = f.readlines()

# One list of fields per non-blank line.
data = [raw.strip().split(',') for raw in lines if raw.strip()]

df = pd.DataFrame(data)

# Name the columns '0', '1', ... and call the final one 'target'.
column_names = list(range(len(df.columns)))
last_position = len(column_names) - 1
df.columns = ['target' if i == last_position else str(i) for i in column_names]

df.to_csv("data/arrhythmia.csv", index=False)

print(f"Created arrhythmia.csv with shape: {df.shape}")
print(f"Column names: {list(df.columns)}")
print("First few rows:")
print(df.head(3))
target_distribution = df['target'].value_counts().sort_index()
print(f"Target value distribution:\n{target_distribution}")

nan_total = df.isnull().sum().sum()
print(f"Sum of NaN values in table: {nan_total}")

# Check for '?' values
def check_for_question_marks(df):
    """Count the cells of ``df`` that are exactly the string '?'.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to scan.

    Returns
    -------
    Total number of matching cells, summed over all columns.
    """
    placeholder_mask = df.eq('?')
    hits_per_column = placeholder_mask.sum()
    return hits_per_column.sum()
# Count the '?' placeholders while they are still present in the table.
question_mark_count = check_for_question_marks(df)
print(f"Sum of '?' values in table: {question_mark_count}")

# Replace '?' with NaN. `df` is rebound to the returned frame instead of being
# mutated with inplace=True, so the object displayed above is left untouched.
df = df.replace('?', np.nan)
print(f"Sum of NaN values in table after replacing '?': {df.isnull().sum().sum()}")

Created arrhythmia.csv with shape: (452, 280)
Column names: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149',

In [24]:
# Convert target to binary: 0 for normal (class 1), 1 for arrhythmia (classes 2-16).
# Only do this while the column still holds the raw string labels. Once it is
# numeric (this cell already ran), comparing against '1' matches nothing and a
# re-run would silently relabel every row as arrhythmia.
if not pd.api.types.is_numeric_dtype(df['target']):
    df['target'] = df['target'].astype(str).str.strip().ne('1').astype(int)

# Fail loudly if the column is numeric but not binary (e.g. raw class ids that
# were parsed as integers), instead of saving a mislabeled file.
assert df['target'].isin([0, 1]).all(), "target must be binary (0/1) after conversion"

# Replace '?' with NaN to properly track missing values
df = df.replace('?', np.nan)

# Save to CSV file with binary target
df.to_csv("data/arrhythmia.csv", index=False)

print(f"Created arrhythmia.csv with shape: {df.shape}")
print(f"Column names: {list(df.columns)}")
print(f"First few rows:")
print(df.head(3))
print(f"Binary target distribution:\n{df['target'].value_counts().sort_index()}")
print(f"Sum of NaN values in table: {df.isnull().sum().sum()}")

Created arrhythmia.csv with shape: (452, 280)
Column names: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149',

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Tunables for the balanced subsample.
SAMPLES_PER_CLASS = 35  # rows drawn from each class; must not exceed the smallest class
RANDOM_STATE = 42       # used for both the per-class draw and the final shuffle

ionosphere = pd.read_csv("data/ionosphere.data", header=None)

print(f"Original data shape: {ionosphere.shape}")

# The last column holds the class label.
target_col = ionosphere.columns[-1]
class_counts = ionosphere[target_col].value_counts()
print(f"Class distribution:\n{class_counts}")

# Classes in value_counts order (most frequent first).
class_values = class_counts.index.tolist()

# Draw the same number of rows from every class and concatenate once.
# Growing a frame by concatenating onto an empty DataFrame in a loop is
# quadratic and relies on pandas' deprecated handling of empty entries.
per_class_samples = [
    ionosphere[ionosphere[target_col] == cls].sample(
        n=SAMPLES_PER_CLASS, random_state=RANDOM_STATE
    )
    for cls in class_values
]
balanced_df = pd.concat(per_class_samples)

# Shuffle so the classes are interleaved, then renumber the rows.
balanced_df = balanced_df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

column_names = list(range(balanced_df.shape[1] - 1)) + ['target']
balanced_df.columns = column_names

# Encode non-numeric labels as integers. The dtype test also covers pandas'
# dedicated string dtypes, not only `object`.
if not pd.api.types.is_numeric_dtype(balanced_df['target']):
    # Number the classes in `class_values` order so the encoding depends on the
    # data, not on which label happens to come first after the shuffle.
    label_mapping = {label: i for i, label in enumerate(class_values)}
    balanced_df['target'] = balanced_df['target'].map(label_mapping)
    print(f"Label mapping: {label_mapping}")

balanced_df.to_csv("data/ionosphere_balanced.csv", index=False)

print(f"Saved balanced dataset with shape: {balanced_df.shape}")
print(f"New class distribution:\n{balanced_df['target'].value_counts()}")

Original data shape: (351, 35)
Class distribution:
34
g    225
b    126
Name: count, dtype: int64
Label mapping: {'g': 0, 'b': 1}
Saved balanced dataset with shape: (70, 35)
New class distribution:
target
0    35
1    35
Name: count, dtype: int64
