In [1]:
import numpy as np
import pandas as pd
import openml
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder

In [3]:
import pickle
# Restore a previously saved `data_list` from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load 'data_list.pkl' if it was produced by a trusted run of this notebook.
# `data_list` is assigned again further down from list_data(id_list), so which
# version later cells see depends on the order in which cells were executed.
with open('data_list.pkl', 'rb') as f:
    data_list = pickle.load(f)


In [6]:
# Unpack the first entry of data_list into the notebook-level names X and y,
# which the train/test split cell below consumes.
X, y = data_list[0]

In [37]:
from lr import LogisticRegression
from sklearn.model_selection import train_test_split
from optimizers import Adam
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Adam and LogisticRegression here are the project-local implementations
# imported from `optimizers` and `lr`, not the scikit-learn estimator.
optimizer = Adam(learning_rate=0.0001, beta1=0.9, beta2=0.999)
model = LogisticRegression(optimizer=optimizer, epochs=400)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Accuracy on the held-out 20% split.
print('Accuracy:', accuracy_score(y_test, y_pred))

ImportError: cannot import name 'Adam' from 'optimizers' (unknown location)

In [21]:
from sklearn.linear_model import LogisticRegression as LR
# Baseline: scikit-learn's LogisticRegression (imported as LR) with default
# settings, fitted on the X_train/y_train split created in the earlier cell.
# This rebinds `model` and `y_pred`, replacing the custom model's objects.
model = LR()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# accuracy_score comes from the earlier sklearn.metrics import.
print('Accuracy:', accuracy_score(y_test, y_pred))

Accuracy: 0.7467532467532467


## Datasets

Small datasets:
- 37: diabetes
- 1462: banknote-authentication
- 871: pollen

Large datasets:
- 752: puma32H
- 1120: MagicTelescope
- 23512: higgs
- 23517: numerai28.6
- 979: waveform-5000
- 1487: ozone-level-8hr

In [69]:
def read_data(id):
    """Download one OpenML dataset and return it as ``(X, y)`` NumPy arrays.

    Parameters
    ----------
    id : int
        OpenML dataset id passed to ``openml.datasets.get_dataset``.

    Returns
    -------
    X : numpy.ndarray
        The numeric columns of the dataset, excluding the target column.
    y : numpy.ndarray
        1-D array of integer class codes produced by ``LabelEncoder``.

    Raises
    ------
    ValueError
        If the dataset declares no usable default target and the target
        cannot be identified unambiguously as the single non-numeric column.
    """
    dataset = openml.datasets.get_dataset(id)
    df, _, _, _ = dataset.get_data(dataset_format="dataframe")

    # Prefer the target declared in the dataset's own metadata. Guessing by
    # dtype alone breaks when a dataset has non-numeric feature columns and
    # silently leaves a numeric target inside X.
    declared_target = getattr(dataset, "default_target_attribute", None)
    if declared_target in df.columns:
        target_col = declared_target
    else:
        # Fallback: the original heuristic, but only when it is unambiguous.
        non_numeric_cols = df.select_dtypes(exclude='number').columns
        if len(non_numeric_cols) != 1:
            raise ValueError(
                f"Cannot infer the target column of dataset {id}: expected exactly "
                f"one non-numeric column, found {len(non_numeric_cols)}."
            )
        target_col = non_numeric_cols[0]

    X = df.drop(columns=[target_col]).select_dtypes(include='number').to_numpy()
    # Pass a 1-D Series: LabelEncoder expects a 1-D label array, not a DataFrame.
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df[target_col])
    return X, y

def list_data(id_list):
    """Load every dataset named in ``id_list``.

    Returns a list with one ``read_data`` result per id, in the same order
    as the ids were given.
    """
    return [read_data(dataset_id) for dataset_id in id_list]

In [70]:
# OpenML ids of the benchmark datasets, in the order listed under "Datasets".
id_list = [37, 1462, 871, 752, 1120, 23512, 23517, 979, 1487] #small datasets first
# One (X, y) tuple per id; this rebinds `data_list`, replacing any value it
# held before this cell ran.
data_list = list_data(id_list)

## Preprocessing

Missing-value imputation: datasets containing missing values have them replaced by the corresponding column mean.

In [72]:
# Report missing values for every dataset and mean-impute the ones that have any.
# The loop variables keep their names because, as notebook globals, they stay
# visible to later cells.
for i, (X, y) in enumerate(data_list):
    X, y = pd.DataFrame(X), pd.DataFrame(y)
    missing_X = X.isnull().sum().sum()
    missing_y = y.isnull().sum().sum()

    # Counts are never negative, so "both zero" is exactly the complement of
    # "at least one is positive".
    if missing_X == 0 and missing_y == 0:
        print(f"Dataset {i+1} has no missing values.")
        continue

    print(f"Dataset {i+1} has missing values:")
    print(f"Missing values in X: {missing_X}")
    print(f"Missing values in y: {missing_y}")

    # Column-wise mean imputation.
    # NOTE(review): this would also mean-fill the labels if y ever had gaps,
    # which would yield fractional class codes - confirm that is intended.
    X = X.fillna(X.mean())
    y = y.fillna(y.mean())

    # Only datasets that needed imputation are written back.
    data_list[i] = (X.to_numpy(), y.to_numpy().flatten())

Dataset 1 has no missing values.
Dataset 2 has no missing values.
Dataset 3 has no missing values.
Dataset 4 has no missing values.
Dataset 5 has no missing values.
Dataset 6 has missing values:
Missing values in X: 9
Missing values in y: 0
Dataset 7 has no missing values.
Dataset 8 has no missing values.
Dataset 9 has no missing values.


Removing highly correlated columns: when two feature columns have an absolute correlation above 0.8, the later one is dropped.

In [73]:
def remove_cols(df, threshold=0.8):
    """Drop columns that are highly correlated with an earlier column.

    For every pair of columns the absolute correlation from ``df.corr()`` is
    inspected once (strict upper triangle), and the later column of any pair
    exceeding ``threshold`` is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of feature columns. It is modified in place.
    threshold : float, default 0.8
        A column is dropped when its absolute correlation with any preceding
        column is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the selected columns removed.
    """
    corr_matrix = df.corr().abs()
    # Keep only entries above the diagonal so each pair is considered once and
    # a column is never compared with itself. The builtin ``bool`` is used as
    # dtype because the ``np.bool`` alias was deprecated in NumPy 1.20 and
    # removed in 1.24, where it raises AttributeError.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)
    # Masked-out cells are NaN; NaN > threshold is False, so they never match.
    to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]
    df.drop(columns=to_drop, inplace=True)
    return df

In [74]:
# Drop highly correlated feature columns from every dataset and write the
# reduced arrays back into data_list. The loop variables keep their names
# because, as notebook globals, they stay visible to later cells.
for i, (X, y) in enumerate(data_list):
    print(f"Checking dataset {i+1} for highly correlated columns...")
    X = pd.DataFrame(X)
    y = pd.DataFrame(y, columns=['y'])
    data = pd.concat([X, y], axis=1)

    # remove_cols mutates the frame it is given, so hand it the fresh copy
    # returned by drop() rather than `data` itself.
    feature_frame = data.drop(columns=['y'])
    X_cleaned = remove_cols(feature_frame, threshold=0.8)
    y_cleaned = data['y']

    # X still has every original column, so the difference is the drop count.
    n_removed = X.shape[1] - X_cleaned.shape[1]
    data_list[i] = (X_cleaned.to_numpy(), y_cleaned.to_numpy().flatten())
    print(f"{n_removed} highly correlated columns removed.")

Checking dataset 1 for highly correlated columns...
0 highly correlated columns removed.
Checking dataset 2 for highly correlated columns...
0 highly correlated columns removed.
Checking dataset 3 for highly correlated columns...
1 highly correlated columns removed.
Checking dataset 4 for highly correlated columns...
0 highly correlated columns removed.
Checking dataset 5 for highly correlated columns...
2 highly correlated columns removed.
Checking dataset 6 for highly correlated columns...
1 highly correlated columns removed.
Checking dataset 7 for highly correlated columns...
8 highly correlated columns removed.
Checking dataset 8 for highly correlated columns...
0 highly correlated columns removed.
Checking dataset 9 for highly correlated columns...
57 highly correlated columns removed.
