In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
import warnings
import os
# Switch the working directory to the project's src/ folder before importing
# the project-local modules below.
# NOTE(review): presumably this is what makes `prepare_data`, `train` and
# `preprocess` importable, and it may also be what lets relative files
# resolve — confirm before reordering or removing it.
# The absolute path ties the notebook to this particular workspace layout.
os.chdir(r'/workspaces/demo-repo/src/')

# Project-local pipeline modules used by the cells further down.
import prepare_data, train, preprocess

# Notebook-wide output settings.
# Suppress every warning so cell output is not cluttered by library notices.
warnings.filterwarnings('ignore')

# Render floats with thousands separators and two decimals, and lift pandas'
# column/row truncation limits so displayed frames are shown in full.
pd.set_option('display.float_format', "{:,.2f}".format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [6]:
# Read the raw diabetes CSV straight from disk for a first look at the data.
# `df` is rebound further down by `prepare_data.prepare(...)`, so this frame
# is only used by the inspection cells that immediately follow.
raw_csv_path = "/workspaces/demo-repo/data/diabetes.csv"
df = pd.read_csv(raw_csv_path)
# Preview the first rows.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.63,50,1
1,1,85,66,29,0,26.6,0.35,31,0
2,8,183,64,0,0,23.3,0.67,32,1
3,1,89,66,23,94,28.1,0.17,21,0
4,0,137,40,35,168,43.1,2.29,33,1


In [7]:
# (rows, columns) of the raw frame loaded above.
df.shape

(768, 9)

In [8]:
# Per-column count of missing values in the raw frame.
# `isna` is the primary name of the method that `isnull` aliases.
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [9]:
# Load the pipeline configuration through the project helper. The result is
# indexed by dataset name in the next cell (`config['diabetes']`).
# NOTE(review): the debug log mentions config.yml; where that file is looked
# up is not visible from this notebook — confirm in prepare_data.
config = prepare_data.load_config()

2023-08-25 22:38:07,856 - DEBUG - loading config.yml...


In [11]:
# Name of the model to run; passed to the prepare / preprocess / train calls
# in the cells below.
model = 'LogisticRegression'
# Settings for the diabetes dataset only (input file, target, transforms,
# per-model parameters), as shown when `ds_config` is displayed below.
ds_config = config['diabetes']

In [12]:
# Display the dataset configuration that drives the rest of the notebook.
ds_config

{'name': 'diabetes',
 'input_file': {'name': 'diabetes.csv',
  'path': '/workspaces/demo-repo/data',
  'drop_nulls': ['outcome']},
 'target': 'outcome',
 'transform': {'glucose': 'MinMaxScaler', 'age': 'log'},
 'models': {'LogisticRegression': {'params': {'C': 1,
    'class_weight': 'balanced',
    'penalty': 'l1',
    'max_iter': 2000,
    'random_state': 42,
    'solver': 'liblinear'},
   'cv': {'penalty': ['l2', 'l1'],
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs', 'saga'],
    'max_iter': [2000, 5000, 10000],
    'class_weight': ['balanced', 'None'],
    'random_state': [42]}}}}

In [14]:
# Show the fixed hyper-parameters configured for the selected model, if any.
# The lookup is keyed on `model` (set in an earlier cell) rather than a
# hard-coded model name, so this check always inspects the same model that is
# handed to the prepare / preprocess / train calls below.
model_config = ds_config['models'][model]
if 'params' in model_config:
    print(model_config['params'])

{'C': 1, 'class_weight': 'balanced', 'penalty': 'l1', 'max_iter': 2000, 'random_state': 42, 'solver': 'liblinear'}


In [15]:
# Build the working frame through the project pipeline. This rebinds `df`,
# replacing the raw frame read with pd.read_csv earlier in the notebook.
# Per the debug log, this step loads the dataset, lowercases the column names
# and drops NaN rows for the configured columns.
df = prepare_data.prepare(model, ds_config)
# Preview the prepared frame.
df.head()

2023-08-25 22:38:48,895 - DEBUG - loading dataset diabetes.csv...
2023-08-25 22:38:48,918 - DEBUG - convert all column names to lowercase...
2023-08-25 22:38:48,923 - DEBUG - dropping NaN from ['outcome']...


Unnamed: 0,pregnancies,glucose,bloodpressure,skinthickness,insulin,bmi,diabetespedigreefunction,age,outcome
0,6,148,72,35,0,33.6,0.63,50,1
1,1,85,66,29,0,26.6,0.35,31,0
2,8,183,64,0,0,23.3,0.67,32,1
3,1,89,66,23,94,28.1,0.17,21,0
4,0,137,40,35,168,43.1,2.29,33,1


In [16]:
# Apply the configured column transforms only when the dataset config
# declares a 'transform' section.
# NOTE(review): `df` is overwritten with the result, so re-running this cell
# without re-running the prepare cell presumably applies the transforms a
# second time to already-transformed data — confirm against
# preprocess.apply_preprocessing.
if 'transform' in ds_config:
    df = preprocess.apply_preprocessing(df, model, ds_config)

2023-08-25 22:39:04,502 - DEBUG - performing MinMaxScaler transformation on glucose...
2023-08-25 22:39:04,509 - DEBUG - performing log transformation on age...


In [17]:
# Run training and evaluation for the selected model on the processed frame.
# Per the debug log, this splits the data into train and test sets and
# reports accuracy for the fitted estimator.
train.train_and_predict(df, model, ds_config)

2023-08-25 22:39:24,010 - DEBUG - loading prepared data for training...
2023-08-25 22:39:24,012 - DEBUG - splitting train and test...
2023-08-25 22:39:24,127 - DEBUG - accuracy for LogisticRegression(C=1, class_weight='balanced', max_iter=2000, penalty='l1',
                   random_state=42, solver='liblinear'): 0.8020833333333334
