In [1]:
# Enable IPython's autoreload extension in mode 2 so imported modules are
# reloaded before each cell runs (handy while the helper package is evolving).
%load_ext autoreload
%autoreload 2

In [2]:
! pip install -i https://test.pypi.org/simple/ my-krml-149874

Looking in indexes: https://test.pypi.org/simple/
Collecting my-krml-149874
  Downloading https://test-files.pythonhosted.org/packages/5f/03/69446b8f6ea157599ad7a5735fbcbd41cc5134d1ac2e4ae1aec23a739aca/my_krml_149874-0.1.10-py3-none-any.whl (17 kB)
Installing collected packages: my-krml-149874
Successfully installed my-krml-149874-0.1.10


In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the raw dataset; pandas reads the CSV straight out of the zip archive.
df = pd.read_csv('../data/raw/archive.zip')

In [5]:
# Preview the first rows to sanity-check column names and value formats.
df.head()

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0


In [6]:
# (rows, columns) of the raw frame.
df.shape

(308854, 19)

In [7]:
# Column dtypes and non-null counts (used to spot missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308854 entries, 0 to 308853
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   General_Health                308854 non-null  object 
 1   Checkup                       308854 non-null  object 
 2   Exercise                      308854 non-null  object 
 3   Heart_Disease                 308854 non-null  object 
 4   Skin_Cancer                   308854 non-null  object 
 5   Other_Cancer                  308854 non-null  object 
 6   Depression                    308854 non-null  object 
 7   Diabetes                      308854 non-null  object 
 8   Arthritis                     308854 non-null  object 
 9   Sex                           308854 non-null  object 
 10  Age_Category                  308854 non-null  object 
 11  Height_(cm)                   308854 non-null  float64
 12  Weight_(kg)                   308854 non-nul

In [8]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,Height_(cm),Weight_(kg),BMI,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
count,308854.0,308854.0,308854.0,308854.0,308854.0,308854.0,308854.0
mean,170.615249,83.588655,28.626211,5.096366,29.8352,15.110441,6.296616
std,10.658026,21.34321,6.522323,8.199763,24.875735,14.926238,8.582954
min,91.0,24.95,12.02,0.0,0.0,0.0,0.0
25%,163.0,68.04,24.21,0.0,12.0,4.0,2.0
50%,170.0,81.65,27.44,1.0,30.0,12.0,4.0
75%,178.0,95.25,31.85,6.0,30.0,20.0,8.0
max,241.0,293.02,99.33,30.0,120.0,128.0,128.0


In [9]:
# Work on a copy so the raw frame `df` stays untouched by later steps.
df_cleaned = df.copy()

In [10]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder

In [11]:
# Numeric features are taken to be every float64 column of the frame.
num_cols = df_cleaned.select_dtypes(include=['float64']).columns

In [12]:
# Unordered categorical columns that get one-hot encoded.
# 'Heart_Disease' is deliberately absent: it is the prediction target.
cat_cols = ['Arthritis', 'Depression', 'Diabetes', 'Exercise', 'Other_Cancer', 'Sex', 'Skin_Cancer', 'Smoking_History']

In [13]:
# Scaler used to standardise the numeric columns; it is persisted later for reuse.
scaler = StandardScaler()

In [14]:
# Standardise the numeric columns, then wrap the resulting array in a
# DataFrame that carries the original column labels.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/validation/test split further down — confirm this is intended.
scaled_values = scaler.fit_transform(df_cleaned[num_cols])
num_features = pd.DataFrame(scaled_values, columns=num_cols)

In [15]:
# Dense one-hot encoder; drop='first' removes one level per feature so the
# resulting dummy columns are not perfectly collinear.
ohe = OneHotEncoder(sparse_output=False, drop='first')

In [16]:
# Fit the encoder on the nominal columns and get the encoded array.
cat_features = ohe.fit_transform(df_cleaned[cat_cols])

In [17]:
# Re-wrap the encoded array as a DataFrame labelled with the encoder's output names.
cat_features = pd.DataFrame(cat_features, columns=ohe.get_feature_names_out())

In [18]:
# Ordinal-encode the age bands. No explicit category order is supplied, so the
# encoder derives one from the data.
# NOTE(review): verify the derived order matches the natural age order.
age_ord = OrdinalEncoder()
ord_age_feature = pd.DataFrame(
    age_ord.fit_transform(df_cleaned[['Age_Category']]),
    columns=['Age_Category'],
)

In [19]:
# Ordinal-encode self-reported health using an explicit worst-to-best ranking.
health_ord = OrdinalEncoder(
    categories=[['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']]
)
ord_health_feature = pd.DataFrame(
    health_ord.fit_transform(df_cleaned[['General_Health']]),
    columns=['General_Health'],
)

In [20]:
# Ordinal-encode the time since the last checkup, from most recent to never.
checkup_ord = OrdinalEncoder(
    categories=[[
        'Within the past year',
        'Within the past 2 years',
        'Within the past 5 years',
        '5 or more years ago',
        'Never',
    ]]
)
ord_checkup_feature = pd.DataFrame(
    checkup_ord.fit_transform(df_cleaned[['Checkup']]),
    columns=['Checkup'],
)

In [21]:
# Assemble the model matrix by placing the prepared pieces side by side:
# scaled numerics, one-hot columns, then the three ordinal columns.
# All pieces share the same default row index, so they align row for row.
features = pd.concat(
    [
        num_features,
        cat_features[ohe.get_feature_names_out()],
        ord_age_feature[['Age_Category']],
        ord_health_feature[['General_Health']],
        ord_checkup_feature[['Checkup']],
    ],
    axis=1,
)

In [22]:
# Persist the prepared feature matrix (without the row index) as an interim artefact.
features.to_csv('../data/interim/dataset_prepared.csv', index=False)

In [23]:
from joblib import dump

In [24]:
# Persist every fitted transformer so the same preprocessing can be re-applied
# later. Only the last call's return value is displayed as the cell output.
dump(scaler, '../models/scaler.joblib')
dump(ohe, '../models/ohe.joblib')
dump(age_ord, '../models/age_ord.joblib')
dump(health_ord, '../models/health_ord.joblib')
dump(checkup_ord, '../models/checkup_ord.joblib')

['../models/checkup_ord.joblib']

In [25]:
# Binary target: 'Yes' -> 1, 'No' -> 0. Any other value would map to NaN.
target = df_cleaned['Heart_Disease'].map({'Yes': 1, 'No': 0})

In [26]:
from my_krml_149874.data.sets import split_sets_random

# Split features/target into train, validation and test sets.
# NOTE(review): only test_ratio is set here; the validation share and any
# random seed come from split_sets_random's defaults — confirm reproducibility.
X_train, y_train, X_val, y_val, X_test, y_test = split_sets_random(features, target, test_ratio=0.2)

In [27]:
from my_krml_149874.data.sets import save_sets

# Hand the six splits to the project helper for saving under ../data/processed/.
# NOTE(review): file names and format are decided inside save_sets — verify.
save_sets(X_train, y_train, X_val, y_val, X_test, y_test, path='../data/processed/')

In [28]:
from my_krml_149874.models.null import NullClassifier

In [29]:
# Baseline classifier from the project package, used as a reference point.
base_model = NullClassifier()

In [30]:
# Fit the baseline on the training target and get its predictions. The warning
# in the recorded output shows it stores the mode of y as its prediction value.
y_base = base_model.fit_predict(y_train)

  self.pred_value = mode(y)[0][0]


In [31]:
from my_krml_149874.models.performance import print_classifier_scores

# Score the baseline on the training set. The recorded output (accuracy ~0.92,
# F1 of 0) is consistent with always predicting the majority class, so F1 is
# the more informative metric for this imbalanced target.
print_classifier_scores(y_preds=y_base, y_actuals=y_train, set_name='Training')

Accuracy Training: 0.9196112502158522
F1 Training: 0.0


In [32]:
from sklearn.linear_model import SGDClassifier

In [33]:
# Linear classifier trained by SGD on the logistic loss with elastic-net
# regularisation; early stopping is enabled and the seed is fixed.
model = SGDClassifier(
    loss='log_loss',
    penalty='elasticnet',
    max_iter=100000,
    early_stopping=True,
    random_state=42,
)

In [34]:
from my_krml_149874.models.performance import fit_assess_classifier

In [35]:
# Fit on the training set and print training/validation scores via the project
# helper. The name `model` is rebound to the helper's return value
# (presumably the fitted estimator — confirm), so re-running this cell refits.
model = fit_assess_classifier(model, X_train, y_train, X_val, y_val)

Accuracy Training: 0.9073184683128993
F1 Training: 0.3056959210898654
Accuracy Validation: 0.905845785239028
F1 Validation: 0.2992771084337349


In [36]:
# `dump` was already imported in an earlier cell; the repeat import is harmless.
from joblib import dump

# Persist the trained classifier next to the saved transformers.
dump(model,  '../models/sgd.joblib')

['../models/sgd.joblib']

In [40]:
git add .

SyntaxError: invalid syntax (3827820173.py, line 1)