### Importing dependencies

In [50]:
import pandas as pd

from src.data.load_data import load_data_csv
from src.features.features import preprocessing_data
from src.models.save import save_model
from src.models.test import accuracy_model, recall_model, precision_model
from src.models.train import train_svm_model, train_logistic_reg_model, train_knn_model, train_random_forest_model, \
    fine_tuning_model

### Model Training

In [2]:
# Relative path to the cleaned obesity dataset produced by the processing step.
clean_csv_path = "../../data/processed/obesity_data_clean.csv"
obesity_data = load_data_csv(clean_csv_path)

In [3]:
# Preview the first rows to confirm the expected feature columns and the
# encoded "NObeyesdad" column are present after loading.
obesity_data.head()

Unnamed: 0,Age,Height,Weight,family_history_with_overweight,NObeyesdad
0,21,1.62,64.0,1,1
1,21,1.52,56.0,1,1
2,23,1.8,77.0,1,1
3,27,1.8,87.0,0,5
4,22,1.78,89.8,0,6


In [4]:
# preprocessing_data returns three (features, labels) pairs; "NObeyesdad" is
# passed as its second argument (presumably the target column name -- confirm
# in src.features.features).
train_split, val_split, test_split = preprocessing_data(obesity_data, "NObeyesdad")

X_train, y_train = train_split
X_val, y_val = val_split
X_test, y_test = test_split

In [5]:
# Sanity check: shapes of the training features and training labels.
X_train.shape, y_train.shape

((1669, 4), (1669,))

In [6]:
# Sanity check: shapes of the held-out test features and test labels.
X_test.shape, y_test.shape

((209, 4), (209,))

In [7]:
# Sanity check: shapes of the validation features and validation labels.
X_val.shape, y_val.shape

((209, 4), (209,))

In [8]:
# Fit the four candidate classifiers on the same training split. The calls are
# evaluated left to right, i.e. SVM, logistic regression, k-NN, random forest.
svm_model, log_reg_model, knn_model, rnd_forest_model = (
    train_svm_model(X_train, y_train),
    train_logistic_reg_model(X_train, y_train),
    train_knn_model(X_train, y_train),
    train_random_forest_model(X_train, y_train),
)

### Validation

In [9]:
# Accuracy of each fitted model on the data it was trained on (used later to
# compare against validation accuracy).
acc_svm_tr, acc_log_reg_tr, acc_knn_tr, acc_rnd_forest_tr = (
    accuracy_model(candidate, X_train, y_train)
    for candidate in (svm_model, log_reg_model, knn_model, rnd_forest_model)
)

In [10]:
# Accuracy of each fitted model on the validation split.
acc_svm_val, acc_log_reg_val, acc_knn_val, acc_rnd_forest_val = (
    accuracy_model(candidate, X_val, y_val)
    for candidate in (svm_model, log_reg_model, knn_model, rnd_forest_model)
)

In [11]:
# Accuracy comparison table: one row per split, one column per model.
# The value order in each row must match the column order below.
model_columns = ["svm", "knn", "log_reg", "rnd_forest"]
train_row = [acc_svm_tr, acc_knn_tr, acc_log_reg_tr, acc_rnd_forest_tr]
val_row = [acc_svm_val, acc_knn_val, acc_log_reg_val, acc_rnd_forest_val]

pd.DataFrame(data=[train_row, val_row], index=["acc_train", "acc_val"], columns=model_columns)

Unnamed: 0,svm,knn,log_reg,rnd_forest
acc_train,0.893349,0.943679,0.653086,1.0
acc_val,0.880383,0.909091,0.674641,0.961722


### Model fine tuning
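Random Forest achieves the highest validation accuracy of the four models (0.962). Note, however, that the cell below tunes the Logistic Regression model (`log_reg_model`) using a logistic-regression parameter grid.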

In [37]:
# Hyper-parameter search space for the logistic-regression model.
#
# scikit-learn's LogisticRegression only accepts certain solver/penalty
# pairings, so the space is a list of sub-grids rather than one Cartesian
# product. A single product of every penalty with every solver contains many
# invalid candidates (e.g. lbfgs + l1, liblinear + elasticnet/None, elasticnet
# without l1_ratio), and each one costs a failed fit per CV fold.
#
# NOTE(review): assumes fine_tuning_model forwards `param_grid` unchanged to a
# scikit-learn search object (e.g. GridSearchCV), which accepts a list of
# dicts -- confirm in src.models.train.
# NOTE(review): the validation table ranks rnd_forest highest, yet the model
# tuned here is log_reg_model -- confirm that this is intentional.
c_values = [0.001, 0.01, 0.1, 1, 10, 100]
max_iter_values = [100, 200, 500]

param_grid = [
    # liblinear supports the l1 and l2 penalties only.
    {
        'solver': ['liblinear'],
        'penalty': ['l1', 'l2'],
        'C': c_values,
        'max_iter': max_iter_values,
    },
    # lbfgs supports l2 (the unpenalised case is covered further down).
    {
        'solver': ['lbfgs'],
        'penalty': ['l2'],
        'C': c_values,
        'max_iter': max_iter_values,
    },
    # saga supports l1 and l2 ...
    {
        'solver': ['saga'],
        'penalty': ['l1', 'l2'],
        'C': c_values,
        'max_iter': max_iter_values,
    },
    # ... and is the only solver for elasticnet, which needs an l1_ratio.
    {
        'solver': ['saga'],
        'penalty': ['elasticnet'],
        'l1_ratio': [0.25, 0.5, 0.75],
        'C': c_values,
        'max_iter': max_iter_values,
    },
    # Unpenalised fit: C is ignored when penalty is None, so it is not varied.
    {
        'solver': ['lbfgs', 'saga'],
        'penalty': [None],
        'max_iter': max_iter_values,
    },
]

# 4-fold cross-validated search over the grid, starting from log_reg_model.
grid_model = fine_tuning_model(log_reg_model, X_train, y_train, param_grid, cv=4)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [38]:
# Keep the estimator exposed by the search object as `best_estimator_`; it is
# the model evaluated and saved in the cells that follow.
final_model = grid_model.best_estimator_

### Model evaluation

In [39]:
# Accuracy of the tuned model on the training split and on the held-out test split.
acc_train, acc_test = (
    accuracy_model(final_model, features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [40]:
# Recall of the tuned model on the training split and on the held-out test split.
recall_train, recall_test = (
    recall_model(final_model, features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [41]:
# Precision of the tuned model on the training split and on the held-out test split.
prec_train, prec_test = (
    precision_model(final_model, features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [42]:
# Final metrics table: one row per split, one column per metric.
metric_columns = ["accuracy", "recall", "precision"]
train_metrics = [acc_train, recall_train, prec_train]
test_metrics = [acc_test, recall_test, prec_test]

pd.DataFrame(data=[train_metrics, test_metrics], index=["train", "test"], columns=metric_columns)

Unnamed: 0,accuracy,recall,precision
train,0.947274,0.947274,0.947325
test,0.952153,0.952153,0.952553


In [52]:
# Persist the tuned model under the name "obesity_pred_model".
# NOTE(review): the output location and file format are decided inside
# src.models.save.save_model -- not visible from this notebook.
save_model(final_model, "obesity_pred_model")

### Prediction test

In [43]:
# Take rows at positions 20 through 30 (11 rows) of the loaded frame as ad-hoc
# prediction inputs. These rows come straight from `obesity_data`, not from the
# splits returned by preprocessing_data.
samples = obesity_data.iloc[20:31]
samples

Unnamed: 0,Age,Height,Weight,family_history_with_overweight,NObeyesdad
20,22,1.65,80.0,1,6
21,52,1.69,87.0,1,2
22,22,1.65,60.0,1,1
23,22,1.6,82.0,1,2
24,21,1.85,68.0,1,1
25,20,1.6,50.0,1,1
26,21,1.7,65.0,1,1
27,23,1.6,52.0,0,1
28,19,1.75,76.0,1,1
29,23,1.68,70.0,0,1


In [44]:
# Remove the label column so only the feature columns remain. `columns=`
# already identifies the axis, so no separate axis argument is needed.
label_columns = ["NObeyesdad"]
samples = samples.drop(columns=label_columns)

In [45]:
# Convert the feature frame to a plain NumPy array for prediction.
# Gotcha: this rebinds `samples` from a DataFrame to an ndarray, so this cell
# (and the drop cell before it) cannot be re-run without first recreating
# `samples` from `obesity_data`.
samples = samples.to_numpy()

In [46]:
# Confirm the array is (n_samples, n_features) with the label column removed.
samples.shape

(11, 4)

In [51]:
# Predict a class for every sample row and print the resulting label array.
# NOTE(review): the recorded output is the same class for all rows even though
# the true labels differ. `samples` is taken directly from `obesity_data`;
# presumably the model expects features transformed the same way
# preprocessing_data transforms the training data -- confirm and apply that
# transform before predicting.
predicted_labels = final_model.predict(samples)
print(predicted_labels)

[4 4 4 4 4 4 4 4 4 4 4]
