In [1]:
# Show the configured DVC remotes, then fetch the DVC-tracked data files
# so the CSV read later in the notebook is present locally.
! dvc remote list
! dvc pull

storage gdrive://1UgEUcsPg79rPyW9VyM2CrPc1sorwE6RE      (default)
Everything is up to date.


In [2]:
import numpy as np
import pandas as pd
from pathlib import Path


# Project root: one level above the notebook's working directory.
ROOT = Path.cwd().parents[0]
# Directory holding the CSV files used by this notebook.
DATAPATH = ROOT.joinpath("pima_api", "data")

In [3]:
# Load the raw dataset and keep a plain-list snapshot of its column order;
# a later cell reorders this list to build the XGBoost layout.
pima = pd.read_csv(DATAPATH / "pima.csv")
columns = list(pima.columns)
pima.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Columns whose minimum is 0 are the ones whose zeros get replaced in the
# next cell. "Pregnancies" and "Outcome" are excluded from that treatment.
zero_allowed = ("Pregnancies", "Outcome")
cols_toreplace = [
    name
    for name in pima.columns
    if name not in zero_allowed and pima[name].min() == 0
]
cols_toreplace

['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

In [5]:
# Treat zeros in the selected columns as missing: blank them out, then fill
# each column with the median of its remaining values (NaN is skipped when
# the median is computed).
zero_masked = pima[cols_toreplace].mask(pima[cols_toreplace].eq(0))
pima[cols_toreplace] = zero_masked.fillna(zero_masked.median())

In [6]:
# Summary statistics, one row per column, to check the imputation result.
pima.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,121.65625,30.438286,44.0,99.75,117.0,140.25,199.0
BloodPressure,768.0,72.386719,12.096642,24.0,64.0,72.0,80.0,122.0
SkinThickness,768.0,29.108073,8.791221,7.0,25.0,29.0,32.0,99.0
Insulin,768.0,140.671875,86.38306,14.0,121.5,125.0,127.25,846.0
BMI,768.0,32.455208,6.875177,18.2,27.5,32.3,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


This data format suffices for the `random_forest` Classifier.

However, for the `xgboost` classifier the target column must come first, and the saved .csv file must not have a header row (AWS format). The corresponding `to_csv` arguments are:
- index=**False**
- index_label=**'Row'** (has no effect when `index=False`)
- header=**False**
- columns=*columns_in_the_right_order*

We can save the new .csv file in the *data* directory using the `pd.DataFrame.to_csv()` method, but we must also **track** it with DVC (or remove it) before pushing to GitHub.

In [7]:
# Move the target label to the front of the column list (in place), then
# display the resulting order as one comma-separated string.
columns.insert(0, columns.pop(columns.index("Outcome")))
",".join(columns)

'Outcome,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age'

In [8]:
# Target-first view of the cleaned data, in the order held by `columns`.
pima_xgb = pima.loc[:, columns]
pima_xgb.tail(5)

Unnamed: 0,Outcome,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
763,0,10,101.0,76.0,48.0,180.0,32.9,0.171,63
764,0,2,122.0,70.0,27.0,125.0,36.8,0.34,27
765,0,5,121.0,72.0,23.0,112.0,26.2,0.245,30
766,1,1,126.0,60.0,29.0,125.0,30.1,0.349,47
767,0,1,93.0,70.0,31.0,125.0,30.4,0.315,23


Optional:

In [None]:
# Opt-in export of the XGBoost-formatted CSV. Set the flag to True to write
# the file; it is off by default so that running the whole notebook never
# leaves an untracked file in the data directory.
WRITE_XGB_CSV = False

if WRITE_XGB_CSV:
    pima_xgb_path = DATAPATH / "pima-xgb.csv"
    # Target-first column order, no header row, no index column.
    # `index_label` is omitted: it only names the index column, which
    # `index=False` already drops from the output.
    pima_xgb.to_csv(
        pima_xgb_path,
        index=False,
        header=False,
        columns=columns,
    )

In [None]:
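# The export cell writes to DATAPATH (../pima_api/data relative to this
# notebook's working directory), so the path is given relative to that.
# NOTE(review): confirm against the repository's DVC layout before running.
# ! dvc add ../pima_api/data/pima-xgb.csv
# ! dvc push
# ! git add -A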
