In [3]:
import numpy as np
import pandas as pd
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

df = pd.read_csv(r"../data/diabetes.csv")

df['Glucose'].replace(0, np.nan, inplace=True)
df['BloodPressure'].replace(0, np.nan, inplace=True)
df['SkinThickness'].replace(0, np.nan, inplace=True)
df['Insulin'].replace(0, np.nan, inplace=True)
df['BMI'].replace(0, np.nan, inplace=True)
df.drop(['SkinThickness' ,'Insulin'], axis=1, inplace=True)
df = df.dropna(axis=0)

cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'BMI',
       'DiabetesPedigreeFunction', 'Age', 'Outcome']

df_outcome_1 = df[df['Outcome'] == 1].copy()
i = len(df_outcome_1)
df_outcome_0 = df[df['Outcome'] == 0].sample(i, random_state=1)
df_balanced = df_outcome_0.append(df_outcome_1)
df_balanced = df[cols]


y = df_balanced['Outcome']
X = df_balanced.drop('Outcome', axis=1)
summary = X.describe()


summary.to_dict('dict')

summary = summary.transpose()
summary = summary[['mean','25%','75%']].round(decimals=3).transpose()
summary.to_dict('dict')

{'Pregnancies': {'mean': 3.866, '25%': 1.0, '75%': 6.0},
 'Glucose': {'mean': 121.883, '25%': 99.75, '75%': 142.0},
 'BloodPressure': {'mean': 72.401, '25%': 64.0, '75%': 80.0},
 'BMI': {'mean': 32.467, '25%': 27.5, '75%': 36.6},
 'DiabetesPedigreeFunction': {'mean': 0.475, '25%': 0.245, '75%': 0.628},
 'Age': {'mean': 33.351, '25%': 24.0, '75%': 41.0}}

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2
                                                    , stratify=y, random_state=1)
print('Set de entrenamiento: X:{}, y:{}'.format(X_train.shape, y_train.shape))
print('Set de prueba: X:{}, y:{}'.format(X_test.shape, y_test.shape))

model = ensemble.RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Precisión : {}'.format(accuracy_score(y_test, y_pred)))

clf_report = classification_report(y_test, y_pred)
print('Informe de clasificación')
print("---------------------")
print(clf_report)
print("_____________________")

joblib.dump(model,r"../diabetes_model.pkl")