In [32]:
import numpy as np
import pandas as pd
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [33]:
# Load the raw CSV, then discard its first and last columns by position.
df = pd.read_csv(r"../data/cancer.csv")
df = df.drop(columns=df.columns[[0, -1]])

# Separate the feature columns from the 'diagnosis' target column.
ydata = df['diagnosis']
Xdata = df.drop(columns=['diagnosis'])

In [34]:
# Binary-encode the target: 'M' becomes 1, every other value becomes 0.
yenc = np.where(ydata == 'M', 1, 0)

# Feature columns retained for the model.
cols = [
    'concave points_mean',
    'area_mean',
    'radius_mean',
    'perimeter_mean',
    'concavity_mean',
]

In [35]:
# Restrict the feature matrix to the columns listed in `cols`
# and echo the resulting column index as a sanity check.
Xdata = df.loc[:, cols]
print(Xdata.columns)

Index(['concave points_mean', 'area_mean', 'radius_mean', 'perimeter_mean',
       'concavity_mean'],
      dtype='object')


In [36]:
# Per-feature summary limited to the mean and the 25% / 75% quantiles,
# rounded to three decimals.  describe() already puts statistics on the rows
# and features on the columns, so the wanted rows can be selected directly
# instead of transposing, selecting columns and transposing back.
summary = Xdata.describe().loc[['mean', '25%', '75%']].round(decimals=3)

# Last expression of the cell: displayed as {feature: {statistic: value}}.
summary.to_dict('dict')

{'concave points_mean': {'mean': 0.049, '25%': 0.02, '75%': 0.074},
 'area_mean': {'mean': 654.889, '25%': 420.3, '75%': 782.7},
 'radius_mean': {'mean': 14.127, '25%': 11.7, '75%': 15.78},
 'perimeter_mean': {'mean': 91.969, '25%': 75.17, '75%': 104.1},
 'concavity_mean': {'mean': 0.089, '25%': 0.03, '75%': 0.131}}

In [38]:
# One seed shared by the split and the estimator so a fresh
# "Restart & Run All" reproduces the same model and the same metrics.
SEED = 43

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(Xdata, yenc,
                                                    test_size=0.3,
                                                    random_state=SEED)
print('Set de entrenamiento: X:{}, y:{}'.format(X_train.shape, y_train.shape))
print('Set de prueba: X:{}, y:{}'.format(X_test.shape, y_test.shape))

# Random forests are stochastic (bootstrap samples, random feature subsets);
# without random_state the fitted model and the scores below change per run.
model = ensemble.RandomForestClassifier(random_state=SEED)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# NOTE: the label printed here reads 'Precisión', but the value is the overall
# accuracy from accuracy_score, not the per-class precision in the report below.
print('Precisión : {}'.format(accuracy_score(y_test, y_pred)))

# Per-class precision / recall / F1 on the held-out set.
clf_report = classification_report(y_test, y_pred)
print('Informe de clasificación')
print("---------------------")
print(clf_report)
print("_____________________")

# Persist the fitted model; joblib.dump's return value is the cell's output.
joblib.dump(model,r"../cancer_model.pkl")

Set de entrenamiento: X:(398, 5), y:(398,)
Set de prueba: X:(171, 5), y:(171,)
Precisión : 0.9415204678362573
Informe de clasificación
---------------------
              precision    recall  f1-score   support

           0       0.96      0.95      0.96       114
           1       0.90      0.93      0.91        57

    accuracy                           0.94       171
   macro avg       0.93      0.94      0.93       171
weighted avg       0.94      0.94      0.94       171

_____________________


['../cancer_model.pkl']