In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [2]:
# importando as features importantes
features = pd.read_csv('~/breast_cancer/description/features_mais_relevantes.csv').columns.tolist()

# importando os dados "processados" e pegando apenas as features importantes
cancer_data = pd.read_csv('~/breast_cancer/data/processed/cancer_data.csv').set_index('id')[features]

mapper = {'M':1, 'B':0}
cancer_data['diagnosis'] = cancer_data['diagnosis'].apply(lambda x: mapper[x])

# importando os indices dos conjuntos de treino e de teste
training_set_index = pd.read_csv('~/breast_cancer/data/processed/train_index.csv', header=None).values.ravel()
testing_set_index = pd.read_csv('~/breast_cancer/data/processed/test_index.csv', header=None).values.ravel()

# Utilizando apenas o conjunto de treino, o conjunto de teste será utilizado à posteriori

In [3]:
training_data = cancer_data.loc[training_set_index]

print('A proporção das classes do conjunto de treino é dado por:')
print('(M) - {}'.format(len(training_data[training_data['diagnosis'] == 1])/len(training_data)))
print('(B) - {}'.format(len(training_data[training_data['diagnosis'] == 0])/len(training_data)))

A proporção das classes do conjunto de treino é dado por:
(M) - 0.388
(B) - 0.612


# Utilizando RandomForest para classificar, sem feature engineering, como baseline

In [4]:
NUM_OF_OBSERVATIONS_TO_TRAIN = 450
NUM_OF_OBSERVATIONS_TO_TEST  = 50

X_train = training_data[training_data.columns[1:]].iloc[:NUM_OF_OBSERVATIONS_TO_TRAIN]
y_train = training_data[training_data.columns[0]].iloc[:NUM_OF_OBSERVATIONS_TO_TRAIN]

X_val = training_data[training_data.columns[1:]].iloc[NUM_OF_OBSERVATIONS_TO_TRAIN:]
y_val = training_data[training_data.columns[0]].iloc[NUM_OF_OBSERVATIONS_TO_TRAIN:]

assert NUM_OF_OBSERVATIONS_TO_TEST + NUM_OF_OBSERVATIONS_TO_TRAIN == len(training_data)

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import auc, make_scorer, roc_auc_score, recall_score

NUM_OF_CROSS_VALIDATION_SETS = 3

random_forest_classifier_param_grid = {
    'criterion':['gini', 'entropy'],
    'n_estimators':np.arange(15,200,1),
    'min_samples_split':np.arange(2,7,1),
}

grid_search_cv = GridSearchCV(estimator=RandomForestClassifier(), 
                              param_grid=random_forest_classifier_param_grid, 
                              cv=NUM_OF_CROSS_VALIDATION_SETS,
                              scoring='recall',
                              verbose=True)

grid_search_cv.fit(X_train, y_train)

  _nan_object_mask = _nan_object_array != _nan_object_array


Fitting 3 folds for each of 1850 candidates, totalling 5550 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5550 out of 5550 | elapsed: 14.5min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'min_samples_split': array([2, 3, 4, 5, 6]), 'n_estimators': array([ 15,  16, ..., 198, 199]), 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='recall', verbose=True)

In [6]:
y_pred = grid_search_cv.predict(X_val)
results_dataframe = pd.DataFrame({'classes_reais':y_val, 'classes_previsao':y_pred})
results_dataframe.head()

Unnamed: 0_level_0,classes_previsao,classes_reais
id,Unnamed: 1_level_1,Unnamed: 2_level_1
8510824,0,0
90769602,0,0
881861,1,1
90439701,1,1
87106,0,0


# Carregando agora o conjunto nunca visto para testarmos o modelo

In [7]:
grid_search_cv.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=67, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [8]:
testing_data = cancer_data.loc[testing_set_index]

X_test = testing_data[testing_data.columns[1:]].values
y_test = testing_data[testing_data.columns[0]].values

y_real_test = testing_data['diagnosis'].values
y_pred_test = grid_search_cv.predict(X_test)

result_test_dataframe = pd.DataFrame({'classes reais':y_test, 'classes previstas':y_pred_test}, index=testing_data.index)

In [9]:
from sklearn.metrics import classification_report

print(classification_report(y_true=result_test_dataframe['classes reais'].values,
                            y_pred=result_test_dataframe['classes previstas'].values)
     )

              precision    recall  f1-score   support

           0       0.96      0.97      0.97       147
           1       0.95      0.93      0.94        81

   micro avg       0.96      0.96      0.96       228
   macro avg       0.95      0.95      0.95       228
weighted avg       0.96      0.96      0.96       228



In [10]:
result_test_dataframe

Unnamed: 0_level_0,classes previstas,classes reais
id,Unnamed: 1_level_1,Unnamed: 2_level_1
842517,1,1
84300903,1,1
843786,0,1
844359,1,1
844981,1,1
845636,0,1
846226,1,1
84667401,1,1
84799002,1,1
849014,1,1
