# Modelos predictivos para el trabajo de SIT1

In [124]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE, ADASYN

In [129]:
def load_data(path='cervix_cancer/risk_factors_cervical_cancer.csv'):
    """Load the cervical-cancer risk-factor table and split features from targets.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the comma-separated data file. Defaults to the copy kept
        next to the notebook for the prototype version.

    Returns
    -------
    features : pandas.DataFrame
        Every column of the file except the four target columns.
    labels : dict of str -> pandas.Series
        The target columns keyed by name: 'Hinselmann', 'Schiller',
        'Citology' and 'Biopsy'.
    """
    target_names = ['Hinselmann', 'Schiller', 'Citology', 'Biopsy']

    # The file marks missing entries with '?'. Declaring it as an NA marker
    # at parse time yields numeric columns holding NaN, instead of string
    # columns that need a second replace pass.
    data = pd.read_csv(path, sep=',', na_values='?')

    # Keep each target as its own Series so a caller can pick one to predict.
    labels = {name: data[name] for name in target_names}

    # Return the features as a new frame without the target columns.
    features = data.drop(columns=target_names)
    return features, labels

In [16]:
def preprocess(df, imputer=None):
    """Fill missing values (NaN) in ``df`` using a median imputer.

    Parameters
    ----------
    df : array-like or pandas.DataFrame
        Feature table handed to the imputer's ``fit``/``transform``.
    imputer : object with a ``transform`` method, optional
        An already fitted imputer. When omitted, a new median
        ``SimpleImputer`` is fitted on ``df``. Pass the imputer returned for
        the training data here to transform test data with the same
        statistics.

    Returns
    -------
    transformed
        Whatever ``imputer.transform(df)`` returns.
    imputer
        The imputer that was used (newly fitted or the one passed in).
    """
    # Identity test on purpose: `== None` would go through the object's
    # __eq__ and could wrongly discard a caller-supplied imputer.
    if imputer is None:
        # Per the original author's note, this dataset has no categorical
        # variables, so one median strategy is applied to every column.
        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        imputer.fit(df)
    return imputer.transform(df), imputer

In [132]:
def main(pca_f=False, n_runs=100, test_size=0.1, n_components=20,
         n_estimators=50, random_state=None):
    """Print the mean test ROC AUC of a random forest on the 'Biopsy' target.

    Every run draws a new stratified train/test split, imputes missing values
    with an imputer fitted on the training part only, oversamples the
    training part with SMOTE, optionally projects both parts with PCA, fits a
    random forest and scores the held-out part.

    Parameters
    ----------
    pca_f : bool, optional
        When true, apply PCA (fitted on the resampled training data) before
        fitting the classifier.
    n_runs : int, optional
        Number of independent split/fit/score repetitions to average.
    test_size : float, optional
        Fraction of the samples held out for testing in each run.
    n_components : int, optional
        Number of principal components kept when ``pca_f`` is true.
    n_estimators : int, optional
        Number of trees in the random forest.
    random_state : int or None, optional
        Seed for every stochastic step (split, SMOTE, PCA, forest). ``None``
        leaves each library on its own default randomness, exactly as if no
        seed had been passed.
    """
    df, labels = load_data()
    target = labels['Biopsy']
    print('Number of features:', df.shape[1])
    print('Number of samples:', df.shape[0])

    # One generator shared by all stochastic steps: runs still differ from
    # each other, but the whole sequence is repeatable from a single seed.
    rng = None if random_state is None else np.random.RandomState(random_state)

    aucs = []
    for _run in range(n_runs):
        # Stratified split keeps the class ratio equal in both parts.
        X_train, X_test, y_train, y_test = train_test_split(
            df, target, test_size=test_size, stratify=target,
            random_state=rng)

        # Fit the imputer on the training part only, then reuse it on test.
        X_train, imputer = preprocess(X_train)
        X_test, _ = preprocess(X_test, imputer)

        # Oversample the training part only; the test part stays untouched.
        X_train, y_train = SMOTE(random_state=rng).fit_resample(X_train, y_train)

        # Optional dimensionality reduction, fitted on the training part.
        if pca_f:
            pca = PCA(n_components=n_components, random_state=rng)
            X_train = pca.fit_transform(X_train)
            X_test = pca.transform(X_test)

        # Build and train the model.
        rf = RandomForestClassifier(n_estimators=n_estimators, random_state=rng)
        rf.fit(X_train, y_train)

        # Score with the predicted probability of the positive class.
        probabilities = rf.predict_proba(X_test)[:, 1]
        aucs.append(roc_auc_score(y_true=y_test, y_score=probabilities))
    print('Media de aucs:', np.mean(aucs))
        
# Run the evaluation with the PCA step enabled.
main(pca_f=True)

Number of features: 32
Number of samples: 858
Media de aucs: 0.6373125000000001
