In [146]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Load the raw data set and immediately drop the columns that are not used
# in the rest of the notebook.
dane = pd.read_csv('cervical-cancer_csv.csv').drop(
    columns=[
        'STDs:cervical condylomatosis',
        'STDs:vaginal condylomatosis',
        'STDs:pelvic inflammatory disease',
        'STDs:genital herpes',
        'STDs:molluscum contagiosum',
        'STDs:AIDS',
        'STDs:Hepatitis B',
        'STDs:HPV',
        'Dx:CIN',
    ]
)

# Missing-value handling and encoding of categorical variables
def column_nodata(df, column_name):
    """Flag missing values in a column and then fill them with 0.

    A new indicator column named ``column_name + "_null"`` is added to ``df``
    (1 where the original value is null, 0 otherwise), after which the nulls
    in ``column_name`` itself are replaced with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is changed in place.
    column_name : str
        Name of an existing column of ``df``.

    Returns
    -------
    None
    """
    # Vectorised null check instead of a Python-level lambda per element.
    df[column_name + "_null"] = df[column_name].isnull().astype(int)
    # The indicator must be computed before the nulls are overwritten.
    df[column_name] = df[column_name].fillna(0)

def replace_in_column(df, column_name, src, dst):
    """Replace the value(s) ``src`` with ``dst`` in one column of ``df``.

    ``df`` is modified in place (the column is reassigned); nothing is
    returned. ``src`` and ``dst`` are passed to ``Series.replace`` as the
    value(s) to look for and the replacement, respectively.
    """
    updated = df[column_name].replace(src, dst)
    df[column_name] = updated

# Merge the values 3 and 4 of 'STDs (number)' into 2, and the values 2 and 3
# of 'STDs: Number of diagnosis' into 1.
replace_in_column(dane, 'STDs (number)', [3, 4], 2)
replace_in_column(dane, 'STDs: Number of diagnosis', [2, 3], 1)

# Columns that get a "<name>_null" indicator and have their nulls set to 0.
nodata_categories = [
    'Smokes',
    'Hormonal Contraceptives',
    'IUD',
    'STDs',
    'STDs (number)',
    'STDs:condylomatosis',
    'STDs:vulvo-perineal condylomatosis',
    'STDs:syphilis',
    'STDs:HIV',
]

for category in nodata_categories:
    column_nodata(dane, category)

# One-hot encode 'STDs (number)': the dummy columns are appended at the end
# and the original column is removed.
dane = pd.concat(
    [
        dane.drop(columns=['STDs (number)']),
        pd.get_dummies(dane['STDs (number)'], prefix='STDs_'),
    ],
    axis=1,
)

# Scaling of the numerical features. MinMaxScaler performs min-max scaling,
# not standardisation.
# NOTE(review): the scaler is fitted on the whole data set, before the
# train/test split done later in the notebook, so test-set minima/maxima
# influence the training features.
numerical = ['Age', 'Number of sexual partners', 'First sexual intercourse', 'Num of pregnancies', 'Smokes (years)', 
            'Smokes (packs/year)', 'Hormonal Contraceptives (years)', 'IUD (years)', 'STDs: Time since first diagnosis',
            'STDs: Time since last diagnosis']

scaler = MinMaxScaler()
dane_scaled = scaler.fit_transform(dane[numerical])
# fit_transform returns a plain array, so the frame built from it must reuse
# dane's index explicitly. With a default RangeIndex the assignment below
# aligns on labels and would silently produce NaN whenever dane's index is
# not 0..n-1 (for example after rows have been filtered out).
d2 = pd.DataFrame(dane_scaled, columns=numerical, index=dane.index)
dane[numerical] = d2[numerical]

# Build a single target out of the four individual result columns
targets = ['Hinselmann', 'Schiller', 'Citology', 'Biopsy']


def has_cancer(row):
    """Return 1 if any column listed in ``targets`` equals 1 in ``row``, else 0.

    ``row`` is anything indexable by the target names (for example a
    DataFrame row). Targets are checked in list order and evaluation stops at
    the first one that equals 1.
    """
    return 1 if any(row[name] == 1 for name in targets) else 0

# Compute the combined target row by row, then remove the four source columns.
dane['cancer'] = dane.apply(has_cancer, axis=1)
dane = dane.drop(columns=targets)

In [128]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# Split the data set into a training part and a test part
def default_split(X, y, test_size=0.2, random_state=2137, stratify=None):
    """Split ``X`` and ``y`` into train and test subsets.

    Thin wrapper around ``train_test_split``. With the default arguments it
    behaves exactly as before: a 20 % test set and a fixed seed of 2137.

    Parameters
    ----------
    X, y : features and labels, passed straight to ``train_test_split``.
    test_size : forwarded as ``test_size`` (default 0.2).
    random_state : forwarded as ``random_state`` (default 2137).
    stratify : forwarded as ``stratify``; pass ``y`` to keep the class
        proportions equal in both subsets. Default ``None`` (no
        stratification).

    Returns
    -------
    The result of ``train_test_split``: ``X_train, X_test, y_train, y_test``.
    """
    return train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )

# Evaluation metrics
def scoring(y_test, y_predicted):
    """Print accuracy, precision, recall, F1 and ROC AUC for a set of predictions.

    Parameters
    ----------
    y_test : true labels.
    y_predicted : predicted labels, as returned by ``model.predict``.

    Returns
    -------
    None; the metrics are written to standard output.

    Notes
    -----
    The last value is ``roc_auc_score``; it was previously printed under the
    label "FPR", which it is not. Because hard class labels rather than
    probabilities are passed in, for a binary problem this value equals the
    mean of the true-positive and true-negative rates.
    """
    print("ACC = ", accuracy_score(y_test, y_predicted))
    print("PREC = ", precision_score(y_test, y_predicted))
    print("RECALL = ", recall_score(y_test, y_predicted))
    print("F1 = ", f1_score(y_test, y_predicted))
    print("ROC AUC = ", roc_auc_score(y_test, y_predicted))

# Separate the target column from the features
def extract_y(data):
    """Split ``data`` into features and the ``"cancer"`` target.

    Returns a tuple ``(features, target)``: ``features`` is ``data`` without
    the ``"cancer"`` column and ``target`` is a one-column DataFrame holding
    only ``"cancer"``. ``data`` itself is not modified.
    """
    target = data[["cancer"]]
    features = data.drop(columns=["cancer"])
    return features, target

In [147]:
# Data preparation: separate features from the target, fill the remaining
# nulls and create the train/test split used by every model cell below.
X, y = extract_y(dane)

# Any null still present in the features is replaced with 0.
X = X.fillna(0)

# y is a one-column DataFrame, so y_train / y_test are DataFrames as well;
# the model cells flatten them with .values.ravel() before fitting.
X_train, X_test, y_train, y_test = default_split(X, y)

In [130]:
from xgboost import XGBClassifier
# Baseline: XGBoost classifier with only the seed and the missing-value
# marker set explicitly.
model = XGBClassifier(random_state=2137, missing=np.nan)
# y_train is a one-column DataFrame; ravel() turns it into a 1-D label array.
model.fit(X_train, y_train.values.ravel())
y_predicted = model.predict(X_test)
# Metrics are computed on the held-out test set.
scoring(y_test, y_predicted)

ACC =  0.8323353293413174
PREC =  0.2727272727272727
RECALL =  0.13043478260869565
F1 =  0.1764705882352941
FPR =  0.53743961352657


In [132]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search space: 9 depths (5..45) x 7 learning rates (0.1..1.9)
# x 3 boosters = 189 candidates.
# NOTE(review): learning rates above 1 are included, and 'gblinear' presumably
# does not use max_depth - confirm that both are intended.
params = {
    'max_depth': np.arange(5, 50, 5),
    'learning_rate': np.arange(0.1, 2, 0.3),
    'booster': ['gbtree', 'gblinear', 'dart'],
}


# 5-fold cross-validated grid search on the training set, ranked by ROC AUC.
# error_score='raise' makes a failing fit raise instead of being scored.
# `model` is whatever classifier an earlier cell left in the kernel.
grid = GridSearchCV(model, params, error_score='raise', cv=5, scoring='roc_auc')
grid.fit(X_train, y_train.values.ravel())

print(f"Score: {grid.best_score_}")
# Last expression of the cell: displayed as the cell output.
grid.best_params_

Score: 0.628956440931696




{'booster': 'gbtree', 'learning_rate': 0.4, 'max_depth': 20}

In [133]:
# Refit with the hyper-parameters reported by the grid search above
# (booster='gbtree', learning_rate=0.4, max_depth=20). The values are copied
# by hand from that cell's output, so they must be updated if the search is
# re-run and returns something else.
model = XGBClassifier(random_state=2137, booster='gbtree', learning_rate=0.4, max_depth=20)
model.fit(X_train, y_train.values.ravel())
y_predicted = model.predict(X_test)
# Metrics are computed on the held-out test set.
scoring(y_test, y_predicted)

ACC =  0.8203592814371258
PREC =  0.1111111111111111
RECALL =  0.043478260869565216
F1 =  0.0625
FPR =  0.4939613526570048


In [134]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the randomized search: tree depth 5..45 in steps of 5 and
# learning rate 0.01..0.19 in steps of 0.01.
params = {
    'max_depth': np.arange(5, 50, 5),
    'learning_rate': np.arange(0.01, 0.2, 0.01),
}

model = XGBClassifier(random_state=2137, booster='gbtree')
# random_state fixes which candidates are sampled. Without it every run draws
# a different subset of the space, so best_params_ / best_score_ are not
# reproducible even though the classifier and the data split are seeded.
grid = RandomizedSearchCV(
    model,
    params,
    error_score='raise',
    cv=5,
    scoring='roc_auc',
    random_state=2137,
)
grid.fit(X_train, y_train.values.ravel())

print(f"Score: {grid.best_score_}")
# Last expression of the cell: displayed as the cell output.
grid.best_params_

Score: 0.6179271224500152


{'max_depth': 25, 'learning_rate': 0.05}

In [148]:
# NOTE(review): learning_rate=0.5, max_depth=7, gamma=0.001 do not match any
# of the search results recorded in this notebook - presumably hand-tuned;
# confirm.
model = XGBClassifier(random_state=2137, booster='gbtree', learning_rate=0.5, max_depth=7, gamma=0.001)
# Alternative kept for reference: the parameters reported by the randomized
# search over max_depth / learning_rate.
#model = XGBClassifier(random_state=2137, booster='gbtree', learning_rate=0.05, max_depth=25)
model.fit(X_train, y_train.values.ravel())
# Score on the training data first, then on the test data, so that the two
# metric blocks can be compared to judge over-fitting.
y_predicted = model.predict(X_train)
scoring(y_train, y_predicted)
# Blank line separating the train metrics from the test metrics.
print("")
y_predicted = model.predict(X_test)
scoring(y_test, y_predicted)

ACC =  0.9940119760479041
PREC =  1.0
RECALL =  0.9487179487179487
F1 =  0.9736842105263158
FPR =  0.9743589743589743

ACC =  0.8383233532934131
PREC =  0.375
RECALL =  0.2608695652173913
F1 =  0.30769230769230765
FPR =  0.5957125603864735


In [122]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the randomized search: gamma 0.01..0.49 in steps of 0.01.
params = {
    'gamma': np.arange(0.01, 0.5, 0.01),
}

model = XGBClassifier(random_state=2137, booster='gbtree', learning_rate=0.5, max_depth=7)
# random_state fixes which gamma values are sampled. Without it every run
# draws a different subset, so best_params_ / best_score_ are not
# reproducible even though the classifier and the data split are seeded.
grid = RandomizedSearchCV(
    model,
    params,
    error_score='raise',
    cv=5,
    scoring='roc_auc',
    random_state=2137,
)
grid.fit(X_train, y_train.values.ravel())

print(f"Score: {grid.best_score_}")
# Last expression of the cell: displayed as the cell output.
grid.best_params_

Score: 0.6225687924997464




{'gamma': 0.08}