In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
dane = pd.read_csv('cervical-cancer_csv.csv')

# usuwanie kolumn

dane = dane.drop(['STDs:cervical condylomatosis', 
         'STDs:vaginal condylomatosis', 
         'STDs:pelvic inflammatory disease', 
         'STDs:genital herpes',
         'STDs:molluscum contagiosum',
         'STDs:AIDS',
         'STDs:Hepatitis B',
         'STDs:HPV', 'Dx:CIN'], axis=1)

# uzupełnianie braków i kodowanie zmiennych kategorycznych
def column_nodata(df, column_name):
    df[column_name + "_null"] = df[column_name].apply(lambda x: 1 if pd.isnull(x) else 0)
    df[column_name] = df[column_name].fillna(0)

def replace_in_column(df, column_name, src, dst):
    df[column_name] = df[column_name].replace(to_replace=src, value=dst)

replace_in_column(dane, 'STDs (number)', [3, 4], 2)
replace_in_column(dane, 'STDs: Number of diagnosis', [2,3], 1)

nodata_categories = [
    'Smokes',
    'Hormonal Contraceptives',
    'IUD',
    'STDs',
    'STDs (number)',
    'STDs:condylomatosis',
    'STDs:vulvo-perineal condylomatosis',
    'STDs:syphilis',
    'STDs:HIV'
]

for category in nodata_categories:
    column_nodata(dane, category)
    
dane = pd.concat([dane, pd.get_dummies(dane['STDs (number)'], prefix='STDs_')],axis=1)
dane.drop(['STDs (number)'],axis=1, inplace=True)

# standaryzacja
numerical = ['Age', 'Number of sexual partners', 'First sexual intercourse', 'Num of pregnancies', 'Smokes (years)', 
            'Smokes (packs/year)', 'Hormonal Contraceptives (years)', 'IUD (years)', 'STDs: Time since first diagnosis',
            'STDs: Time since last diagnosis']

scaler = MinMaxScaler() 
dane_scaled = scaler.fit_transform(dane[numerical])
d2 = pd.DataFrame(dane_scaled, columns = numerical)
dane[numerical] = d2[numerical]

# stworzenie jednego targetu
targets = ['Hinselmann', 'Schiller', 'Citology', 'Biopsy']
def has_cancer(row):
    for target in targets:
        if row[target] == 1:
            return 1
    return 0

dane['cancer'] = dane.apply(lambda row: has_cancer(row), axis=1)
dane = dane.drop(targets, axis=1)


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# podzial zbioru na treningowy i testowy
def default_split(X, y):
    return train_test_split(X, y, test_size=0.2, random_state=2137)

# scoring
def scoring(y_test, y_predicted):
    print("ACC = ", accuracy_score(y_test, y_predicted))
    print("PREC = ", precision_score(y_test, y_predicted))
    print("RECALL = ", recall_score(y_test, y_predicted))
    print("F1 = ", f1_score(y_test, y_predicted))
    print("FPR = ", roc_auc_score(y_test, y_predicted))

# wyodrebnienie y
def extract_y(data):
    y = data[["cancer"]]
    return data.drop(["cancer"], axis=1), y

In [6]:
# przygotowanie danych
X, y = extract_y(dane)
# adaboost nie pozwala na NaN w danych
X = X.fillna(0)

X_train, X_test, y_train, y_test = default_split(X, y)

In [9]:
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier(random_state=2137)
model.fit(X_train, y_train)
y_predicted = model.predict(X_test)
scoring(y_test, y_predicted)

ACC =  0.8562874251497006
PREC =  0.4
RECALL =  0.08695652173913043
F1 =  0.14285714285714285
FPR =  0.5330615942028984


  y = column_or_1d(y, warn=True)


In [14]:
from sklearn.model_selection import RandomizedSearchCV

params = {
    'n_estimators': np.arange(10, 210, 10),
    'learning_rate': np.arange(0.5, 3, 0.2),
    'algorithm': ['SAMME', 'SAMME.R']
}


grid = RandomizedSearchCV(model, params, error_score='raise')
grid.fit(X_train, y_train.values.ravel())

print(f"Score: {grid.best_score_}")
grid.best_params_

  (estimator_weight < 0)))
  sample_weight /= sample_weight_sum
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_

  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  return self.classes_.take(pred > 0, axis=0)
  (estimator_weight < 0)))
  sample_weight /= sample_weight_sum
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weig

  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |


  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |


Score: 0.8832335329341318


  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |


{'n_estimators': 190,
 'learning_rate': 2.4999999999999996,
 'algorithm': 'SAMME'}

In [18]:
model = AdaBoostClassifier(random_state=2137, n_estimators=190, learning_rate=2.5, algorithm='SAMME')
model.fit(X_train, y_train.values.ravel())
y_predicted = model.predict(X_test)
scoring(y_test, y_predicted)

  (estimator_weight < 0)))
  sample_weight /= sample_weight_sum
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_

ACC =  0.8622754491017964
PREC =  0.0
RECALL =  0.0
F1 =  0.0
FPR =  0.5


  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  ((sample_weight > 0) |
  return self.classes_.take(pred > 0, axis=0)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
