# Wstępna obróbka

In [43]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
# Load the raw data set and drop the columns that are not used further on.
dane = pd.read_csv('cervical-cancer_csv.csv').drop(
    columns=[
        'STDs:cervical condylomatosis',
        'STDs:vaginal condylomatosis',
        'STDs:pelvic inflammatory disease',
        'STDs:genital herpes',
        'STDs:molluscum contagiosum',
        'STDs:AIDS',
        'STDs:Hepatitis B',
        'STDs:HPV',
        'Dx:CIN',
    ]
)

# Filling in missing values and encoding categorical variables
def column_nodata(df, column_name):
    """Flag missing values of a column, then fill them with 0 (in place).

    Adds a new column named ``column_name + "_null"`` holding 1 where the
    original value was missing and 0 otherwise, and replaces the missing
    values in ``column_name`` itself with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is mutated, nothing is returned.
    column_name : str
        Name of an existing column in ``df``.
    """
    # Vectorised null check; the explicit int64 cast keeps the indicator an
    # integer column even when the frame has no rows.
    df[column_name + "_null"] = df[column_name].isnull().astype("int64")
    df[column_name] = df[column_name].fillna(0)

def replace_in_column(df, column_name, src, dst):
    """Replace every occurrence of ``src`` in one column with ``dst`` (in place).

    ``src`` and ``dst`` are handed to ``Series.replace`` as ``to_replace``
    and ``value``. The frame is mutated; nothing is returned.
    """
    updated = df[column_name].replace(src, dst)
    df[column_name] = updated

# Collapse the listed values of the two count columns into a single value.
for column, merged_values, bucket in [
    ('STDs (number)', [3, 4], 2),
    ('STDs: Number of diagnosis', [2, 3], 1),
]:
    replace_in_column(dane, column, merged_values, bucket)

# Columns that receive a "<name>_null" indicator and have their gaps filled with 0.
nodata_categories = [
    'Smokes',
    'Hormonal Contraceptives',
    'IUD',
    'STDs',
    'STDs (number)',
    'STDs:condylomatosis',
    'STDs:vulvo-perineal condylomatosis',
    'STDs:syphilis',
    'STDs:HIV',
]

for category in nodata_categories:
    column_nodata(dane, category)

# One-hot encode 'STDs (number)' and swap the original column for its dummies.
std_count_dummies = pd.get_dummies(dane['STDs (number)'], prefix='STDs_')
dane = pd.concat([dane, std_count_dummies], axis=1).drop(columns=['STDs (number)'])

# Scaling: min-max scaling of the numeric columns (MinMaxScaler with default
# settings), not z-score standardisation.
# NOTE(review): the scaler is fitted on the whole data set before the
# train/test split is made — confirm that this is acceptable.
numerical = ['Age', 'Number of sexual partners', 'First sexual intercourse', 'Num of pregnancies', 'Smokes (years)', 
            'Smokes (packs/year)', 'Hormonal Contraceptives (years)', 'IUD (years)', 'STDs: Time since first diagnosis',
            'STDs: Time since last diagnosis']

scaler = MinMaxScaler() 
dane_scaled = scaler.fit_transform(dane[numerical])
# fit_transform returns a bare array, so reuse dane's own index here: the
# assignment below aligns on index labels and would otherwise write NaN for
# any row whose label is not in 0..n-1.
d2 = pd.DataFrame(dane_scaled, columns=numerical, index=dane.index)
dane[numerical] = d2[numerical]

# Building a single target out of the four target columns
targets = ['Hinselmann', 'Schiller', 'Citology', 'Biopsy']
def has_cancer(row):
    """Return 1 if any column listed in ``targets`` equals 1 in ``row``, else 0."""
    return int(any(row[name] == 1 for name in targets))

# Label every row with has_cancer, then discard the columns it was derived from.
dane['cancer'] = dane.apply(has_cancer, axis=1)
dane = dane.drop(columns=targets)


# Ujednolicone funkcje dla wszystkich modeli

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# Splitting the data set into training and test parts
def default_split(X, y):
    """Split ``X`` and ``y`` with the notebook-wide settings.

    Delegates to ``train_test_split`` with a 20% test share and a fixed
    ``random_state`` and returns its result unchanged.
    """
    split_options = {"test_size": 0.2, "random_state": 2137}
    return train_test_split(X, y, **split_options)

# Scoring
def scoring(y_test, y_predicted):
    """Print accuracy, precision, recall, F1 and ROC AUC for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_predicted : array-like
        Predicted labels, as passed on to the sklearn metric functions.

    Returns
    -------
    None
        The metrics are only printed.
    """
    print("ACC = ", accuracy_score(y_test, y_predicted))
    print("PREC = ", precision_score(y_test, y_predicted))
    print("RECALL = ", recall_score(y_test, y_predicted))
    print("F1 = ", f1_score(y_test, y_predicted))
    # roc_auc_score is the area under the ROC curve, not a false-positive
    # rate, so it is labelled accordingly (0.5 is chance level, 1.0 is best).
    print("ROC AUC = ", roc_auc_score(y_test, y_predicted))

# Separating out y
def extract_y(data):
    """Split a frame into features and target.

    Returns a pair ``(X, y)``: ``X`` is ``data`` without the "cancer" column
    and ``y`` is a single-column DataFrame holding "cancer". ``data`` itself
    is not modified.
    """
    target_columns = ["cancer"]
    features = data.drop(columns=target_columns)
    return features, data[target_columns]

# Regresja

In [45]:
# Data preparation: separate the features from the target.
X, y = extract_y(dane)
# Logistic regression does not accept NaN in its input, so fill the remaining gaps with 0.
X = X.fillna(0)

X_train, X_test, y_train, y_test = default_split(X, y)
# Shapes of the full, training and test feature frames.
print(*(part.shape for part in (X, X_train, X_test)))

(835, 34) (668, 34) (167, 34)


In [46]:
# Logistic regression baseline
from sklearn.linear_model import LogisticRegression

# y_train is a single-column frame; ravel() flattens it to the 1-D array passed to fit().
lr = LogisticRegression(solver='newton-cg', max_iter=1000).fit(X_train, y_train.values.ravel())

y_predicted = lr.predict(X_test)
scoring(y_test, y_predicted)

ACC =  0.8622754491017964
PREC =  0.5
RECALL =  0.043478260869565216
F1 =  0.08
FPR =  0.5182669082125604


**Accuracy** nie wygląda źle, ale pozostałe miary wypadają znacznie gorzej.
Dzięki **precision** wiemy, że tylko połowa osób zaklasyfikowanych jako chore jest faktycznie chora.
**Recall** mówi nam, że zdiagnozowaliśmy BARDZO MAŁO spośród chorych (ok. 4%). Na **F1** nawet nie ma co patrzeć.
Wartość wypisana jako **FPR** to w rzeczywistości ROC AUC (wynik `roc_auc_score`) i jest ona niewiele powyżej minimum, czyli poziomu losowego 0.5.

Wnioski:
- prawdopodobnie trzeba użyć innego klasyfikatora
- wyniki są na tyle słabe, że sama obróbka danych raczej niewiele zmieni w przypadku tego klasyfikatora, ale może warto próbować

In [47]:
from sklearn.model_selection import GridSearchCV

# Search grid for the current estimator; penalty, dual and intercept_scaling
# are not searched here.
params = dict(
    C=np.arange(0.5, 5, 0.5),
    fit_intercept=[True, False],
    multi_class=['auto', 'ovr', 'multinomial'],
)

# Candidates are ranked by ROC AUC.
grid = GridSearchCV(lr, params, scoring='roc_auc', error_score='raise')
grid.fit(X_train, y_train.values.ravel())

print(f"Score: {grid.best_score_}")
grid.best_params_



Score: 0.6398115855205573


{'C': 0.5, 'fit_intercept': True, 'multi_class': 'auto'}

In [51]:
# Refit with the best newton-cg parameters reported by the grid search and score on the test split.
lr = LogisticRegression(
    C=0.5,
    fit_intercept=True,
    multi_class='auto',
    solver='newton-cg',
    max_iter=1000,
).fit(X_train, y_train.values.ravel())

y_predicted = lr.predict(X_test)
scoring(y_test, y_predicted)

ACC =  0.8622754491017964
PREC =  0.5
RECALL =  0.043478260869565216
F1 =  0.08
FPR =  0.5182669082125604


In [52]:
# Grid search with the liblinear solver, this time over penalty and
# intercept_scaling; dual and fit_intercept are not searched.
lr = LogisticRegression(solver='liblinear', max_iter=1000)
params = dict(
    penalty=['l1', 'l2'],
    C=np.arange(0.5, 5, 0.5),
    intercept_scaling=np.arange(0.5, 3, 0.5),
    multi_class=['auto', 'ovr'],
)

# Candidates are ranked by ROC AUC.
grid = GridSearchCV(lr, params, scoring='roc_auc', error_score='raise')
grid.fit(X_train, y_train.values.ravel())

print(f"Score: {grid.best_score_}")
grid.best_params_



Score: 0.6323821614037504


{'C': 0.5, 'intercept_scaling': 2.5, 'multi_class': 'auto', 'penalty': 'l2'}

In [53]:
# Refit with the best liblinear parameters reported by the grid search and score on the test split.
lr = LogisticRegression(
    C=0.5,
    penalty='l2',
    intercept_scaling=2.5,
    multi_class='auto',
    solver='liblinear',
    max_iter=1000,
).fit(X_train, y_train.values.ravel())

y_predicted = lr.predict(X_test)
scoring(y_test, y_predicted)

ACC =  0.8622754491017964
PREC =  0.5
RECALL =  0.043478260869565216
F1 =  0.08
FPR =  0.5182669082125604


In [54]:
# A different split of the data set: split each class separately with
# default_split and then stack the parts, so both classes appear in the
# training and the test set.
dane = dane.fillna(0)

is_positive = dane['cancer'] == 1
is_negative = dane['cancer'] == 0
cancer = dane.loc[is_positive]
no_cancer = dane.loc[is_negative]

cancer_X, cancer_y = extract_y(cancer)
nocancer_X, nocancer_y = extract_y(no_cancer)

cancer_parts = default_split(cancer_X, cancer_y)
nocancer_parts = default_split(nocancer_X, nocancer_y)
cancer_X_train, cancer_X_test, cancer_y_train, cancer_y_test = cancer_parts
nocancer_X_train, nocancer_X_test, nocancer_y_train, nocancer_y_test = nocancer_parts

# Stack matching parts, positives first, in the order X_train, X_test, y_train, y_test.
X_train, X_test, y_train, y_test = (
    pd.concat([positive_part, negative_part])
    for positive_part, negative_part in zip(cancer_parts, nocancer_parts)
)

In [55]:
# Run the liblinear grid search on the current X_train / y_train.
lr = LogisticRegression(solver='liblinear', max_iter=1000)
params = dict(
    penalty=['l1', 'l2'],
    C=np.arange(0.5, 5, 0.5),
    intercept_scaling=np.arange(0.5, 3, 0.5),
    multi_class=['auto', 'ovr'],
)

# Candidates are ranked by ROC AUC.
grid = GridSearchCV(lr, params, scoring='roc_auc', error_score='raise')
grid.fit(X_train, y_train.values.ravel())

print(f"Score: {grid.best_score_}")
grid.best_params_



Score: 0.6617623640647844


{'C': 1.0, 'intercept_scaling': 1.0, 'multi_class': 'auto', 'penalty': 'l1'}

In [57]:
# Refit with the best parameters reported by the grid search and score on the current test split.
lr = LogisticRegression(
    C=1.0,
    penalty='l1',
    intercept_scaling=1.0,
    multi_class='auto',
    solver='liblinear',
    max_iter=1000,
).fit(X_train, y_train.values.ravel())

y_predicted = lr.predict(X_test)
scoring(y_test, y_predicted)

ACC =  0.875
PREC =  0.0
RECALL =  0.0
F1 =  0.0
FPR =  0.5


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
