# Wstępna obróbka

In [52]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
dane = pd.read_csv('cervical-cancer_csv.csv')

# usuwanie kolumn

dane = dane.drop(['STDs:cervical condylomatosis', 
         'STDs:vaginal condylomatosis', 
         'STDs:pelvic inflammatory disease', 
         'STDs:genital herpes',
         'STDs:molluscum contagiosum',
         'STDs:AIDS',
         'STDs:Hepatitis B',
         'STDs:HPV', 'Dx:CIN'], axis=1)

# uzupełnianie braków i kodowanie zmiennych kategorycznych
def column_nodata(df, column_name):
    df[column_name + "_null"] = df[column_name].apply(lambda x: 1 if pd.isnull(x) else 0)
    df[column_name] = df[column_name].fillna(0)

def replace_in_column(df, column_name, src, dst):
    df[column_name] = df[column_name].replace(to_replace=src, value=dst)

replace_in_column(dane, 'STDs (number)', [3, 4], 2)
replace_in_column(dane, 'STDs: Number of diagnosis', [2,3], 1)

nodata_categories = [
    'Smokes',
    'Hormonal Contraceptives',
    'IUD',
    'STDs',
    'STDs (number)',
    'STDs:condylomatosis',
    'STDs:vulvo-perineal condylomatosis',
    'STDs:syphilis',
    'STDs:HIV'
]

for category in nodata_categories:
    column_nodata(dane, category)
    
dane = pd.concat([dane, pd.get_dummies(dane['STDs (number)'], prefix='STDs_')],axis=1)
dane.drop(['STDs (number)'],axis=1, inplace=True)

# standaryzacja
numerical = ['Age', 'Number of sexual partners', 'First sexual intercourse', 'Num of pregnancies', 'Smokes (years)', 
            'Smokes (packs/year)', 'Hormonal Contraceptives (years)', 'IUD (years)', 'STDs: Time since first diagnosis',
            'STDs: Time since last diagnosis']

scaler = MinMaxScaler() 
dane_scaled = scaler.fit_transform(dane[numerical])
d2 = pd.DataFrame(dane_scaled, columns = numerical)
dane[numerical] = d2[numerical]

# stworzenie jednego targetu
targets = ['Hinselmann', 'Schiller', 'Citology', 'Biopsy']
def has_cancer(row):
    for target in targets:
        if row[target] == 1:
            return 1
    return 0

dane['cancer'] = dane.apply(lambda row: has_cancer(row), axis=1)
dane = dane.drop(targets, axis=1)


# Ujednolicone funkcje dla wszystkich modeli

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# podzial zbioru na treningowy i testowy
def default_split(X, y):
    return train_test_split(X, y, test_size=0.2, random_state=2137)

# scoring
def scoring(y_test, y_predicted):
    print("ACC = ", accuracy_score(y_test, y_predicted))
    print("PREC = ", precision_score(y_test, y_predicted))
    print("RECALL = ", recall_score(y_test, y_predicted))
    print("F1 = ", f1_score(y_test, y_predicted))
    print("FPR = ", roc_auc_score(y_test, y_predicted))

# wyodrebnienie y
def extract_y(data):
    y = data[["cancer"]]
    return data.drop(["cancer"], axis=1), y

# Regresja

In [53]:
# przygotowanie danych
X, y = extract_y(dane)
# regresja nie pozwala na NaN w danych
X = X.fillna(0)

X_train, X_test, y_train, y_test = default_split(X, y)
print(X.shape, X_train.shape, X_test.shape)

(835, 34) (668, 34) (167, 34)


In [9]:
# regresja
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=1000, solver='lbfgs')

lr.fit(X_train, y_train.values.ravel())
y_predicted = lr.predict(X_test)
scoring(y_test, y_predicted)

ACC =  0.8622754491017964
PREC =  0.5
RECALL =  0.043478260869565216
F1 =  0.08
FPR =  0.5182669082125604


**Accuracy** nie daje złych wyników, ale z pozostałymi miarami jest gorzej.
Dzięki **precision** wiemy, że tylko połowa z przewidzianych wyników, jest faktycznie chora.
**Recall** mówi nam, że BARDZO MAŁO spośród chorych zdiagnozowaliśmy. Na **F1** nawet nie ma co patrzyć.
**FPR** zaś jest niewiele powyżej minimum.

Wnioski:
- prawdopodobnie trzeba użyć innego klasyfikatora
- nie wygląda na to, by obróbka danych w przypadku tego klasyfikatora mogła wiele zmienić, bo jest źle, ale może warto próbować

In [23]:
from sklearn.model_selection import cross_val_score

def test_for_params(solver, penalty, dual, fit_intercept, multi_class):
    lr = LogisticRegression(max_iter=1000, solver=solver, penalty=penalty, dual=dual,
                            fit_intercept=fit_intercept, multi_class=multi_class)
    #lr.fit(X_train, y_train.values.ravel())
    auc = np.median(cross_val_score(lr, X_train, y_train.values.ravel(), scoring='roc_auc', cv = 10))
    return auc

In [54]:
params = {
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'dual': [True, False],
    'fit_intercept': [True, False],
    'multi_class': ['auto', 'ovr', 'multinomial']
}
row_list = []
#scores = pd.DataFrame(columns = ['solver', 'penalty', 'dual', 'fit_intercept', 'multi_class', 'auc'])
for solver in params['solver']:
    for penalty in params['penalty']:
        for dual in params['dual']:
            for fit_intercept in params['fit_intercept']:
                for multi_class in params['multi_class']:
                    try:
                        auc = test_for_params(solver, penalty, dual, fit_intercept, multi_class)
                        row_list.append({'solver': solver, 'penalty': penalty, 'dual': dual, 'fit_intercept': fit_intercept,
                                        'multi_class': multi_class, 'auc': auc})
                    except:
                        print("Invalid params", solver, penalty, dual, fit_intercept, multi_class)

scores = pd.DataFrame(row_list)
scores



Invalid params newton-cg l1 True True auto
Invalid params newton-cg l1 True True ovr
Invalid params newton-cg l1 True True multinomial
Invalid params newton-cg l1 True False auto
Invalid params newton-cg l1 True False ovr
Invalid params newton-cg l1 True False multinomial
Invalid params newton-cg l1 False True auto
Invalid params newton-cg l1 False True ovr
Invalid params newton-cg l1 False True multinomial
Invalid params newton-cg l1 False False auto
Invalid params newton-cg l1 False False ovr
Invalid params newton-cg l1 False False multinomial
Invalid params newton-cg l2 True True auto
Invalid params newton-cg l2 True True ovr
Invalid params newton-cg l2 True True multinomial
Invalid params newton-cg l2 True False auto
Invalid params newton-cg l2 True False ovr
Invalid params newton-cg l2 True False multinomial




Invalid params newton-cg elasticnet True True auto
Invalid params newton-cg elasticnet True True ovr
Invalid params newton-cg elasticnet True True multinomial
Invalid params newton-cg elasticnet True False auto
Invalid params newton-cg elasticnet True False ovr
Invalid params newton-cg elasticnet True False multinomial
Invalid params newton-cg elasticnet False True auto
Invalid params newton-cg elasticnet False True ovr
Invalid params newton-cg elasticnet False True multinomial
Invalid params newton-cg elasticnet False False auto
Invalid params newton-cg elasticnet False False ovr
Invalid params newton-cg elasticnet False False multinomial
Invalid params newton-cg none True True auto
Invalid params newton-cg none True True ovr
Invalid params newton-cg none True True multinomial
Invalid params newton-cg none True False auto
Invalid params newton-cg none True False ovr
Invalid params newton-cg none True False multinomial




Invalid params lbfgs l1 True True auto
Invalid params lbfgs l1 True True ovr
Invalid params lbfgs l1 True True multinomial
Invalid params lbfgs l1 True False auto
Invalid params lbfgs l1 True False ovr
Invalid params lbfgs l1 True False multinomial
Invalid params lbfgs l1 False True auto
Invalid params lbfgs l1 False True ovr
Invalid params lbfgs l1 False True multinomial
Invalid params lbfgs l1 False False auto
Invalid params lbfgs l1 False False ovr
Invalid params lbfgs l1 False False multinomial
Invalid params lbfgs l2 True True auto
Invalid params lbfgs l2 True True ovr
Invalid params lbfgs l2 True True multinomial
Invalid params lbfgs l2 True False auto
Invalid params lbfgs l2 True False ovr
Invalid params lbfgs l2 True False multinomial




Invalid params lbfgs elasticnet True True auto
Invalid params lbfgs elasticnet True True ovr
Invalid params lbfgs elasticnet True True multinomial
Invalid params lbfgs elasticnet True False auto
Invalid params lbfgs elasticnet True False ovr
Invalid params lbfgs elasticnet True False multinomial
Invalid params lbfgs elasticnet False True auto
Invalid params lbfgs elasticnet False True ovr
Invalid params lbfgs elasticnet False True multinomial
Invalid params lbfgs elasticnet False False auto
Invalid params lbfgs elasticnet False False ovr
Invalid params lbfgs elasticnet False False multinomial
Invalid params lbfgs none True True auto
Invalid params lbfgs none True True ovr
Invalid params lbfgs none True True multinomial
Invalid params lbfgs none True False auto
Invalid params lbfgs none True False ovr
Invalid params lbfgs none True False multinomial




Invalid params liblinear l1 True True auto
Invalid params liblinear l1 True True ovr
Invalid params liblinear l1 True True multinomial
Invalid params liblinear l1 True False auto
Invalid params liblinear l1 True False ovr
Invalid params liblinear l1 True False multinomial
Invalid params liblinear l1 False True multinomial




Invalid params liblinear l1 False False multinomial
Invalid params liblinear l2 True True multinomial
Invalid params liblinear l2 True False multinomial




Invalid params liblinear l2 False True multinomial
Invalid params liblinear l2 False False multinomial
Invalid params liblinear elasticnet True True auto
Invalid params liblinear elasticnet True True ovr
Invalid params liblinear elasticnet True True multinomial
Invalid params liblinear elasticnet True False auto
Invalid params liblinear elasticnet True False ovr
Invalid params liblinear elasticnet True False multinomial
Invalid params liblinear elasticnet False True auto
Invalid params liblinear elasticnet False True ovr
Invalid params liblinear elasticnet False True multinomial
Invalid params liblinear elasticnet False False auto
Invalid params liblinear elasticnet False False ovr
Invalid params liblinear elasticnet False False multinomial
Invalid params liblinear none True True auto
Invalid params liblinear none True True ovr
Invalid params liblinear none True True multinomial
Invalid params liblinear none True False auto
Invalid params liblinear none True False ovr
Invalid params li



Invalid params sag elasticnet True True auto
Invalid params sag elasticnet True True ovr
Invalid params sag elasticnet True True multinomial
Invalid params sag elasticnet True False auto
Invalid params sag elasticnet True False ovr
Invalid params sag elasticnet True False multinomial
Invalid params sag elasticnet False True auto
Invalid params sag elasticnet False True ovr
Invalid params sag elasticnet False True multinomial
Invalid params sag elasticnet False False auto
Invalid params sag elasticnet False False ovr
Invalid params sag elasticnet False False multinomial
Invalid params sag none True True auto
Invalid params sag none True True ovr
Invalid params sag none True True multinomial
Invalid params sag none True False auto
Invalid params sag none True False ovr
Invalid params sag none True False multinomial






Invalid params saga l1 True True auto
Invalid params saga l1 True True ovr
Invalid params saga l1 True True multinomial
Invalid params saga l1 True False auto
Invalid params saga l1 True False ovr
Invalid params saga l1 True False multinomial




Invalid params saga l2 True True auto
Invalid params saga l2 True True ovr
Invalid params saga l2 True True multinomial
Invalid params saga l2 True False auto
Invalid params saga l2 True False ovr
Invalid params saga l2 True False multinomial




Invalid params saga elasticnet True True auto
Invalid params saga elasticnet True True ovr
Invalid params saga elasticnet True True multinomial
Invalid params saga elasticnet True False auto
Invalid params saga elasticnet True False ovr
Invalid params saga elasticnet True False multinomial
Invalid params saga elasticnet False True auto
Invalid params saga elasticnet False True ovr
Invalid params saga elasticnet False True multinomial
Invalid params saga elasticnet False False auto
Invalid params saga elasticnet False False ovr
Invalid params saga elasticnet False False multinomial
Invalid params saga none True True auto
Invalid params saga none True True ovr
Invalid params saga none True True multinomial
Invalid params saga none True False auto
Invalid params saga none True False ovr
Invalid params saga none True False multinomial






Unnamed: 0,solver,penalty,dual,fit_intercept,multi_class,auc
0,newton-cg,l2,False,True,auto,0.651483
1,newton-cg,l2,False,True,ovr,0.651483
2,newton-cg,l2,False,True,multinomial,0.650424
3,newton-cg,l2,False,False,auto,0.673880
4,newton-cg,l2,False,False,ovr,0.673880
...,...,...,...,...,...,...
61,saga,none,False,True,ovr,0.601695
62,saga,none,False,True,multinomial,0.601695
63,saga,none,False,False,auto,0.601695
64,saga,none,False,False,ovr,0.601695


In [55]:
scores.sort_values(by="auc", ascending=False)

Unnamed: 0,solver,penalty,dual,fit_intercept,multi_class,auc
25,liblinear,l1,False,True,ovr,0.679555
24,liblinear,l1,False,True,auto,0.679555
17,lbfgs,l2,False,False,multinomial,0.678117
5,newton-cg,l2,False,False,multinomial,0.678117
59,saga,l2,False,False,multinomial,0.678117
...,...,...,...,...,...,...
45,sag,none,False,False,auto,0.601695
44,sag,none,False,True,multinomial,0.601695
43,sag,none,False,True,ovr,0.601695
42,sag,none,False,True,auto,0.601695


In [59]:
# solver='liblinear', penalty='l1', dual=False, fit_intercept=True, multi_class='ovr'
# C=1,5 1 2 5
# intercept_scaling 0,5 1 2 5
# verbose 0 1
linear_params = {
    'C': [1.5, 1, 2, 5],
    'intercept_scaling': [0.5, 1, 2, 5],
    'verbose': [0, 1]
}

row_list = []
solver='liblinear'
penalty='l1'
dual=False
fit_intercept=True
multi_class='auto'

for C in linear_params['C']:
    for intercept_scaling in linear_params['intercept_scaling']:
        for verbose in linear_params['verbose']:
            auc = np.median(
                cross_val_score(
                    LogisticRegression(max_iter=1000, solver=solver, penalty=penalty, dual=dual,
                                       fit_intercept=fit_intercept, multi_class=multi_class, C=C,
                                       intercept_scaling=intercept_scaling, verbose=verbose),
                    X_train, y_train.values.ravel(), scoring='roc_auc', cv = 10))
            row_list.append({'C': C, 'intercept_scaling': intercept_scaling, 'verbose': verbose, 'auc': auc})

linear_scores = pd.DataFrame(row_list)
linear_scores.sort_values(by="auc", ascending=False)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear

Unnamed: 0,C,intercept_scaling,verbose,auc
11,1.0,1.0,1,0.679555
10,1.0,1.0,0,0.679555
13,1.0,2.0,1,0.676907
12,1.0,2.0,0,0.676907
15,1.0,5.0,1,0.675847
14,1.0,5.0,0,0.675847
26,5.0,1.0,0,0.659504
27,5.0,1.0,1,0.659504
24,5.0,0.5,0,0.659504
25,5.0,0.5,1,0.659504
