# Wstępna obróbka

In [2]:
import pandas as pd
import numpy as np
import math
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
# Load the data set and drop the columns listed below.
columns_to_drop = [
    'STDs:cervical condylomatosis',
    'STDs:vaginal condylomatosis',
    'STDs:pelvic inflammatory disease',
    'STDs:genital herpes',
    'STDs:molluscum contagiosum',
    'STDs:AIDS',
    'STDs:Hepatitis B',
    'STDs:HPV',
    'Dx:CIN',
]
dane = pd.read_csv('cervical-cancer_csv.csv').drop(columns=columns_to_drop)

# Filling missing values and encoding categorical variables
def column_nodata(df, column_name):
    """Flag the missing values of a column and fill them with 0, in place.

    A new column named ``column_name + "_null"`` is added to ``df``; it holds
    1 where the original value was missing and 0 elsewhere. The missing values
    of ``column_name`` itself are then replaced by 0.

    Args:
        df: DataFrame to modify in place.
        column_name: Name of an existing column of ``df``.

    Returns:
        None; ``df`` is modified in place.
    """
    # Vectorised null test instead of calling a Python lambda per element.
    df[column_name + "_null"] = df[column_name].isnull().astype("int64")
    df[column_name] = df[column_name].fillna(0)

def replace_in_column(df, column_name, src, dst):
    """Replace values in one column of ``df``, in place.

    Args:
        df: DataFrame to modify in place.
        column_name: Name of the column to recode.
        src: Value(s) to look for (passed to ``Series.replace`` as ``to_replace``).
        dst: Replacement value.

    Returns:
        None; ``df`` is modified in place.
    """
    recoded = df[column_name].replace(src, dst)
    df[column_name] = recoded

# Merge values 3 and 4 of 'STDs (number)' into 2, and values 2 and 3 of
# 'STDs: Number of diagnosis' into 1.
replace_in_column(dane, 'STDs (number)', [3, 4], 2)
replace_in_column(dane, 'STDs: Number of diagnosis', [2, 3], 1)

# Columns that get a "<name>_null" indicator and whose missing values become 0.
nodata_categories = [
    'Smokes',
    'Hormonal Contraceptives',
    'IUD',
    'STDs',
    'STDs (number)',
    'STDs:condylomatosis',
    'STDs:vulvo-perineal condylomatosis',
    'STDs:syphilis',
    'STDs:HIV',
]
for category in nodata_categories:
    column_nodata(dane, category)

# One-hot encode 'STDs (number)' and swap the original column for the dummies.
std_number_dummies = pd.get_dummies(dane['STDs (number)'], prefix='STDs_')
dane = pd.concat([dane.drop(columns=['STDs (number)']), std_number_dummies], axis=1)

# Missing values - dropping observations
# A row is removed when any of the columns below is missing.
num2 = ['Smokes (years)', 'Smokes (packs/year)', 'First sexual intercourse', 'Number of sexual partners']

# Vectorised mask instead of testing every cell with math.isnan via .loc[i, j];
# it also does not depend on the frame having a default 0..n-1 index.
missing_mask = dane[num2].isna().any(axis=1)
narows = dane.index[missing_mask].tolist()  # index labels of the rows to drop

dane = dane.drop(narows)

# Renumber the remaining rows 0..n-1: later code assigns a DataFrame with a
# default index back into `dane`, which aligns on index labels.
dane.index = range(len(dane))


# Standardisation
numerical = [
    'Age',
    'Number of sexual partners',
    'First sexual intercourse',
    'Num of pregnancies',
    'Smokes (years)',
    'Smokes (packs/year)',
    'Hormonal Contraceptives (years)',
    'IUD (years)',
    'STDs: Time since first diagnosis',
    'STDs: Time since last diagnosis',
]

scaler = StandardScaler()
scaler.fit(dane[numerical])
dane_scaled = scaler.transform(dane[numerical])

# The scaled values come back as a plain array; wrap them in a frame with the
# same column names (default 0..n-1 index) and write them back into `dane`.
d2 = pd.DataFrame(data=dane_scaled, columns=numerical)
dane[numerical] = d2



# Missing values - imputation
# These columns are filled with 0 after they have been standardised above.
imputed_columns = ['Num of pregnancies', 'Hormonal Contraceptives (years)', 'IUD (years)']
imp = dane[imputed_columns]
dane[imputed_columns] = imp.fillna(0)



            
# Building a single target
targets = ['Hinselmann', 'Schiller', 'Citology', 'Biopsy']


def has_cancer(row):
    """Return 1 if any of the ``targets`` entries of ``row`` equals 1, else 0.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by the
            names listed in ``targets``.

    Returns:
        int: 1 when at least one target value equals 1, otherwise 0.
    """
    return 1 if any(row[name] == 1 for name in targets) else 0

# Collapse the individual target columns into a single 'cancer' column.
dane['cancer'] = dane.apply(has_cancer, axis=1)
dane = dane.drop(columns=targets)

# Variant without the two diagnosis-time columns
diagnosis_time_columns = ['STDs: Time since first diagnosis', 'STDs: Time since last diagnosis']
dane_without = dane.drop(diagnosis_time_columns, axis=1)


# Ujednolicone funkcje dla wszystkich modeli

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# Splitting the data set into training and test parts
def default_split(X, y, test_size=0.2, random_state=2137):
    """Split features and target into training and test parts.

    Thin wrapper around ``train_test_split`` so that every model in the
    notebook is evaluated on the same split.

    Args:
        X: Feature table.
        y: Target values, row-aligned with ``X``.
        test_size: Passed to ``train_test_split``; defaults to 0.2, the value
            that used to be hard-coded.
        random_state: Seed passed to ``train_test_split``; defaults to 2137,
            the value that used to be hard-coded.

    Returns:
        The result of ``train_test_split``, unpacked by the callers as
        ``X_train, X_test, y_train, y_test``.
    """
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

# Scoring
def scoring(y_test, y_predicted):
    """Print classification metrics for a set of predictions.

    Prints accuracy, precision, recall, F1 and ROC AUC, one per line.

    Args:
        y_test: True labels.
        y_predicted: Predicted labels, as returned by ``model.predict``.

    Returns:
        None; the metrics are only printed.
    """
    print("ACC = ", accuracy_score(y_test, y_predicted))
    print("PREC = ", precision_score(y_test, y_predicted))
    print("RECALL = ", recall_score(y_test, y_predicted))
    print("F1 = ", f1_score(y_test, y_predicted))
    # This value is the ROC AUC, not a false-positive rate, so it is labelled
    # as such (it used to be printed under the label "FPR").
    # NOTE(review): it is computed from hard class predictions rather than
    # from scores/probabilities - confirm that this is intended.
    print("ROC AUC = ", roc_auc_score(y_test, y_predicted))

# Separating the target
def extract_y(data):
    """Split a frame into features and the ``cancer`` target.

    Args:
        data: DataFrame containing a ``"cancer"`` column.

    Returns:
        tuple: ``(features, target)`` where ``features`` is ``data`` without
        the ``"cancer"`` column and ``target`` is a one-column DataFrame
        holding ``"cancer"``. ``data`` itself is not modified.
    """
    target = data[["cancer"]]
    features = data.drop(columns=["cancer"])
    return features, target

# Naive_Bayes

Dane bez kolumn diagnozy

In [4]:
# Data preparation
X, y = extract_y(dane_without)
X_train, X_test, y_train, y_test = default_split(X, y)

# Shapes of the full, training and test feature tables
print(*(part.shape for part in (X, X_train, X_test)))

(792, 32) (633, 32) (159, 32)


In [5]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian naive Bayes on the training part and score it on the test part.
model = GaussianNB()
# extract_y returns the target as a one-column DataFrame; flatten it to 1-D,
# the shape the estimator's fit expects for y.
model.fit(X_train, y_train.values.ravel())
y_predicted = model.predict(X_test)
scoring(y_test, y_predicted)



ACC =  0.23270440251572327
PREC =  0.10606060606060606
RECALL =  0.7777777777777778
F1 =  0.18666666666666665
FPR =  0.47044917257683216


W kolumnie diagnozy NA -> -1 po standaryzacji

In [6]:
# Data preparation
features, y = extract_y(dane)
# Any value still missing at this point is replaced by -1.
X = features.fillna(-1)

X_train, X_test, y_train, y_test = default_split(X, y)

# Shapes of the full, training and test feature tables
print(*(part.shape for part in (X, X_train, X_test)))

(792, 34) (633, 34) (159, 34)


In [7]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian naive Bayes on the training part and score it on the test part.
model = GaussianNB()
# extract_y returns the target as a one-column DataFrame; flatten it to 1-D,
# the shape the estimator's fit expects for y.
model.fit(X_train, y_train.values.ravel())
y_predicted = model.predict(X_test)
scoring(y_test, y_predicted)



ACC =  0.2389937106918239
PREC =  0.10687022900763359
RECALL =  0.7777777777777778
F1 =  0.1879194630872483
FPR =  0.47399527186761237


W kolumnie diagnozy NA -> -1 przed standaryzacją

In [8]:
import pandas as pd
import numpy as np
import math
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
# Reload the data set and drop the columns listed below.
columns_to_drop = [
    'STDs:cervical condylomatosis',
    'STDs:vaginal condylomatosis',
    'STDs:pelvic inflammatory disease',
    'STDs:genital herpes',
    'STDs:molluscum contagiosum',
    'STDs:AIDS',
    'STDs:Hepatitis B',
    'STDs:HPV',
    'Dx:CIN',
]
dane = pd.read_csv('cervical-cancer_csv.csv').drop(columns=columns_to_drop)

# Filling missing values and encoding categorical variables
def column_nodata(df, column_name):
    """Flag the missing values of a column and fill them with 0, in place.

    A new column named ``column_name + "_null"`` is added to ``df``; it holds
    1 where the original value was missing and 0 elsewhere. The missing values
    of ``column_name`` itself are then replaced by 0.

    Args:
        df: DataFrame to modify in place.
        column_name: Name of an existing column of ``df``.

    Returns:
        None; ``df`` is modified in place.
    """
    # Vectorised null test instead of calling a Python lambda per element.
    df[column_name + "_null"] = df[column_name].isnull().astype("int64")
    df[column_name] = df[column_name].fillna(0)

def replace_in_column(df, column_name, src, dst):
    """Replace values in one column of ``df``, in place.

    Args:
        df: DataFrame to modify in place.
        column_name: Name of the column to recode.
        src: Value(s) to look for (passed to ``Series.replace`` as ``to_replace``).
        dst: Replacement value.

    Returns:
        None; ``df`` is modified in place.
    """
    recoded = df[column_name].replace(src, dst)
    df[column_name] = recoded

# Merge values 3 and 4 of 'STDs (number)' into 2, and values 2 and 3 of
# 'STDs: Number of diagnosis' into 1.
replace_in_column(dane, 'STDs (number)', [3, 4], 2)
replace_in_column(dane, 'STDs: Number of diagnosis', [2, 3], 1)

# Columns that get a "<name>_null" indicator and whose missing values become 0.
nodata_categories = [
    'Smokes',
    'Hormonal Contraceptives',
    'IUD',
    'STDs',
    'STDs (number)',
    'STDs:condylomatosis',
    'STDs:vulvo-perineal condylomatosis',
    'STDs:syphilis',
    'STDs:HIV',
]
for category in nodata_categories:
    column_nodata(dane, category)

# One-hot encode 'STDs (number)' and swap the original column for the dummies.
std_number_dummies = pd.get_dummies(dane['STDs (number)'], prefix='STDs_')
dane = pd.concat([dane.drop(columns=['STDs (number)']), std_number_dummies], axis=1)

# Missing values - dropping observations
# A row is removed when any of the columns below is missing.
num2 = ['Smokes (years)', 'Smokes (packs/year)', 'First sexual intercourse', 'Number of sexual partners']

# Vectorised mask instead of testing every cell with math.isnan via .loc[i, j];
# it also does not depend on the frame having a default 0..n-1 index.
missing_mask = dane[num2].isna().any(axis=1)
narows = dane.index[missing_mask].tolist()  # index labels of the rows to drop

dane = dane.drop(narows)

# Renumber the remaining rows 0..n-1: later code assigns a DataFrame with a
# default index back into `dane`, which aligns on index labels.
dane.index = range(len(dane))

# Replace missing diagnosis times with -1 before the columns are standardised.
diagnosis_time_columns = ['STDs: Time since first diagnosis', 'STDs: Time since last diagnosis']
imp = dane[diagnosis_time_columns]
dane[diagnosis_time_columns] = imp.fillna(-1)

# Standardisation
numerical = [
    'Age',
    'Number of sexual partners',
    'First sexual intercourse',
    'Num of pregnancies',
    'Smokes (years)',
    'Smokes (packs/year)',
    'Hormonal Contraceptives (years)',
    'IUD (years)',
    'STDs: Time since first diagnosis',
    'STDs: Time since last diagnosis',
]

scaler = StandardScaler()
scaler.fit(dane[numerical])
dane_scaled = scaler.transform(dane[numerical])

# The scaled values come back as a plain array; wrap them in a frame with the
# same column names (default 0..n-1 index) and write them back into `dane`.
d2 = pd.DataFrame(data=dane_scaled, columns=numerical)
dane[numerical] = d2



# Missing values - imputation
# These columns are filled with 0 after they have been standardised above.
imputed_columns = ['Num of pregnancies', 'Hormonal Contraceptives (years)', 'IUD (years)']
imp = dane[imputed_columns]
dane[imputed_columns] = imp.fillna(0)



            
# Building a single target
targets = ['Hinselmann', 'Schiller', 'Citology', 'Biopsy']


def has_cancer(row):
    """Return 1 if any of the ``targets`` entries of ``row`` equals 1, else 0.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by the
            names listed in ``targets``.

    Returns:
        int: 1 when at least one target value equals 1, otherwise 0.
    """
    return 1 if any(row[name] == 1 for name in targets) else 0

# Collapse the individual target columns into a single 'cancer' column.
dane['cancer'] = dane.apply(has_cancer, axis=1)
dane = dane.drop(columns=targets)


In [9]:
# Data preparation
X, y = extract_y(dane)
X_train, X_test, y_train, y_test = default_split(X, y)

# Shapes of the full, training and test feature tables
print(*(part.shape for part in (X, X_train, X_test)))

(792, 34) (633, 34) (159, 34)


In [10]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian naive Bayes on the training part and score it on the test part.
model = GaussianNB()
# extract_y returns the target as a one-column DataFrame; flatten it to 1-D,
# the shape the estimator's fit expects for y.
model.fit(X_train, y_train.values.ravel())
y_predicted = model.predict(X_test)
scoring(y_test, y_predicted)



ACC =  0.2389937106918239
PREC =  0.10687022900763359
RECALL =  0.7777777777777778
F1 =  0.1879194630872483
FPR =  0.47399527186761237
