In [1]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
import os
import numpy as np
import statsmodels.api as sm

In [2]:
# load data
# download data from https://archive.ics.uci.edu/ml/machine-learning-databases/00383/
# cervical cancer dataset
# and the .csv file under your working directory

filepath = os.getcwd()
filename = os.path.join(filepath, "risk_factors_cervical_cancer.csv")

In [3]:
df = pd.read_csv(filename)
df.columns # check columns

Index(['Age', 'Number of sexual partners', 'First sexual intercourse',
       'Num of pregnancies', 'Smokes', 'Smokes (years)', 'Smokes (packs/year)',
       'Hormonal Contraceptives', 'Hormonal Contraceptives (years)', 'IUD',
       'IUD (years)', 'STDs', 'STDs (number)', 'STDs:condylomatosis',
       'STDs:cervical condylomatosis', 'STDs:vaginal condylomatosis',
       'STDs:vulvo-perineal condylomatosis', 'STDs:syphilis',
       'STDs:pelvic inflammatory disease', 'STDs:genital herpes',
       'STDs:molluscum contagiosum', 'STDs:AIDS', 'STDs:HIV',
       'STDs:Hepatitis B', 'STDs:HPV', 'STDs: Number of diagnosis',
       'STDs: Time since first diagnosis', 'STDs: Time since last diagnosis',
       'Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx', 'Hinselmann', 'Schiller',
       'Citology', 'Biopsy'],
      dtype='object')

In [4]:
# check shape
df.shape

(858, 36)

In [5]:
# check the data in each column
df['Hormonal Contraceptives'].unique() # binary

array(['0.0', '1.0', '?'], dtype=object)

In [6]:
df['Smokes'].unique() # binary

array(['0.0', '1.0', '?'], dtype=object)

In [7]:
df['Num of pregnancies'].unique() # continous

array(['1.0', '4.0', '2.0', '6.0', '3.0', '5.0', '?', '8.0', '7.0', '0.0',
       '11.0', '10.0'], dtype=object)

In [8]:
df['IUD'].unique() # binary

array(['0.0', '1.0', '?'], dtype=object)

In [9]:
df['STDs: Number of diagnosis'].unique() # continous

array([0, 1, 3, 2], dtype=int64)

In [10]:
# clean all "?"
df_clean = df.loc[df['Hormonal Contraceptives']!= '?', :]
df_clean = df_clean.loc[df_clean['Smokes'] != '?',:]
df_clean = df_clean.loc[df_clean['Num of pregnancies'] != '?',:]
df_clean = df_clean.loc[df_clean['IUD']!= '?',:]
df_clean = df_clean.loc[df_clean['STDs: Number of diagnosis'] != '?', :]
df_clean['intercept'] = 1
df_clean.shape

(689, 37)

In [11]:
# We will use the following features: Hormonal contraceptives (y/n), 
# Smokes(y/n), Num. of pregnancies, Num. of diagnosed STDs, and Intrauterine device (y/n)
X = np.array(df_clean[['intercept','Hormonal Contraceptives', 'Smokes','Num of pregnancies','STDs: Number of diagnosis','IUD']],dtype=float)

In [12]:
y = np.array(df_clean['Biopsy'],dtype=float)

In [13]:
clf = LogisticRegression(random_state=0).fit(X, y)

In [14]:
clf.coef_

array([[-1.31724857e-04,  5.29065636e-02,  2.70049974e-01,
         6.66690574e-02,  7.68952491e-01,  5.66191829e-01]])

In [15]:
clf.intercept_

array([-3.03391532])

In [21]:
print(clf.coef_[0,5])

0.5661918289800142


In [22]:
# use statsmodels
log_reg = sm.Logit(y, X).fit()
print(log_reg.summary())

Optimization terminated successfully.
         Current function value: 0.241997
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  689
Model:                          Logit   Df Residuals:                      683
Method:                           MLE   Df Model:                            5
Date:                Tue, 28 Dec 2021   Pseudo R-squ.:                 0.02811
Time:                        10:58:13   Log-Likelihood:                -166.74
converged:                       True   LL-Null:                       -171.56
Covariance Type:            nonrobust   LLR p-value:                   0.08593
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -3.0657      0.357     -8.581      0.000      -3.766      -2.365
x1             0.0642      0.