# Exploring the Data

The cell below loads the necessary libraries and the raw data, then prints the first record so we can take a look at it. Note that the last 4 columns (Hinselmann, Schiller, Citology and Biopsy) are our target variables; each corresponds to a different kind of cancer diagnostic method. Unlike many classification tasks that contain only one target variable, this one has 4. All other columns are features describing each data point.

In [166]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer
import matplotlib.pyplot as plt
from IPython.display import display # Allows the use of display() for DataFrames

%matplotlib inline

# Read the raw diagnostic records; the file encodes missing entries as "?",
# so those cells are parsed as NaN
data = pd.read_csv(
    "kag_risk_factors_cervical_cancer.csv",
    na_values='?',
)

# Show only the first record
first_record = data.head(n=1)
display(first_record)

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0,0,0,0,0,0,0,0


In [167]:
# Total number of records (rows) in the dataset
n_records = len(data)

# Negative (0) results, counted separately for each of the 4 target variables
n_Hinse_neg = data["Hinselmann"].eq(0).sum()
n_Schl_neg = data["Schiller"].eq(0).sum()
n_Cito_neg = data["Citology"].eq(0).sum()
n_Biop_neg = data["Biopsy"].eq(0).sum()

# Positive (1) results, counted separately for each of the 4 target variables
n_Hinse_pos = data["Hinselmann"].eq(1).sum()
n_Schl_pos = data["Schiller"].eq(1).sum()
n_Cito_pos = data["Citology"].eq(1).sum()
n_Biop_pos = data["Biopsy"].eq(1).sum()

# Share of positive results for each test, in percent of the records
# whose result for that test is either 0 or 1
Per_Hinse_pos = n_Hinse_pos * 100.0 / (n_Hinse_neg + n_Hinse_pos)
Per_Schl_pos = n_Schl_pos * 100.0 / (n_Schl_neg + n_Schl_pos)
Per_Cito_pos = n_Cito_pos * 100.0 / (n_Cito_neg + n_Cito_pos)
Per_Biop_pos = n_Biop_pos * 100.0 / (n_Biop_neg + n_Biop_pos)

# Report the record count and the positive rate of every test
print("Number of records: " + str(n_records))
print(
    "Percentage of individuals whose Hinselmann test result is positive: {:.2f}%".format(Per_Hinse_pos)
)
print(
    "Percentage of individuals whose Schiller test result is positive: {:.2f}%".format(Per_Schl_pos)
)
print(
    "Percentage of individuals whose Citology test result is positive: {:.2f}%".format(Per_Cito_pos)
)
print(
    "Percentage of individuals whose Biopsy test result is positive: {:.2f}%".format(Per_Biop_pos)
)

Number of records: 858
Percentage of individuals whose Hinselmann test result is positive: 4.08%
Percentage of individuals whose Schiller test result is positive: 8.62%
Percentage of individuals whose Citology test result is positive: 5.13%
Percentage of individuals whose Biopsy test result is positive: 6.41%


In [168]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
# of the original dataset
summary_stats = data.describe()
display(summary_stats)

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
count,858.0,832.0,851.0,802.0,845.0,845.0,845.0,750.0,750.0,741.0,...,71.0,71.0,858.0,858.0,858.0,858.0,858.0,858.0,858.0,858.0
mean,26.820513,2.527644,16.9953,2.275561,0.145562,1.219721,0.453144,0.641333,2.256419,0.112011,...,6.140845,5.816901,0.020979,0.01049,0.020979,0.027972,0.040793,0.086247,0.051282,0.064103
std,8.497948,1.66776,2.803355,1.447414,0.352876,4.089017,2.22661,0.479929,3.764254,0.315593,...,5.895024,5.755271,0.143398,0.101939,0.143398,0.164989,0.197925,0.280892,0.220701,0.245078
min,13.0,1.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,2.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,25.0,2.0,17.0,2.0,0.0,0.0,0.0,1.0,0.5,0.0,...,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,32.0,3.0,18.0,3.0,0.0,0.0,0.0,1.0,3.0,0.0,...,8.0,7.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,84.0,28.0,32.0,11.0,1.0,37.0,37.0,1.0,30.0,1.0,...,22.0,22.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [169]:
# Skewness of each column (axis=0, i.e. computed down the rows, is the default)
data.skew()

Age                                    1.394279
Number of sexual partners              5.454649
First sexual intercourse               1.564375
Num of pregnancies                     1.423514
Smokes                                 2.013621
Smokes (years)                         4.465484
Smokes (packs/year)                    9.308806
Hormonal Contraceptives               -0.590551
Hormonal Contraceptives (years)        2.626438
IUD                                    2.465451
IUD (years)                            5.001759
STDs                                   2.583687
STDs (number)                          3.402849
STDs:condylomatosis                    3.772582
STDs:cervical condylomatosis           0.000000
STDs:vaginal condylomatosis           13.638036
STDs:vulvo-perineal condylomatosis     3.824978
STDs:syphilis                          6.246054
STDs:pelvic inflammatory disease      27.440845
STDs:genital herpes                   27.440845
STDs:molluscum contagiosum            27

# Preparing the Data
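Based on the statistics above, we treat the continuous variables in our data as not too heavily skewed, so we don't need to worry too much about algorithms being sensitive to skewed distributions. (Note, though, that a few features, such as Number of sexual partners with a skewness of 5.45 and Smokes (packs/year) with 9.31, show a large positive skewness and may deserve a transformation if a skew-sensitive algorithm is used.) The features do, however, contain missing values that need to be taken care of, so we'll start by looking at the number of NaN values (if any) in each variable of our data set.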

In [170]:
# Count the missing (NaN) entries in every column
data.isna().sum()

Age                                     0
Number of sexual partners              26
First sexual intercourse                7
Num of pregnancies                     56
Smokes                                 13
Smokes (years)                         13
Smokes (packs/year)                    13
Hormonal Contraceptives               108
Hormonal Contraceptives (years)       108
IUD                                   117
IUD (years)                           117
STDs                                  105
STDs (number)                         105
STDs:condylomatosis                   105
STDs:cervical condylomatosis          105
STDs:vaginal condylomatosis           105
STDs:vulvo-perineal condylomatosis    105
STDs:syphilis                         105
STDs:pelvic inflammatory disease      105
STDs:genital herpes                   105
STDs:molluscum contagiosum            105
STDs:AIDS                             105
STDs:HIV                              105
STDs:Hepatitis B                  

For continuous variables (e.g. Smokes (years)), we'll impute missing values with the median. For categorical variables (e.g. Smokes), we'll treat a missing value as an additional category and denote it with a number not otherwise used by that feature (e.g. if a feature takes the 4 possible values 1, 2, 3 and 4, then 5 would be used to impute its missing values).

In addition, 2 variables ('STDs: Time since first diagnosis' and 'STDs: Time since last diagnosis') will be removed: they suffer from a great deal of missing values (only 71 of the 858 records have a value), which makes them of little value to our analysis.

In [171]:
# The 4 diagnostic results are the target variables
target_columns = ['Hinselmann', 'Schiller', 'Citology', 'Biopsy']

# Targets: just those 4 columns; features: every remaining column
tar_var_raw = data[target_columns]
features_raw = data.drop(target_columns, axis=1)

# Sanity check: the target columns should no longer appear among the features
features_raw.head(1)

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs:HIV,STDs:Hepatitis B,STDs:HPV,STDs: Number of diagnosis,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,,,0,0,0,0


In [172]:
# Drop the 2 'time since diagnosis' variables.
# The result is re-assigned (rather than dropped in place) and missing labels
# are ignored, so this cell is idempotent: re-running it after the columns
# are already gone no longer raises a KeyError.
features_raw = features_raw.drop(
    ['STDs: Time since first diagnosis', 'STDs: Time since last diagnosis'],
    axis=1,
    errors='ignore',
)

In [173]:
# Impute missing values for boolean variables.
# A missing answer is kept as its own extra category, encoded with a number
# that is distinct from the two boolean values 0 and 1.
MISSING_BOOL_CODE = 2

bool_var = [
    'Smokes',
    'Hormonal Contraceptives',
    'IUD',
    'STDs',
    'STDs:condylomatosis',
    'STDs:cervical condylomatosis',
    'STDs:vaginal condylomatosis',
    'STDs:vulvo-perineal condylomatosis',
    'STDs:syphilis',
    'STDs:pelvic inflammatory disease',
    'STDs:genital herpes',
    'STDs:molluscum contagiosum',
    'STDs:AIDS',
    'STDs:HIV',
    'STDs:Hepatitis B',
    'STDs:HPV',
]

# fillna is the direct way to substitute NaN; no regex matching is involved,
# so replace(..., regex=True) is not needed for this.
features_raw[bool_var] = features_raw[bool_var].fillna(MISSING_BOOL_CODE)

In [176]:
# Use median to impute continuous missing data
Continuous_var = [
    'Number of sexual partners',
    'First sexual intercourse',
    'Num of pregnancies',
    'Smokes (years)',
    'Smokes (packs/year)',
    'Hormonal Contraceptives (years)',
    'IUD (years)',
    'STDs (number)',
]

for var in Continuous_var:
    # Assign the filled column back to the frame. Calling
    # fillna(..., inplace=True) on features_raw[var] is chained assignment:
    # it can operate on a temporary object, and with pandas Copy-on-Write
    # it never updates features_raw.
    features_raw[var] = features_raw[var].fillna(features_raw[var].median())

# Check whether there's any missing value left
features_raw.isnull().sum()

Age                                   0
Number of sexual partners             0
First sexual intercourse              0
Num of pregnancies                    0
Smokes                                0
Smokes (years)                        0
Smokes (packs/year)                   0
Hormonal Contraceptives               0
Hormonal Contraceptives (years)       0
IUD                                   0
IUD (years)                           0
STDs                                  0
STDs (number)                         0
STDs:condylomatosis                   0
STDs:cervical condylomatosis          0
STDs:vaginal condylomatosis           0
STDs:vulvo-perineal condylomatosis    0
STDs:syphilis                         0
STDs:pelvic inflammatory disease      0
STDs:genital herpes                   0
STDs:molluscum contagiosum            0
STDs:AIDS                             0
STDs:HIV                              0
STDs:Hepatitis B                      0
STDs:HPV                              0


In [177]:
# Preview the first two records of the feature frame
features_raw.head(2)

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs:molluscum contagiosum,STDs:AIDS,STDs:HIV,STDs:Hepatitis B,STDs:HPV,STDs: Number of diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0


### Shuffle and Split Data