In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
all_df = pd.read_csv('cancer.csv')

# Hold out 30% of the rows for evaluation. The fixed random_state makes the
# shuffle, and therefore the split, repeatable from run to run.
train_df, test_df = train_test_split(
    all_df,
    test_size=0.3,
    random_state=42,
)

# Report (rows, columns) of each partition as a quick sanity check.
print("Train DataFrame shape:", train_df.shape)
print("Test DataFrame shape:", test_df.shape)

Train DataFrame shape: (600, 36)
Test DataFrame shape: (258, 36)


In [3]:
# Print each column's dtype and non-null count for the training split.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 600 entries, 28 to 102
Data columns (total 36 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Age                                 600 non-null    int64 
 1   Number of sexual partners           600 non-null    object
 2   First sexual intercourse            600 non-null    object
 3   Num of pregnancies                  600 non-null    object
 4   Smokes                              600 non-null    object
 5   Smokes (years)                      600 non-null    object
 6   Smokes (packs/year)                 600 non-null    object
 7   Hormonal Contraceptives             600 non-null    object
 8   Hormonal Contraceptives (years)     600 non-null    object
 9   IUD                                 600 non-null    object
 10  IUD (years)                         600 non-null    object
 11  STDs                                600 non-null    object

In [4]:
# Columns that the counting and scoring code uses as features.
testingColumns = [
    "Dx:Cancer",
    "Dx:CIN",
    "Dx",
    "Hinselmann",
    "Schiller",
    "Citology",
]

# Partition the training rows by their Biopsy label.
biopsyLabel = train_df['Biopsy']
cancer_df = train_df[biopsyLabel == True]
nonCancer_df = train_df[biopsyLabel == False]
    
def getNaiveCounts(columns=None, cancerRows=None, nonCancerRows=None):
    """Count, per feature column, the rows of each class whose value equals 1.

    Called with no arguments this behaves as before and reads the
    module-level ``testingColumns``, ``cancer_df`` and ``nonCancer_df``.
    Passing the inputs explicitly lets the function be reused on other
    frames or column sets without touching notebook state.

    Parameters
    ----------
    columns : iterable of str, optional
        Columns to count. Defaults to the module-level ``testingColumns``.
    cancerRows : pandas.DataFrame, optional
        Rows of the positive class. Defaults to the module-level ``cancer_df``.
    nonCancerRows : pandas.DataFrame, optional
        Rows of the negative class. Defaults to the module-level
        ``nonCancer_df``.

    Returns
    -------
    tuple of (dict, dict)
        ``(naive_cancer, naive_noncancer)``. Each dict maps a column name to
        the number of rows in that class where the column equals 1.
    """
    # Defaults are resolved at call time so the function always sees the
    # current notebook variables rather than whatever existed at def time.
    if columns is None:
        columns = testingColumns
    if cancerRows is None:
        cancerRows = cancer_df
    if nonCancerRows is None:
        nonCancerRows = nonCancer_df

    def countOnes(frame, column):
        # Summing the boolean mask counts matches without materialising a
        # filtered copy; int() keeps the values plain Python ints so the
        # printed dictionaries look the same as before.
        return int((frame[column] == 1).sum())

    naive_cancer = {column: countOnes(cancerRows, column) for column in columns}
    naive_noncancer = {column: countOnes(nonCancerRows, column) for column in columns}
    return naive_cancer, naive_noncancer

# Tally the per-column counts for both classes and show them, cancer first.
naive_cancer, naive_noncancer = getNaiveCounts()
print(naive_cancer, naive_noncancer, sep="\n")

{'Dx:Cancer': 4, 'Dx:CIN': 3, 'Dx': 6, 'Hinselmann': 16, 'Schiller': 35, 'Citology': 14}
{'Dx:Cancer': 8, 'Dx:CIN': 4, 'Dx': 10, 'Hinselmann': 9, 'Schiller': 18, 'Citology': 22}


In [5]:
# Class priors: each class's share of the training rows.
priorCancerProbability = cancer_df.shape[0] / train_df.shape[0]
priorNonCancerProbability = nonCancer_df.shape[0] / train_df.shape[0]

# Per-class probability tables, filled in place by setProbabilities().
cancerProbabilities = {}
noncancerProbabilities = {}


def _normalizeCounts(counts):
    """Return each key's share of the summed counts (all 0.0 if the sum is 0)."""
    total = sum(counts.values())
    if total == 0:
        # Nothing was observed for this class. A share of 0.0 matches what a
        # zero count yields when the total is non-zero, and avoids dividing
        # by zero.
        return {column: 0.0 for column in counts}
    return {column: count / total for column, count in counts.items()}


def setProbabilities(cancerCounts=None, noncancerCounts=None):
    """Fill the two probability tables from per-column counts.

    For each class, a column's probability is its count divided by the sum
    of the counts of all columns in that class. Results are written into the
    module-level ``cancerProbabilities`` and ``noncancerProbabilities``
    dicts; existing keys are overwritten and other keys are left alone.

    Parameters
    ----------
    cancerCounts : dict, optional
        Column name -> count for the positive class. Defaults to the
        module-level ``naive_cancer``.
    noncancerCounts : dict, optional
        Column name -> count for the negative class. Defaults to the
        module-level ``naive_noncancer``.

    Returns
    -------
    None
    """
    # Resolve the defaults at call time so the current notebook values are used.
    if cancerCounts is None:
        cancerCounts = naive_cancer
    if noncancerCounts is None:
        noncancerCounts = naive_noncancer

    cancerProbabilities.update(_normalizeCounts(cancerCounts))
    noncancerProbabilities.update(_normalizeCounts(noncancerCounts))
    

# Populate both probability tables, then show them, cancer first.
setProbabilities()
print(cancerProbabilities, noncancerProbabilities, sep="\n")

{'Dx:Cancer': 0.05128205128205128, 'Dx:CIN': 0.038461538461538464, 'Dx': 0.07692307692307693, 'Hinselmann': 0.20512820512820512, 'Schiller': 0.44871794871794873, 'Citology': 0.1794871794871795}
{'Dx:Cancer': 0.11267605633802817, 'Dx:CIN': 0.056338028169014086, 'Dx': 0.14084507042253522, 'Hinselmann': 0.1267605633802817, 'Schiller': 0.2535211267605634, 'Citology': 0.30985915492957744}


In [6]:
def getScore(dataRow, predictCancer, prior=None, probabilities=None, columns=None):
    """Return the unnormalised score of one row under one class.

    The score is the class prior multiplied, for every feature column, by
    ``probability[column] ** dataRow[column]``. Because ``pow(p, 0) == 1``,
    a column whose value in the row is 0 leaves the score unchanged.

    Parameters
    ----------
    dataRow : mapping-like
        Anything indexable by column name (e.g. a row from ``iterrows()``).
    predictCancer : bool
        Selects which class's defaults are used: truthy for the cancer
        prior/table, falsy for the non-cancer ones.
    prior : float, optional
        Class prior. Defaults to the module-level ``priorCancerProbability``
        or ``priorNonCancerProbability`` according to ``predictCancer``.
    probabilities : dict, optional
        Column name -> probability for the class. Defaults to the
        module-level ``cancerProbabilities`` or ``noncancerProbabilities``
        according to ``predictCancer``.
    columns : iterable of str, optional
        Feature columns to include. Defaults to the module-level
        ``testingColumns``.

    Returns
    -------
    float
        The product described above.
    """
    # Both classes share one formula; only the prior and table differ, so
    # pick them once instead of duplicating the loop per branch.
    if prior is None:
        prior = priorCancerProbability if predictCancer else priorNonCancerProbability
    if probabilities is None:
        probabilities = cancerProbabilities if predictCancer else noncancerProbabilities
    if columns is None:
        columns = testingColumns

    calculation = prior
    for column in columns:
        calculation = calculation * pow(probabilities[column], dataRow[column])
    return calculation

In [7]:
# A row is predicted positive (1) only when its cancer score is strictly
# greater than its non-cancer score; ties fall to negative (0). Count the
# held-out rows whose prediction equals the recorded Biopsy label.
correctPredictions = sum(
    int(dataRow['Biopsy'] == int(getScore(dataRow, True) > getScore(dataRow, False)))
    for _, dataRow in test_df.iterrows()
)

print("Test Accuracy: "+str(correctPredictions/len(test_df) * 100))

Test Accuracy: 94.18604651162791


In [8]:
# Same decision rule as for the held-out rows, applied to the training rows:
# predict positive (1) only when the cancer score is strictly greater than the
# non-cancer score, then count predictions that equal the Biopsy label.
correctPredictions = sum(
    int(dataRow['Biopsy'] == int(getScore(dataRow, True) > getScore(dataRow, False)))
    for _, dataRow in train_df.iterrows()
)

print("Train Accuracy: "+str(correctPredictions/len(train_df) * 100))

Train Accuracy: 93.33333333333333
