In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
from pprint import pprint

In [43]:
# Load the data
df = pd.read_csv('data2.csv')
df.head(1)

Unnamed: 0,id,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no


In [44]:
# Preprocess:
# 1: find which columns have missing values, remove all rows with missing values:
print("Missing values:")
print(df.isnull().sum())
df = df.dropna()
print("After removing missing values:")
print(df.isnull().sum())

Missing values:
id             0
Class          0
age            0
menopause      0
tumor-size     0
inv-nodes      0
node-caps      8
deg-malig      0
breast         0
breast-quad    1
irradiat       0
dtype: int64
After removing missing values:
id             0
Class          0
age            0
menopause      0
tumor-size     0
inv-nodes      0
node-caps      0
deg-malig      0
breast         0
breast-quad    0
irradiat       0
dtype: int64


In [45]:
# 2: some values in "tumor-size" and "inv-nodes" might be parsed incoorectly, print unique values to check:
print("Unique values in tuomor-size:")
print(df['tumor-size'].unique())
print("Unique values in inv-nodes:")
print(df['inv-nodes'].unique())

# fix rule:
# tumor-size: '14-Oct' -> '10-14'; '9-May' -> '9';
# inv-nodes: '11-Sep' -> '9-11'; '5-Mar' -> '3-5', '14-Dec' -> '12-14'
df['tumor-size'] = df['tumor-size'].replace('14-Oct', '10-14').replace('9-May', '9')
df['inv-nodes'] = df['inv-nodes'].replace('11-Sep', '9-11').replace('5-Mar', '3-5').replace('14-Dec', '12-14')

print("Unique values in tuomor-size after fix:")
print(df['tumor-size'].unique())
print("Unique values in inv-nodes after fix:")
print(df['inv-nodes'].unique())

Unique values in tuomor-size:
['30-34' '20-24' '15-19' '0-4' '25-29' '50-54' '14-Oct' '40-44' '35-39'
 '9-May' '45-49']
Unique values in inv-nodes:
['0-2' '8-Jun' '11-Sep' '5-Mar' '15-17' '14-Dec' '24-26']
Unique values in tuomor-size after fix:
['30-34' '20-24' '15-19' '0-4' '25-29' '50-54' '10-14' '40-44' '35-39' '9'
 '45-49']
Unique values in inv-nodes after fix:
['0-2' '8-Jun' '9-11' '3-5' '15-17' '12-14' '24-26']


In [46]:
# 3: now convert all string types to int. keep track of the mapping for later use; merge the mappings for all columns
mappings = {}
offset = 0
for col in df.columns:
    if df[col].dtype == "object":
        mapping = {v: k for k, v in enumerate(df[col].unique())}
        mapping = {k: v + offset for k, v in mapping.items()}
        offset += len(mapping)
        df[col] = df[col].map(mapping)
        mapping = {v: f"{col}={k}" for k, v in mapping.items()}  # reverse mapping
        mappings.update(mapping)


print("Mappings:")
pprint(mappings)

print(df.head(10))

Mappings:
{0: 'Class=no-recurrence-events',
 1: 'Class=recurrence-events',
 2: 'age=30-39',
 3: 'age=40-49',
 4: 'age=60-69',
 5: 'age=50-59',
 6: 'age=70-79',
 7: 'age=20-29',
 8: 'menopause=premeno',
 9: 'menopause=ge40',
 10: 'menopause=lt40',
 11: 'tumor-size=30-34',
 12: 'tumor-size=20-24',
 13: 'tumor-size=15-19',
 14: 'tumor-size=0-4',
 15: 'tumor-size=25-29',
 16: 'tumor-size=50-54',
 17: 'tumor-size=10-14',
 18: 'tumor-size=40-44',
 19: 'tumor-size=35-39',
 20: 'tumor-size=9',
 21: 'tumor-size=45-49',
 22: 'inv-nodes=0-2',
 23: 'inv-nodes=8-Jun',
 24: 'inv-nodes=9-11',
 25: 'inv-nodes=3-5',
 26: 'inv-nodes=15-17',
 27: 'inv-nodes=12-14',
 28: 'inv-nodes=24-26',
 29: 'node-caps=no',
 30: 'node-caps=yes',
 31: 'breast=left',
 32: 'breast=right',
 33: 'breast-quad=left_low',
 34: 'breast-quad=right_up',
 35: 'breast-quad=left_up',
 36: 'breast-quad=right_low',
 37: 'breast-quad=central',
 38: 'irradiat=no',
 39: 'irradiat=yes'}
   id  Class  age  menopause  tumor-size  inv-nodes 

In [47]:
# Analysis:
# 1: using Apriori algorithm to find frequent itemsets; threshold = 0.4
# each row is treated as a set {1, 4, 5, 8} that sort.
# we first find 1-itemsets, then 2-itemsets, and so on.

I = list(range(len(mappings)))
F1 = []

# for convience, we convert df to a list of sets:
D = [set(row) for row in df.values]

# find frequent 1-itemsets:
for i in I:
    count = sum([i in row for row in D])
    if count / len(D) >= 0.4:
        F1.append({i})

F_all = [F1]
F_prev = F1
F_cur = []
k = 2
while True:
    F_cur = []
    # generate F_cur from F_prev, this is done by iterating over all pairs of sets in F_prev
    # combine them if they have k-1 elements in common(union result has k elements)
    for i in range(len(F_prev)):
        for j in range(i + 1, len(F_prev)):
            if len(F_prev[i].intersection(F_prev[j])) == k - 2:
                F_cur.append(F_prev[i].union(F_prev[j]))

    def remove_duplicates(lst):
        seen = []
        for item in lst:
            if item not in seen:
                seen.append(item)
        return seen

    F_cur = remove_duplicates(F_cur)

    # now we count the occurences of each set in F_cur
    F_prev = []
    for f in F_cur:
        count = sum([f.issubset(row) for row in D])
        if count / len(D) >= 0.4:
            F_prev.append(f)

    if len(F_prev) == 0:
        break
    k += 1
    F_all.append(F_prev)

print(F_all)

[[{0}, {1}, {2}, {3}, {8}, {9}, {22}, {29}, {31}, {32}, {38}], [{0, 22}, {0, 29}, {0, 38}, {2, 29}, {2, 38}, {3, 29}, {8, 22}, {8, 29}, {8, 38}, {29, 22}, {22, 31}, {38, 22}, {29, 31}, {29, 38}, {38, 31}], [{0, 29, 22}, {0, 38, 22}, {0, 29, 38}, {38, 29, 22}], [{0, 22, 38, 29}]]


In [53]:
# 2: based on these frequent itemsets, find all rules X=>{0} with confidence >= 0.75
Xs = []
confidences = []
elevations = []
target = 0
confidence_target = sum([target in row for row in D]) / len(D)

for F in F_all:
    for f in F:
        X = f - {target}
        if len(X) == 0:
            continue
        if X in Xs:
            continue
        count_X = sum([X.issubset(row) for row in D])
        count_X_target = sum([X.union({target}).issubset(row) for row in D])
        confidence = count_X_target / count_X
        elevation = confidence / confidence_target
        if confidence >= 0.75:
            Xs.append(X)
            confidences.append(confidence)
            elevations.append(elevation)

# print(Xs)
# print([[mappings[i] for i in X] for X in Xs])
# print(confidences)
# print(elevations)

for i in range(len(Xs)):
    print(f"{[mappings[i] for i in Xs[i]]} => {mappings[target]}: confidence={confidences[i]}, elevation={elevations[i]}")

['inv-nodes=0-2'] => Class=no-recurrence-events: confidence=0.7942583732057417, elevation=1.122497802948931
['node-caps=no'] => Class=no-recurrence-events: confidence=0.7737556561085973, elevation=1.0935220241942931
['irradiat=no'] => Class=no-recurrence-events: confidence=0.7627906976744186, elevation=1.0780256288561938
['age=30-39', 'node-caps=no'] => Class=no-recurrence-events: confidence=0.7711864406779662, elevation=1.0898910411622278
['age=30-39', 'irradiat=no'] => Class=no-recurrence-events: confidence=0.7747747747747747, elevation=1.0949623092480236
['node-caps=no', 'inv-nodes=0-2'] => Class=no-recurrence-events: confidence=0.8, elevation=1.1306122448979594
['inv-nodes=0-2', 'breast=left'] => Class=no-recurrence-events: confidence=0.8214285714285714, elevation=1.160896501457726
['irradiat=no', 'inv-nodes=0-2'] => Class=no-recurrence-events: confidence=0.8166666666666667, elevation=1.1541666666666668
['node-caps=no', 'breast=left'] => Class=no-recurrence-events: confidence=0.793