In [142]:
import sys
!{sys.executable} -m pip install mlxtend



In [143]:
import math
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [144]:
# Load the mammographic masses data set; the first line of the file holds the column names
data = pd.read_csv("mammographic_masses.csv", sep=",", header=0)

# Question 1

Report 3 rules with support at least 0.2 and confidence at least 0.9. Specify for each of them the support and the confidence.

In [145]:
# Preprocessing, step 1: turn the DataFrame into a list of rows (one list of raw values per instance).
# The one-hot encoding needed by mlxtend's apriori is produced from these rows in the cells below.
d = data.to_numpy().tolist()

In [146]:
# Adding attributes: turn every cell value into an "Attribute=value" item.
# The transactions are rebuilt from `data` instead of being rewritten inside `d`,
# so re-running this cell cannot prefix already-prefixed items a second time
# (e.g. "BI-RADS=BI-RADS=5").
d = [
    [column + "=" + str(value) for column, value in zip(data.columns, row)]
    for row in data.values.tolist()
]

In [147]:
# One-hot encode the transactions: fit() learns the set of distinct items in `d`,
# transform() then yields one boolean row per transaction (one column per item).
te = TransactionEncoder()
te_ary = te.fit(d).transform(d)

In [148]:
# Wrap the encoded array in a DataFrame whose columns are the item names learned by the encoder
df = pd.DataFrame(data=te_ary, columns=te.columns_)

In [149]:
# Frequent itemsets with support of at least 0.2; use_colnames=True reports item names instead of column indices
frequent_itemsets = apriori(df, use_colnames=True, min_support=0.2)

In [150]:
# Derive association rules from the frequent itemsets, keeping those with confidence of at least 0.9
ar = association_rules(frequent_itemsets, min_threshold=0.9, metric="confidence")
# Show each rule together with its support and confidence only
ar.loc[:, ["antecedents", "consequents", "support", "confidence"]]

Unnamed: 0,antecedents,consequents,support,confidence
0,(Shape=4),(Density=3),0.37461,0.9
1,"(Density=3, Margin=1)",(BI-RADS=4),0.263267,0.900356
2,"(Severity=0, Margin=1)",(BI-RADS=4),0.299688,0.911392
3,"(Margin=1, BI-RADS=4)",(Severity=0),0.299688,0.911392
4,"(BI-RADS=5, Shape=4)",(Density=3),0.245578,0.904215
5,"(BI-RADS=5, Shape=4)",(Severity=1),0.246618,0.908046
6,"(Severity=1, Shape=4)",(Density=3),0.295525,0.901587
7,"(Severity=0, Density=3, Margin=1)",(BI-RADS=4),0.238293,0.927126
8,"(Density=3, Margin=1, BI-RADS=4)",(Severity=0),0.238293,0.905138
9,"(BI-RADS=5, Severity=1, Shape=4)",(Density=3),0.224766,0.911392


3 rules with support at least 0.2 and confidence at least 0.9:<br>
1. If Shape = 4, Density = 3 (Support = 0.374610, Confidence = 0.900000)<br>
2. If Density = 3 and Margin = 1, BI-RADS = 4 (Support = 0.263267, Confidence = 0.900356)<br>
3. If BI-RADS = 4 and Margin = 1, Severity = 0 (Support = 0.299688, Confidence = 0.911392)<br>

# Question 2

This  task  consists  of  determining  some  attributes  and  their  values  that help  us  to  find  out  whether  a  given  instance  is  benign  (severity = 0)  or malign (severity = 1).  We are looking for rules of the kind:<br>
<center>A1=a1, ... , Ak=ak → Severity=’0’<br>
or<br>
A1=a1, ... , Ak=ak → Severity=’1’<br>
where Ai=ai denotes an attribute and its value.<br></center>
For example the following rule:<br>
<center>Shape=’4’, Margin=’1’ → Severity=’0’<br></center>
tells us that instances with the specified shape and margin are usually benign. Remember that only rules with support at least 0.1 (i.e. their frequency is at least 10%) are relevant for us. Rules with lower support are usually not informative, as there is not much evidence for whether they are true or not. In our exercise we consider relevant any rule with confidence at least 0.9 (i.e. it is true 90% of the time). Report one or two rules with the specified requirements that you think might help us predict whether a given instance is benign or malign. You should not report rules with the attribute BI-RADS for this question. Which insights did you get from those rules? (e.g. the margin of the lesion can help us determine whether a lesion is benign or malign).

In [151]:
# Question 2 must not use the BI-RADS attribute, so work on a copy of the data without that column
data_drop = data.drop(columns=['BI-RADS'])
data_drop.head()

Unnamed: 0,Age,Shape,Margin,Density,Severity
0,67,3,5,3,1
1,43,1,1,?,1
2,58,4,5,3,1
3,28,1,1,3,0
4,74,1,5,?,1


In [152]:
# Preprocessing, step 1: turn the BI-RADS-free DataFrame into a list of rows (one list of raw values per instance).
# The one-hot encoding needed by mlxtend's apriori is produced from these rows in the cells below.
d = data_drop.to_numpy().tolist()

In [153]:
# Adding attributes: turn every cell value into an "Attribute=value" item.
# The transactions are rebuilt from `data_drop` instead of being rewritten inside `d`,
# so re-running this cell cannot prefix already-prefixed items a second time
# (e.g. "Shape=Shape=4").
d = [
    [column + "=" + str(value) for column, value in zip(data_drop.columns, row)]
    for row in data_drop.values.tolist()
]

In [154]:
# One-hot encode the transactions: fit() learns the set of distinct items in `d`,
# transform() then yields one boolean row per transaction (one column per item).
te = TransactionEncoder()
te_ary = te.fit(d).transform(d)

In [155]:
# Wrap the encoded array in a DataFrame whose columns are the item names learned by the encoder
df = pd.DataFrame(data=te_ary, columns=te.columns_)

In [156]:
# Frequent itemsets with support of at least 0.1 (the relevance threshold stated in Question 2)
frequent_itemsets = apriori(df, use_colnames=True, min_support=0.1)

In [157]:
# Derive association rules from the frequent itemsets, keeping those with confidence of at least 0.9
ar = association_rules(frequent_itemsets, min_threshold=0.9, metric="confidence")

In [158]:
# Keep only the rules that predict the class, i.e. whose consequent is exactly Severity=0 or exactly Severity=1
filtered_df = ar.loc[
    (ar["consequents"] == frozenset({'Severity=0'}))
    | (ar["consequents"] == frozenset({'Severity=1'}))
]
filtered_df.loc[:, ["antecedents", "consequents", "support", "confidence"]]

Unnamed: 0,antecedents,consequents,support,confidence
5,"(Shape=2, Margin=1)",(Severity=0),0.136316,0.903448
7,"(Density=3, Shape=1, Margin=1)",(Severity=0),0.1436,0.901961


When Shape = 2 and Margin = 1, the given instance is usually benign (Severity = 0), with support = 0.136316 and confidence = 0.903448.<br>
When Density = 3, Margin = 1 and Shape = 1, the given instance is usually benign (Severity = 0), with support = 0.143600 and confidence = 0.901961.<br>

# Question 3

As discussed above, the BI-RADS assessment is not always accurate and it might lead to unnecessary breast biopsy. Provide one or two rules that might give some evidence that the BI-RADS assessment is not always accurate. Explain your answer.

In [159]:
# Preprocessing, step 1: turn the DataFrame into a list of rows (one list of raw values per instance).
# The one-hot encoding needed by mlxtend's apriori is produced from these rows in the cells below.
d = data.to_numpy().tolist()

In [160]:
# Adding attributes: turn every cell value into an "Attribute=value" item.
# The transactions are rebuilt from `data` instead of being rewritten inside `d`,
# so re-running this cell cannot prefix already-prefixed items a second time
# (e.g. "BI-RADS=BI-RADS=5").
d = [
    [column + "=" + str(value) for column, value in zip(data.columns, row)]
    for row in data.values.tolist()
]

In [161]:
# One-hot encode the transactions: fit() learns the set of distinct items in `d`,
# transform() then yields one boolean row per transaction (one column per item).
te = TransactionEncoder()
te_ary = te.fit(d).transform(d)

In [162]:
# Wrap the encoded array in a DataFrame whose columns are the item names learned by the encoder
df = pd.DataFrame(data=te_ary, columns=te.columns_)

In [163]:
# Frequent itemsets with support of at least 0.2; use_colnames=True reports item names instead of column indices
frequent_itemsets = apriori(df, use_colnames=True, min_support=0.2)

In [164]:
# Derive association rules from the frequent itemsets, keeping those with confidence of at least 0.9
ar = association_rules(frequent_itemsets, min_threshold=0.9, metric="confidence")

In [165]:
# Keep only the rules that predict the class, i.e. whose consequent is exactly Severity=0 or exactly Severity=1
filtered_df = ar.loc[
    (ar["consequents"] == frozenset({'Severity=0'}))
    | (ar["consequents"] == frozenset({'Severity=1'}))
]
filtered_df.loc[:, ["antecedents", "consequents", "support", "confidence"]]

Unnamed: 0,antecedents,consequents,support,confidence
3,"(Margin=1, BI-RADS=4)",(Severity=0),0.299688,0.911392
5,"(BI-RADS=5, Shape=4)",(Severity=1),0.246618,0.908046
8,"(Density=3, Margin=1, BI-RADS=4)",(Severity=0),0.238293,0.905138
10,"(BI-RADS=5, Shape=4, Density=3)",(Severity=1),0.224766,0.915254


Rules that might give some evidence that the BI-RADS assessment is not always accurate:<br>
1. When Margin = 1 and BI-RADS = 4, the given instance is usually benign (Severity = 0), with support = 0.299688 and confidence = 0.911392.<br>
2. When Density = 3, Margin = 1 and BI-RADS = 4, the given instance is usually benign (Severity = 0), with support = 0.238293 and confidence = 0.905138.<br>
<p>In both rules, a BI-RADS assessment of 4 suggests that the given instance has a high probability of being malignant. However, in more than 90% of these instances (the confidence of each rule) the outcome is benign. This gives some evidence that the BI-RADS assessment is not always accurate.</p>

# Question 4

Write a script in Python to find the confidence and support of the following rule:  Age=35 -> Severity=0.  Report its support and confidence.  Do you think this rule tells us something valuable or that we should ignore it as there is not enough evidence to support this rule?

In [166]:
# Preprocessing, step 1: turn the DataFrame into a list of rows (one list of raw values per instance).
# The one-hot encoding needed by mlxtend's apriori is produced from these rows in the cells below.
d = data.to_numpy().tolist()

In [167]:
# Adding attributes: turn every cell value into an "Attribute=value" item.
# The transactions are rebuilt from `data` instead of being rewritten inside `d`,
# so re-running this cell cannot prefix already-prefixed items a second time
# (e.g. "Age=Age=35").
d = [
    [column + "=" + str(value) for column, value in zip(data.columns, row)]
    for row in data.values.tolist()
]

In [168]:
# One-hot encode the transactions: fit() learns the set of distinct items in `d`,
# transform() then yields one boolean row per transaction (one column per item).
te = TransactionEncoder()
te_ary = te.fit(d).transform(d)

In [169]:
# Wrap the encoded array in a DataFrame whose columns are the item names learned by the encoder
df = pd.DataFrame(data=te_ary, columns=te.columns_)

In [170]:
# Frequent itemsets with a deliberately low support threshold (0.01) so that
# itemsets involving a single age value such as Age=35 are not pruned away
frequent_itemsets = apriori(df, use_colnames=True, min_support=0.01)

In [171]:
# Derive association rules with no confidence cut-off (threshold 0.0), so the rule
# of interest is reported whatever its confidence turns out to be
ar = association_rules(frequent_itemsets, min_threshold=0.0, metric="confidence")

In [172]:
# Pick out the single rule Age=35 -> Severity=0 (antecedent and consequent must match exactly)
filtered_df = ar.loc[
    (ar["antecedents"] == frozenset({'Age=35'}))
    & (ar["consequents"] == frozenset({'Severity=0'}))
]
filtered_df.loc[:, ["antecedents", "consequents", "support", "confidence"]]

Unnamed: 0,antecedents,consequents,support,confidence
3,(Age=35),(Severity=0),0.012487,0.923077


This rule: Age=35 -> Severity=0 has support = 0.012487 and confidence = 0.923077.<p>
This means that the rule is supported by only a small proportion of instances in the data set, but when the antecedent (Age=35) is present, the consequent (Severity=0) is present in the majority of cases.<p>

Whether or not this rule is valuable depends on the context and the goals of the analysis. If the goal is to identify the factors that are associated with outcome (Severity=0), then this rule could be considered valuable as it suggests that individuals who are 35 years old are more likely to have Severity=0. However, if the goal is to make accurate predictions about the severity of breast cancer based on age alone, then this rule may not be very useful as it is based on a small number of instances and may not be generalizable to other populations.

# Question 5

The attribute “Age” is ordinal which makes the rule mining approach not ideal. In particular, one would like to obtain rules of the kind<p>
<center>Age ≥ n, A1 = a1, ..., Ak = ak −> Severity=’1’</center><p>
(where n is an integer), as the age is an important factor in determining whether a given instance is malign or benign. However, this issue can be circumvented in our case by modifying the input file (the ’csv’ file) accordingly. Be careful on how you handle the missing values (i.e. those with a ’?’). Provide at least one rule of that kind with support at least 0.1 and confidence at least 0.9.

In [173]:
# Preprocessing, step 1: turn the DataFrame into a list of rows (one list of raw values per instance).
# The one-hot encoding needed by mlxtend's apriori is produced from these rows in the cells below.
d = data.values.tolist()
# Preview only the first rows: echoing the whole list floods the notebook output
d[:5]

[['5', '67', '3', '5', '3', 1],
 ['4', '43', '1', '1', '?', 1],
 ['5', '58', '4', '5', '3', 1],
 ['4', '28', '1', '1', '3', 0],
 ['5', '74', '1', '5', '?', 1],
 ['4', '65', '1', '?', '3', 0],
 ['4', '70', '?', '?', '3', 0],
 ['5', '42', '1', '?', '3', 0],
 ['5', '57', '1', '5', '3', 1],
 ['5', '60', '?', '5', '1', 1],
 ['5', '76', '1', '4', '3', 1],
 ['3', '42', '2', '1', '3', 1],
 ['4', '64', '1', '?', '3', 0],
 ['4', '36', '3', '1', '2', 0],
 ['4', '60', '2', '1', '2', 0],
 ['4', '54', '1', '1', '3', 0],
 ['3', '52', '3', '4', '3', 0],
 ['4', '59', '2', '1', '3', 1],
 ['4', '54', '1', '1', '3', 1],
 ['4', '40', '1', '?', '?', 0],
 ['?', '66', '?', '?', '1', 1],
 ['5', '56', '4', '3', '1', 1],
 ['4', '43', '1', '?', '?', 0],
 ['5', '42', '4', '4', '3', 1],
 ['4', '59', '2', '4', '3', 1],
 ['5', '75', '4', '5', '3', 1],
 ['2', '66', '1', '1', '?', 0],
 ['5', '63', '3', '?', '3', 0],
 ['5', '45', '4', '5', '3', 1],
 ['5', '55', '4', '4', '3', 0],
 ['4', '46', '1', '5', '2', 0],
 ['5', '

In [174]:
# Age threshold n used to bin the Age attribute into the two items "Age>=n" and "Age<n";
# change this value and re-run the cells below to explore other thresholds
age = 40

In [175]:
# Adding attributes: turn every cell value into an item.
#   - a known Age becomes "Age>=<age>" or "Age<<age>" (binned at the threshold `age`)
#   - a missing Age ('?') and every other attribute become "Attribute=value"
# The transactions are rebuilt from `data` instead of being rewritten inside `d`,
# so this cell can be re-run (e.g. after changing `age`) without int() failing on
# items that were already rewritten to "Age>=..." / "Age<...".
d = []
for row in data.values.tolist():
    items = []
    for column, value in zip(data.columns, row):
        if column == 'Age' and str(value) != '?':
            comparison = ">=" if int(value) >= age else "<"
            items.append(column + comparison + str(age))
        else:
            items.append(column + "=" + str(value))
    d.append(items)

In [176]:
# One-hot encode the transactions: fit() learns the set of distinct items in `d`,
# transform() then yields one boolean row per transaction (one column per item).
te = TransactionEncoder()
te_ary = te.fit(d).transform(d)

In [177]:
# Wrap the encoded array in a DataFrame whose columns are the item names learned by the encoder,
# then display it to check that the Age column was binned as intended
df = pd.DataFrame(data=te_ary, columns=te.columns_)
df

Unnamed: 0,Age<40,Age=?,Age>=40,BI-RADS=0,BI-RADS=2,BI-RADS=3,BI-RADS=4,BI-RADS=5,BI-RADS=6,BI-RADS=?,...,Margin=4,Margin=5,Margin=?,Severity=0,Severity=1,Shape=1,Shape=2,Shape=3,Shape=4,Shape=?
0,False,False,True,False,False,False,False,True,False,False,...,False,True,False,False,True,False,False,True,False,False
1,False,False,True,False,False,False,True,False,False,False,...,False,False,False,False,True,True,False,False,False,False
2,False,False,True,False,False,False,False,True,False,False,...,False,True,False,False,True,False,False,False,True,False
3,True,False,False,False,False,False,True,False,False,False,...,False,False,False,True,False,True,False,False,False,False
4,False,False,True,False,False,False,False,True,False,False,...,False,True,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
956,False,False,True,False,False,False,True,False,False,False,...,False,False,False,True,False,False,True,False,False,False
957,False,False,True,False,False,False,True,False,False,False,...,False,True,False,False,True,False,False,False,True,False
958,False,False,True,False,False,False,True,False,False,False,...,False,True,False,True,False,False,False,False,True,False
959,False,False,True,False,False,False,False,True,False,False,...,False,True,False,False,True,False,False,False,True,False


In [178]:
# Frequent itemsets with support of at least 0.1, as required by Question 5
frequent_itemsets = apriori(df, use_colnames=True, min_support=0.1)

In [179]:
# Derive association rules from the frequent itemsets, keeping those with confidence of at least 0.9
ar = association_rules(frequent_itemsets, min_threshold=0.9, metric="confidence")

In [180]:
# Keep only the rules whose consequent is exactly Severity=1 (malign)
filtered_df = ar.loc[ar["consequents"] == frozenset({'Severity=1'})]
filtered_df.loc[:, ["antecedents", "consequents", "support", "confidence"]]

Unnamed: 0,antecedents,consequents,support,confidence
34,"(BI-RADS=5, Shape=4)",(Severity=1),0.246618,0.908046
51,"(BI-RADS=5, Shape=4, Age>=40)",(Severity=1),0.236212,0.908
74,"(BI-RADS=5, Density=3, Margin=4)",(Severity=1),0.121748,0.906977
76,"(BI-RADS=5, Shape=4, Density=3)",(Severity=1),0.224766,0.915254
77,"(BI-RADS=5, Shape=4, Margin=4)",(Severity=1),0.109261,0.913043
84,"(BI-RADS=5, Density=3, Margin=4, Age>=40)",(Severity=1),0.115505,0.909836
86,"(BI-RADS=5, Density=3, Shape=4, Age>=40)",(Severity=1),0.218522,0.917031
89,"(BI-RADS=5, Shape=4, Margin=4, Age>=40)",(Severity=1),0.103018,0.916667


In this case, we set the age threshold to 40 (i.e. Age ≥ 40). There are several rules of the required kind with support at least 0.1 and confidence at least 0.9:<br>
1. When Age>=40, BI-RADS=5 and Shape=4, the given instance is usually malignant (Severity = 1), with support = 0.236212 and confidence = 0.908000.<br>
2. When Age>=40, BI-RADS=5, Density=3 and Margin=4, the given instance is usually malignant (Severity = 1), with support = 0.115505 and confidence = 0.909836.<br>
3. When Age>=40, BI-RADS=5, Density=3 and Shape=4, the given instance is usually malignant (Severity = 1), with support = 0.218522 and confidence = 0.917031.<br>
4. When Age>=40, BI-RADS=5, Shape=4 and Margin=4, the given instance is usually malignant (Severity = 1), with support = 0.103018 and confidence = 0.916667.

Repeat with a different age threshold to see other results.