In [1]:
import pandas as pd

In [2]:
# Load the semicolon-separated project data set.
df = pd.read_csv('project_data.csv', sep=';')

# Columns treated as continuous in this notebook; they are removed so that
# only the remaining columns (and the target Y) are kept for itemset mining.
continuous_vars = ['X02', 'X05', 'X08', 'X11', 'X16', 'X18', 'X13']
df = df.drop(columns=continuous_vars)
df.head()

Unnamed: 0,X01,X03,X04,X06,X07,X09,X10,X12,X14,X15,X17,X19,X20,Y
0,A11,A34,A43,A65,A75,A93,A101,A121,A143,A152,A173,A192,A201,1
1,A12,A32,A43,A61,A73,A92,A101,A121,A143,A152,A173,A191,A201,2
2,A14,A34,A46,A61,A74,A93,A101,A121,A143,A152,A172,A191,A201,1
3,A11,A32,A42,A61,A74,A93,A103,A122,A143,A153,A173,A191,A201,1
4,A11,A33,A40,A61,A73,A93,A101,A124,A143,A153,A173,A191,A201,2


In [3]:
# Recode the target in one step: 1 -> 0 and 2 -> 1.
# Bad customers (originally coded 2) end up as 1 so that the item 'Y' can
# show up in a rule -- rule mining only looks at the "true" (1) values.
df['Y'] = df['Y'].replace({1: 0, 2: 1})

# One-hot encode the remaining string-valued columns.
df = pd.get_dummies(df)
df.shape

(1000, 55)

In [4]:
from mlxtend.frequent_patterns import apriori,association_rules

# Mine the itemsets that occur in at least 60% of the rows and list them
# from most to least frequent.
frequent = apriori(df, min_support=0.6, use_colnames=True)
frequent.sort_values(by="support", ascending=False)

Unnamed: 0,support,itemsets
5,0.963,(X20_A201)
1,0.907,(X10_A101)
8,0.88,"(X20_A201, X10_A101)"
2,0.814,(X14_A143)
9,0.782,"(X20_A201, X14_A143)"
6,0.742,"(X14_A143, X10_A101)"
12,0.718,"(X20_A201, X14_A143, X10_A101)"
3,0.713,(X15_A152)
10,0.685,"(X20_A201, X15_A152)"
7,0.647,"(X15_A152, X10_A101)"


In [5]:
# Number of frequent itemsets found at the 60% support threshold.
frequent.shape[0]

14

In [15]:
# Lower the support threshold to 10% and record how many items each
# itemset contains, so itemsets can later be filtered by size.
frequent = apriori(df, min_support=0.1, use_colnames=True)
frequent['length'] = frequent['itemsets'].apply(len)
len(frequent)

2177

In [16]:
# Keep the itemsets that contain at least 2 items and have a support of
# at least 20%.
frequent = frequent.loc[(frequent['support'] >= 0.2) & (frequent['length'] >= 2)]
frequent

Unnamed: 0,support,itemsets,length
38,0.217,"(Y, X06_A61)",2
42,0.272,"(Y, X10_A101)",2
44,0.224,"(Y, X14_A143)",2
49,0.296,"(Y, X20_A201)",2
51,0.219,"(X01_A11, X06_A61)",2
...,...,...,...
2071,0.207,"(X10_A101, X20_A201, X19_A192, X14_A143, X15_A...",5
2073,0.290,"(X10_A101, X19_A191, X20_A201, X17_A173, X14_A...",5
2076,0.242,"(X10_A101, X19_A191, X20_A201, X17_A173, X15_A...",5
2085,0.224,"(X19_A191, X20_A201, X17_A173, X14_A143, X15_A...",5


In [17]:
# Narrow down to the itemsets of exactly 2 items with a support of at least 20%.
frequent = frequent.loc[(frequent['length'] == 2) & (frequent['support'] >= 0.2)]

# `frequent` is already filtered, so its length is the number of such pairs;
# re-applying the same mask a second time is unnecessary.
len(frequent)

97

### Association rules mining

#### Now we extract association rules
#### To help in the classification task, we may filter rules which have the Y target as a consequent

In [9]:
# Re-mine the frequent itemsets at 10% support; this replaces the filtered
# `frequent` frame so that rule extraction sees itemsets of every size.
frequent = apriori(df, min_support=0.1, use_colnames=True)
frequent.sort_values(by='support', ascending=False).head()

Unnamed: 0,support,itemsets
34,0.963,(X20_A201)
19,0.907,(X10_A101)
232,0.88,"(X20_A201, X10_A101)"
25,0.814,(X14_A143)
260,0.782,"(X20_A201, X14_A143)"


In [10]:
# Derive the association rules whose lift is at least 1.5, ordered by
# confidence and then by lift (both descending).
rules = (
    association_rules(frequent, metric="lift", min_threshold=1.5)
    .sort_values(by=['confidence', 'lift'], ascending=[False, False])
)

rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
84,"(X10_A101, X15_A153)","(X20_A201, X12_A124)",0.103,0.152,0.100,0.970874,6.387328,0.084344,29.114667
22,"(X10_A101, X15_A153)",(X12_A124),0.103,0.154,0.100,0.970874,6.304375,0.084138,29.046000
80,"(X20_A201, X10_A101, X15_A153)",(X12_A124),0.103,0.154,0.100,0.970874,6.304375,0.084138,29.046000
33,(X15_A153),"(X20_A201, X12_A124)",0.108,0.152,0.104,0.962963,6.335283,0.087584,22.896000
3,(X15_A153),(X12_A124),0.108,0.154,0.104,0.962963,6.253006,0.087368,22.842000
...,...,...,...,...,...,...,...,...,...
175,"(X17_A173, X15_A152, X10_A101)","(X20_A201, X12_A123, X09_A93)",0.412,0.177,0.114,0.276699,1.563271,0.041076,1.137839
146,"(X20_A201, X17_A173, X15_A152)","(X12_A123, X09_A93)",0.438,0.177,0.117,0.267123,1.509171,0.039474,1.122972
171,(X01_A14),"(X10_A101, X20_A201, X03_A34, X14_A143, X15_A152)",0.394,0.174,0.104,0.263959,1.517008,0.035444,1.122221
177,"(X20_A201, X17_A173, X15_A152)","(X12_A123, X10_A101, X09_A93)",0.438,0.170,0.114,0.260274,1.531023,0.039540,1.122037


In [11]:
# Retain only the rules with a confidence of at least 90%, keeping them
# ordered by confidence and then lift (both descending).
rules = (
    rules.loc[rules['confidence'] >= 0.9]
    .sort_values(by=['confidence', 'lift'], ascending=[False, False])
)

In [12]:
# Preview the five strongest high-confidence rules.
rules.head(n=5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
84,"(X10_A101, X15_A153)","(X20_A201, X12_A124)",0.103,0.152,0.1,0.970874,6.387328,0.084344,29.114667
22,"(X10_A101, X15_A153)",(X12_A124),0.103,0.154,0.1,0.970874,6.304375,0.084138,29.046
80,"(X20_A201, X10_A101, X15_A153)",(X12_A124),0.103,0.154,0.1,0.970874,6.304375,0.084138,29.046
33,(X15_A153),"(X20_A201, X12_A124)",0.108,0.152,0.104,0.962963,6.335283,0.087584,22.896
3,(X15_A153),(X12_A124),0.108,0.154,0.104,0.962963,6.253006,0.087368,22.842


In [13]:
# Positional indices (usable with `rules.iloc`) of the rules whose consequent
# contains the target item 'Y'.
# Each entry of rules['consequents'] is a frozenset of item names, so test
# membership on the set itself: substring-matching its string representation
# would also match any other item whose name merely contains the letter "Y".
ind = [
    pos
    for pos, consequent in enumerate(rules['consequents'])
    if 'Y' in consequent
]

In [14]:
# Show all rules selected by `ind` (positional indices into `rules`) as one
# table. A single rich-displayed frame replaces printing one-row frames in a
# loop, which repeated the column header for every rule; an empty `ind` now
# shows an empty table instead of producing no output at all.
rules.iloc[ind]