In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn import cluster
from sklearn.preprocessing import LabelBinarizer
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_selection import mutual_info_classif, f_classif, SelectKBest
sns.set(style='darkgrid')
pd.options.display.max_columns = None

In [2]:
# Both files are parsed with identical options: the first 20 lines are skipped
# (presumably descriptive text ahead of the column header -- confirm against
# the raw files) and the literal string 'na' is the only missing-value marker.
train, test = (
    pd.read_csv(path, skiprows=20, keep_default_na=False, na_values='na')
    for path in ('./aps_failure_training_set.csv', './aps_failure_test_set.csv')
)

# Stack both splits and keep only the rows without any missing value.
aps = pd.concat([train, test]).dropna()

In [3]:
# Class balance of the rows that are left in `aps` after dropna().
aps['class'].value_counts()

neg    650
pos    106
Name: class, dtype: int64

In [4]:
def getKBest(X, y, score_func=f_classif, k=10):
    """Return the columns of ``X`` chosen by a fitted ``SelectKBest``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; the selected columns are taken positionally with
        ``.iloc``.
    y : array-like
        Target handed to the selector's ``fit``.
    score_func : callable, default ``f_classif``
        Scoring function forwarded to ``SelectKBest``.
    k : int, default 10
        Number of features forwarded to ``SelectKBest``.

    Returns
    -------
    pandas.DataFrame
        ``X`` restricted to the selected columns.
    """
    selector = SelectKBest(score_func=score_func, k=k)
    selector.fit(X, y)

    # Integer positions of the columns the selector kept.
    kept_positions = selector.get_support(indices=True)
    return X.iloc[:, kept_positions]

In [5]:
# Separate the target, then keep only the features picked by getKBest
# (called with its default scoring function and k).
y = aps['class']
X = getKBest(aps.drop(columns=['class']), y)

X.head()

Unnamed: 0,al_000,am_0,aq_000,bb_000,bj_000,bu_000,bv_000,cj_000,cq_000,dn_000
16,38582.0,60878.0,530234.0,12111862.0,1671484.0,12111862.0,12111862.0,3987438.72,12111862.0,82844.0
179,34126.0,50964.0,695194.0,11080752.0,1029202.0,11080752.0,11080752.0,2717504.64,11080752.0,37806.0
225,231038.0,322998.0,630724.0,10089184.0,524078.0,10089184.0,10089184.0,18.24,10089184.0,50700.0
394,3166680.0,6330452.0,4032312.0,27646704.0,8125930.0,27646704.0,27646704.0,5166424.32,27646704.0,298434.0
413,728878.0,2036492.0,7723186.0,33186736.0,14710494.0,33186736.0,33186736.0,18649909.44,33186736.0,481666.0


In [6]:
def discretize_eqwidth(df, bins=3):
    """One-hot encode every column of ``df`` after equal-width binning.

    Each column is binned with ``pd.cut(column, bins)`` and replaced by one
    0/1 indicator column per interval that actually contains data.
    Indicator columns are named ``'<column>:[<left>, <right>]'`` and are
    emitted in ascending interval order, grouped by source column in the
    order the columns appear in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns to discretize. The frame is not modified.
    bins : int, default 3
        Passed straight to ``pd.cut``.

    Returns
    -------
    pandas.DataFrame
        Integer 0/1 indicators with a fresh ``0..n-1`` index. Rows that
        ``pd.cut`` could not assign to any interval (missing values) are 0 in
        every indicator of that column. If ``df`` has no columns it is
        returned unchanged.

    Notes
    -----
    The square brackets in the column names are only a naming convention:
    the intervals produced by ``pd.cut`` here are open on the left and
    closed on the right.
    """
    indicators = {}
    for col in list(df):
        binned = pd.cut(df[col], bins)
        # Work from the integer category codes instead of string labels:
        # codes follow interval order for any number of bins (string labels
        # would sort '10' before '2'), and a column with only one or two
        # populated intervals still gets each indicator paired with its own
        # interval name.
        codes = binned.cat.codes.values
        categories = binned.cat.categories

        # Code -1 marks values that fell in no interval; empty intervals
        # never show up in `codes`, so they get no column.
        for code in np.unique(codes[codes >= 0]):
            interval = categories[int(code)]
            name = '{}:[{}, {}]'.format(col, interval.left, interval.right)
            indicators[name] = (codes == code).astype(int)

    if not indicators:
        return df

    # Build the result once instead of concatenating inside the loop.
    return pd.DataFrame(indicators)

def discretize_frequency(df, bins=3):
    """One-hot encode every column of ``df`` after equal-frequency binning.

    Each column is binned with ``pd.qcut(column, bins, duplicates='drop')``
    and replaced by one 0/1 indicator column per interval that actually
    contains data. Because duplicate quantile edges are dropped, a column
    may end up with fewer than ``bins`` intervals. Indicator columns are
    named ``'<column>:[<left>, <right>]'`` and are emitted in ascending
    interval order, grouped by source column in the order the columns
    appear in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns to discretize. The frame is not modified.
    bins : int, default 3
        Number of quantiles passed to ``pd.qcut``.

    Returns
    -------
    pandas.DataFrame
        Integer 0/1 indicators with a fresh ``0..n-1`` index. Rows that
        ``pd.qcut`` could not assign to any interval (missing values) are 0
        in every indicator of that column. If ``df`` has no columns it is
        returned unchanged.
    """
    indicators = {}
    for col in list(df):
        binned = pd.qcut(df[col], bins, duplicates='drop')
        # Work from the integer category codes instead of string labels:
        # codes follow interval order for any number of bins, and when the
        # dropped duplicate edges leave only one or two intervals each
        # indicator is still paired with its own interval name.
        codes = binned.cat.codes.values
        categories = binned.cat.categories

        # Code -1 marks values that fell in no interval.
        for code in np.unique(codes[codes >= 0]):
            interval = categories[int(code)]
            name = '{}:[{}, {}]'.format(col, interval.left, interval.right)
            indicators[name] = (codes == code).astype(int)

    if not indicators:
        return df

    # Build the result once instead of concatenating inside the loop.
    return pd.DataFrame(indicators)
    

In [7]:
def getRules(X, sup, confidence=0.9, top=15):
    """Mine association rules from ``X`` and summarise their lift.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame handed to ``apriori`` (the callers in this notebook pass the
        0/1 indicator frames built by the discretize functions).
    sup : float
        Minimum support for ``apriori``.
    confidence : float, default 0.9
        Minimum confidence for ``association_rules``.
    top : int, default 15
        Number of highest-lift rules averaged into the last return value.

    Returns
    -------
    tuple
        ``(rules, num_rules, mean_lift, mean_lift_top)`` where ``rules`` is
        the frame returned by ``association_rules``. When no itemset reaches
        ``sup`` the placeholder ``(0, 0, 0, 0)`` is returned instead.
    """
    frequent_itemsets = apriori(X, min_support=sup, use_colnames=True)
    if frequent_itemsets.shape[0] == 0:
        # Plain zeros (not an empty frame) are kept so existing callers that
        # unpack and record these values keep working.
        return 0, 0, 0, 0

    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=confidence)
    num_rules = rules.shape[0]
    lifts = rules['lift']
    mean_lift = lifts.mean()

    # Whether a separate top-N mean is needed depends on how many *rules*
    # exist, not on how many frequent itemsets produced them.
    if num_rules > top:
        mean_lift_top = lifts.sort_values(ascending=False).iloc[:top].mean()
    else:
        mean_lift_top = mean_lift

    return rules, num_rules, mean_lift, mean_lift_top
    

In [8]:
def getTransformations(df, width_bins=[5,7], frequency=[3]):
    """Build the discretized variants of ``df`` used for rule mining.

    The ``'class'`` column is split off as the target, the remaining
    columns are reduced with ``getKBest`` (default arguments), and the
    selected features are discretized once per requested bin count.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a ``'class'`` column.
    width_bins : iterable of int
        Bin counts for ``discretize_eqwidth``. Only iterated, never mutated.
    frequency : iterable of int
        Bin counts for ``discretize_frequency``. Only iterated, never mutated.

    Returns
    -------
    list of (pandas.DataFrame, str)
        Equal-width variants first, then equal-frequency variants, each
        paired with a label such as ``'Equal Width 5'``.
    """
    selected = getKBest(df.drop(columns=['class']), df['class'])

    # Each discretizer receives its own deep copy of the selected features.
    by_width = [
        (discretize_eqwidth(selected.copy(deep=True), bins=n), 'Equal Width {}'.format(n))
        for n in width_bins
    ]
    by_frequency = [
        (discretize_frequency(selected.copy(deep=True), bins=n), 'Equal Frequency {}'.format(n))
        for n in frequency
    ]
    return by_width + by_frequency

In [9]:
# Discretized variants of `aps`, built with getTransformations' default bin counts.
aps_datasets = getTransformations(aps)

In [10]:
# Displays the whole list of (frame, label) tuples.
# NOTE(review): this prints every frame in full; consider showing only the
# labels and frame shapes to keep the cell output small.
aps_datasets

[(     al_000:[-11326.014, 2298166.8]  al_000:[2298166.8, 4596169.6]  \
  0                                 1                              0   
  1                                 1                              0   
  2                                 1                              0   
  3                                 0                              1   
  4                                 1                              0   
  5                                 1                              0   
  6                                 1                              0   
  7                                 1                              0   
  8                                 1                              0   
  9                                 1                              0   
  10                                0                              1   
  11                                1                              0   
  12                                1                           

In [16]:
def evaluateSupport(datasets, filename='aps'):
    """Sweep the minimum support and record/plot rule statistics per dataset.

    For every ``(frame, label)`` pair in ``datasets`` and every support in
    ``np.arange(0.02, 0.3, 0.025)``, ``getRules`` is called with ``top=20``
    and its rule count, mean lift and top-20 mean lift are recorded.

    Side effects
    ------------
    * Writes all measurements to
      ``plot_data/assoc_rules/<filename>_arules.csv``.
    * Writes one PDF per measure to
      ``images/assoc_rules/<filename>_arules_<measure>.pdf``
      (``nrules``, ``mean_lift``, ``top_mean_lift``).
    * Both output directories are created if they do not exist yet.

    Parameters
    ----------
    datasets : iterable of (pandas.DataFrame, str)
        Frames to mine together with the label used to colour the plots.
    filename : str, default ``'aps'``
        Prefix used in the output file names.
    """
    import os  # local import: only needed here to create the output folders

    csv_dir = 'plot_data/assoc_rules'
    image_dir = 'images/assoc_rules'
    for out_dir in (csv_dir, image_dir):
        os.makedirs(out_dir, exist_ok=True)

    # The support grid is the same for every dataset, so build it once.
    supports = np.arange(0.02, 0.3, 0.025)

    records = []
    for df, name in datasets:
        for sup in supports:
            _, num_rules, mean_lift, mean_lift_top = getRules(df, sup, top=20)
            records.append({'Discretization': name, 'Number of Rules': num_rules, 'Mean Lift': mean_lift, 'Top 20 Mean Lift': mean_lift_top, 'Support': sup})

    measures = pd.DataFrame(records)
    measures.to_csv('plot_data/assoc_rules/{}_arules.csv'.format(filename), index=False)

    plots = [('Number of Rules', 'nrules'), ('Mean Lift', 'mean_lift'), ('Top 20 Mean Lift', 'top_mean_lift')]
    for position, (measure, fplotname) in enumerate(plots):
        # `height` is the current name of the facet-size argument (was `size`).
        g = sns.FacetGrid(measures, hue='Discretization', height=4)
        g = g.map(plt.scatter, 'Support', measure)
        if position == 0:
            # Only the first figure carries the legend; it is added before
            # the lines are drawn, so it lists the scatter markers.
            g = g.add_legend()
        g = g.map(plt.plot, 'Support', measure)

        g.axes[0,0].set_ylim(bottom=0)

        fig = g.axes[0,0].figure
        fig.savefig('images/assoc_rules/{}_arules_{}.pdf'.format(filename, fplotname))
        # Close instead of clearing so figures do not pile up in memory or
        # show up as empty outputs under the cell.
        plt.close(fig)

In [17]:
# Runs the support sweep on the discretized APS variants; results are written
# to the CSV and PDF files named inside evaluateSupport (default prefix 'aps').
evaluateSupport(aps_datasets)

<Figure size 216x144 with 0 Axes>

<Figure size 430.6x288 with 0 Axes>

<Figure size 288x288 with 0 Axes>

<Figure size 288x288 with 0 Axes>