Reference for the transaction-reduction technique applied in this notebook's Apriori implementation: http://www.ijsrp.org/research-paper-1301/ijsrp-p1397.pdf

In [1]:
import datetime as dt
import pandas as pd
import numpy as np
import random as rand
import csv
import time
import copy
from itertools import combinations


In [18]:
# Load the raw Adult census file into a DataFrame.
# NOTE(review): absolute, machine-specific path -- this cell only runs where that file exists.
file = r'E:\School\@Grad School\Data Mining\Project 1\adult.data'

# Column names are supplied through `names` rather than read from the file;
# skipinitialspace=True skips the space that follows each comma delimiter.
df = pd.read_csv(file, sep=',',
                 names=["Age", "Workclass", "fnlwgt","education", "education_num","marital status","occupation","relationship","Race","Sex","capital_gain","capital_loss","hours_per_week","native country","income"],
                 skipinitialspace=True)
# '?' placeholders are left untouched here; clean_data() converts them to NaN and drops those rows.
df.head()


Unnamed: 0,Age,Workclass,fnlwgt,education,education_num,marital status,occupation,relationship,Race,Sex,capital_gain,capital_loss,hours_per_week,native country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [19]:
def clean_data(df):
    """Return a cleaned, fully categorical copy of the raw census frame.

    Steps, in order:
      1. Treat every '?' cell as missing and drop any row containing one.
      2. Drop the 'fnlwgt' and 'education_num' columns.
      3. Replace the numeric 'Age', 'hours_per_week', 'capital_gain' and
         'capital_loss' columns with labelled bins (see the calls below).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains at least the six columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.
    """
    # DataFrame.replace stands in for the deprecated DataFrame.applymap, and the
    # non-inplace chain avoids writing into a frame derived from dropna().
    cleaned = (
        df.replace('?', np.nan)
          .dropna(axis=0)
          .drop(columns=['fnlwgt', 'education_num'])
    )

    # With right=True and include_lowest=True the bins are
    # [0, 26], (26, 46], (46, 66], (66, 100]; values outside them become NaN.
    cleaned['Age'] = pd.cut(cleaned['Age'], [0, 26, 46, 66, 100],
                            labels=['Young', 'Middle-aged', 'Senior', 'Old'],
                            right=True, include_lowest=True)
    cleaned['hours_per_week'] = pd.cut(cleaned['hours_per_week'], [0, 25, 40, 60, 100],
                                       labels=['Part-time', 'Full-time', 'Over-time', 'Too-much'],
                                       right=True, include_lowest=True)
    # The first bin is [0, 1], so amounts of 0 or 1 count as "no gain" / "no loss".
    cleaned['capital_gain'] = pd.cut(cleaned['capital_gain'], [0, 1, 10000000],
                                     labels=['No-Gain', 'Positive-Gain'],
                                     right=True, include_lowest=True)
    cleaned['capital_loss'] = pd.cut(cleaned['capital_loss'], [0, 1, 10000000],
                                     labels=['No-Loss', 'Positive-Loss'],
                                     right=True, include_lowest=True)

    return cleaned

In [20]:
# Small hand-made transaction list: each word is split into its letters,
# one letter per item (repeated letters are kept).
data = [list(word) for word in ('MONKEY', 'DONKEY', 'MAKE', 'MUCKY', 'COOKIE')]

def open_data():
    """Return the cleaned census rows as a list of per-row value lists.

    Reads the module-level ``df``, runs it through ``clean_data`` and converts
    the resulting frame with ``.values.tolist()``.
    """
    return clean_data(df).values.tolist()
# Module-level transaction list: gen_L1() reads this name directly and the
# apriori() calls further down pass it in.
dataset= open_data()
dataset

[['Middle-aged',
  'State-gov',
  'Bachelors',
  'Never-married',
  'Adm-clerical',
  'Not-in-family',
  'White',
  'Male',
  'Positive-Gain',
  'No-Loss',
  'Full-time',
  'United-States',
  '<=50K'],
 ['Senior',
  'Self-emp-not-inc',
  'Bachelors',
  'Married-civ-spouse',
  'Exec-managerial',
  'Husband',
  'White',
  'Male',
  'No-Gain',
  'No-Loss',
  'Part-time',
  'United-States',
  '<=50K'],
 ['Middle-aged',
  'Private',
  'HS-grad',
  'Divorced',
  'Handlers-cleaners',
  'Not-in-family',
  'White',
  'Male',
  'No-Gain',
  'No-Loss',
  'Full-time',
  'United-States',
  '<=50K'],
 ['Senior',
  'Private',
  '11th',
  'Married-civ-spouse',
  'Handlers-cleaners',
  'Husband',
  'Black',
  'Male',
  'No-Gain',
  'No-Loss',
  'Full-time',
  'United-States',
  '<=50K'],
 ['Middle-aged',
  'Private',
  'Bachelors',
  'Married-civ-spouse',
  'Prof-specialty',
  'Wife',
  'Black',
  'Female',
  'No-Gain',
  'No-Loss',
  'Full-time',
  'Cuba',
  '<=50K'],
 ['Middle-aged',
  'Private',
  '

In [21]:
#obtain C1 with support count


def obtain_C1(dataset):
    """Count occurrences of every individual item across all transactions.

    Parameters
    ----------
    dataset : iterable of iterables
        Each inner iterable is one transaction made of hashable items.

    Returns
    -------
    dict
        Maps item -> number of occurrences, keyed in first-seen order. An item
        repeated inside one transaction is counted once per repetition.
    """
    tally = {}
    for transaction in dataset:
        for entry in transaction:
            tally[entry] = tally.get(entry, 0) + 1
    return tally
            

            
# Module-level candidate 1-itemset counts; gen_L1() reads this name directly.
# Note that this cleans the frame again via open_data() instead of reusing `dataset`.
C1=obtain_C1(open_data())
C1

{'Middle-aged': 15721,
 'State-gov': 1279,
 'Bachelors': 5044,
 'Never-married': 9726,
 'Adm-clerical': 3721,
 'Not-in-family': 7726,
 'White': 25933,
 'Male': 20380,
 'Positive-Gain': 2538,
 'No-Loss': 28735,
 'Full-time': 17704,
 'United-States': 27504,
 '<=50K': 22654,
 'Senior': 7299,
 'Self-emp-not-inc': 2499,
 'Married-civ-spouse': 14065,
 'Exec-managerial': 3992,
 'Husband': 12463,
 'No-Gain': 27624,
 'Part-time': 3261,
 'Private': 22286,
 'HS-grad': 9840,
 'Divorced': 4214,
 'Handlers-cleaners': 1350,
 '11th': 1048,
 'Black': 2817,
 'Prof-specialty': 4038,
 'Wife': 1406,
 'Female': 9782,
 'Cuba': 92,
 'Masters': 1627,
 '9th': 455,
 'Married-spouse-absent': 370,
 'Other-service': 3212,
 'Jamaica': 80,
 'Over-time': 8145,
 '>50K': 7508,
 'Some-college': 6678,
 'Too-much': 1052,
 'Asian-Pac-Islander': 895,
 'India': 100,
 'Young': 6413,
 'Own-child': 4466,
 'Assoc-acdm': 1008,
 'Sales': 3584,
 '7th-8th': 557,
 'Transport-moving': 1572,
 'Amer-Indian-Eskimo': 286,
 'Mexico': 610,
 

In [22]:
#frequent 1-itemset
def gen_L1(min_sup, transactions=None, candidates=None):
    """Return the frequent 1-itemsets and apply transaction reduction.

    Parameters
    ----------
    min_sup : float
        Minimum support, as a fraction of the number of transactions.
    transactions : list of lists, optional
        Transaction list to measure support against. Defaults to the
        module-level ``dataset``. It is reduced IN PLACE (see below).
    candidates : dict, optional
        Item -> occurrence count, as produced by ``obtain_C1``. Defaults to
        the module-level ``C1``.

    Returns
    -------
    list
        Items whose count / number of transactions is >= ``min_sup``, in the
        iteration order of ``candidates``.

    Side effects
    ------------
    When at least one frequent item is found, every transaction of length 1 is
    removed from ``transactions`` in place.
    """
    # Fall back to the notebook globals so existing ``gen_L1(min_sup)`` calls keep working.
    if transactions is None:
        transactions = dataset
    if candidates is None:
        candidates = C1

    # The support denominator is taken before any transaction is removed.
    num_items = float(len(transactions))
    if num_items == 0:
        # No transactions: nothing can be frequent (and avoids dividing by zero).
        return []

    L1 = [item for item, hits in candidates.items() if hits / num_items >= min_sup]

    if L1:
        # One filtering pass with slice assignment. Calling list.remove() while
        # iterating the same list skips the element after each removal, and the
        # pass does not need repeating for every frequent item.
        transactions[:] = [trans for trans in transactions if len(trans) != 1]

    return L1
          
L1=gen_L1(0.6)
L1

['White', 'Male', 'No-Loss', 'United-States', '<=50K', 'No-Gain', 'Private']

In [23]:

#generate Ck
def apriori_gen(L, k):
    """Build the candidate k-itemsets from the items in ``L``.

    Parameters
    ----------
    L : iterable
        Items to combine.
    k : int
        Size of each candidate itemset.

    Returns
    -------
    list of tuple
        Every k-element combination of ``L``, in the order produced by
        ``itertools.combinations``.
    """
    return list(combinations(L, k))

#C2 = apriori_gen(L1,2)
#C2

In [24]:
#find Lk
def prune(dataset, Ck, min_sup, k):
    """Keep the candidates in ``Ck`` that are frequent, then reduce ``dataset``.

    Parameters
    ----------
    dataset : list of lists
        Transactions made of hashable items. Reduced IN PLACE (see below).
    Ck : iterable of tuple
        Candidate itemsets to test.
    min_sup : float
        Minimum support, as a fraction of the number of transactions.
    k : int
        Size of the itemsets in ``Ck``. Accepted for interface compatibility;
        the computation does not use it.

    Returns
    -------
    list of tuple
        Candidates whose support is >= ``min_sup``, in the order in which they
        were first matched while scanning the transactions.

    Side effects
    ------------
    When at least one frequent itemset is found, every transaction of length 1
    is removed from ``dataset`` in place.
    """
    # Build each candidate's item set once, not once per transaction.
    candidate_sets = [(itemset, frozenset(itemset)) for itemset in Ck]

    count = {}
    for trans in dataset:
        # One set per transaction makes every subset test a plain set comparison.
        present = set(trans)
        for itemset, members in candidate_sets:
            if members <= present:
                count[itemset] = count.get(itemset, 0) + 1

    # The support denominator is taken before any transaction is removed.
    # A non-empty ``count`` implies a non-empty dataset, so no division by zero.
    num_items = float(len(dataset))
    Lk = [itemset for itemset, hits in count.items() if hits / num_items >= min_sup]

    if Lk:
        # One filtering pass with slice assignment. Calling list.remove() while
        # iterating the same list skips the element after each removal.
        dataset[:] = [trans for trans in dataset if len(trans) != 1]

    return Lk
#prune(dataset,C2, .6,2)
#dataset

In [30]:
def apriori(dataset, min_sup=0.7):
    """Find the frequent itemsets of every size in ``dataset``.

    Parameters
    ----------
    dataset : list of lists
        Transactions made of hashable items. The list passed in is not
        modified; the search works on a private copy.
    min_sup : float, default 0.7
        Minimum support, as a fraction of the number of transactions.

    Returns
    -------
    list of lists
        Element 0 holds the frequent single items; element ``i`` (i >= 1) holds
        the frequent (i+1)-itemsets as tuples. The empty level that ends the
        search is left out, so the result is ``[]`` when no item is frequent.
    """
    # Shallow copy: prune() removes transactions in place, and doing that on the
    # caller's list would make a second run see different data than the first.
    D = list(dataset)

    # L1 is derived from the transactions passed in, not from notebook-level
    # globals, so the function also works on other datasets.
    C1 = obtain_C1(D)
    num_items = float(len(D))
    L1 = [item for item, hits in C1.items() if hits / num_items >= min_sup]

    if L1:
        # Same reduction gen_L1 applies: single-item transactions cannot contain
        # a larger itemset. Supports for k >= 2 are therefore measured against
        # the transactions that remain.
        D = [trans for trans in D if len(trans) != 1]

    L = [L1]
    k = 2
    # Stop as soon as a level comes back empty.
    while len(L[k - 2]) > 0:
        # Candidates are all k-combinations of the frequent single items.
        Ck = apriori_gen(L1, k)
        Lk = prune(D, Ck, min_sup, k)
        L.append(Lk)
        k += 1

    # The last level appended is always the empty one that ended the loop.
    return L[:-1]
                

In [39]:
# Time one full apriori run at 50% minimum support.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is a wall clock that can be adjusted mid-run.
start = time.perf_counter()
apriori(dataset,0.5)
run_time = time.perf_counter() - start
run_time

10.935792207717896

In [27]:
# Run with the default min_sup=0.7 and display the frequent itemsets, one list per itemset size.
apriori(dataset)

[['White', 'No-Loss', 'United-States', '<=50K', 'No-Gain', 'Private'],
 [('White', 'No-Loss'),
  ('White', 'United-States'),
  ('No-Loss', 'United-States'),
  ('No-Loss', '<=50K'),
  ('White', 'No-Gain'),
  ('No-Loss', 'No-Gain'),
  ('United-States', 'No-Gain'),
  ('<=50K', 'No-Gain'),
  ('No-Loss', 'Private')],
 [('White', 'No-Loss', 'United-States'),
  ('White', 'No-Loss', 'No-Gain'),
  ('White', 'United-States', 'No-Gain'),
  ('No-Loss', 'United-States', 'No-Gain')]]