In [1]:
# PrefixSpan

import sys
import pandas as pd
import numpy as np

# Marker item put at the head of an element to show that the element is the
# remainder of a partially matched element: SquencePattern.append merges such
# an element into the pattern's last element instead of starting a new one.
PLACE_HOLDER = '_'

def _load_baskets(csv_path, time_column):
    """Build the uid x dptno basket table from one trade CSV file.

    Steps:
      1. read the columns ``uid``, ``vipno``, ``dptno`` and ``time_column``;
      2. rank every row inside its ``vipno`` group by timestamp, the most
         recent row getting rank 1;
      3. keep the rows whose rank is <= 60% of the largest rank of their
         ``vipno`` group;
      4. pivot the kept rows so that each ``uid`` is a row and each ``dptno``
         a column, the cell holding the ``dptno`` value itself when the pair
         occurs and 0 otherwise.

    Returns the pivoted DataFrame (columns are a two-level index whose first
    level is the literal ``'value'``).
    """
    # file input
    data = pd.read_csv(csv_path, usecols=['uid', 'vipno', time_column, 'dptno'])
    data['timestamp'] = pd.to_datetime(data[time_column])

    # sort
    data = data.sort_values(['vipno', 'timestamp'], ascending=[True, True])

    # rank inside every vipno group, newest row first
    by_vip = data.groupby('vipno')
    data['rank'] = by_vip['timestamp'].rank(ascending=False, method='first')

    # take top 60% in every group; a per-row mask built with transform()
    # avoids groupby.apply, whose handling of the grouping column differs
    # between pandas versions
    rank_limit = 0.6 * data.groupby('vipno')['rank'].transform('max')
    kept = data.loc[data['rank'] <= rank_limit, ['uid', 'dptno']]
    kept = kept.reset_index(drop=True)

    # merge by uid; name the value column explicitly instead of passing the
    # whole frame as ``values``
    kept = kept.assign(value=kept['dptno'])
    baskets = kept.pivot_table(values=['value'], index=['uid'], columns=['dptno'])

    return baskets.fillna(0)


def input(path='trade.csv'):
    """Load the basket table from ``path`` (timestamp column ``sldat``).

    The name shadows the ``input`` builtin; it is kept because the script's
    entry point calls it under this name.
    """
    return _load_baskets(path, 'sldat')


def input_new(path='trade_new.csv'):
    """Load the basket table from ``path`` (timestamp column ``sldatime``)."""
    return _load_baskets(path, 'sldatime')

def createInitSet(data_set):
    """Turn the pivoted basket table into the sequence database.

    Every row of ``data_set`` becomes one sequence holding a single element:
    the list of the row's non-zero cells, each converted with ``str``.

    Returns a list of sequences, i.e. ``[[[item, ...]], [[item, ...]], ...]``.
    """
    # ``.values`` replaces ``as_matrix()``, which no longer exists in
    # pandas >= 1.0; both give the frame as a 2-D array.
    data_array = data_set.values

    # clear all 0
    return [[[str(x) for x in row if x != 0.0]] for row in data_array]

In [2]:
class SquencePattern:
    """A sequential pattern together with its support count.

    ``squence`` is a list of elements, each element being a list of items.
    The attribute spelling ("squence") is kept because the functions in this
    file read it under that name.
    """

    def __init__(self, squence, support):
        # Copy every element so the pattern never aliases the caller's lists.
        self.squence = [list(element) for element in squence]
        self.support = support

    def append(self, p):
        """Extend this pattern in place with the pattern ``p``.

        If the first element of ``p`` starts with ``PLACE_HOLDER``, the rest
        of that element is merged into this pattern's last element and the
        remaining elements of ``p`` are appended after it. Otherwise every
        element of ``p`` is appended as a new element. The support becomes
        the smaller of the two supports. ``p`` itself is left unchanged.
        """
        if p.squence[0][0] == PLACE_HOLDER:
            # Strip the marker from a copy: removing it from p's own list
            # would silently alter p for any later use.
            first_e = list(p.squence[0])
            first_e.remove(PLACE_HOLDER)
            self.squence[-1].extend(first_e)
            tail = p.squence[1:]
        else:
            tail = p.squence
        # Copy the appended elements so the two patterns share no lists.
        self.squence.extend(list(element) for element in tail)
        self.support = min(self.support, p.support)


def prefixSpan(pattern, S, threshold):
    """Recursively mine the patterns that extend ``pattern`` within ``S``.

    For every frequent extension reported by ``frequent_items`` a copy of
    ``pattern`` is grown by that extension and recorded, then mining continues
    on the database projected on the grown pattern.

    Returns the list of all patterns found, each grown pattern being followed
    by the patterns mined from its projected database.
    """
    found = []
    for extension in frequent_items(S, pattern, threshold):
        # Grow a fresh copy so the caller's pattern is not modified.
        grown = SquencePattern(pattern.squence, pattern.support)
        grown.append(extension)
        found.append(grown)

        projected = build_projected_database(S, grown)
        found += prefixSpan(grown, projected, threshold)
    return found


def frequent_items(S, pattern, threshold):
    """Find the one-item extensions of ``pattern`` that are frequent in ``S``.

    ``S`` is a list of sequences (each a list of elements, each element a
    list of items). Two kinds of extension are counted:

    * items that would join the last element of ``pattern`` (reported as
      ``[[PLACE_HOLDER, item]]``);
    * items that would start a new element (reported as ``[[item]]``).

    Returns a list of ``SquencePattern`` objects whose count is at least
    ``threshold``, sorted by ascending support. An empty or ``None`` ``S``
    gives an empty list.
    """
    if not S:
        return []

    last_e = pattern.squence[-1] if pattern.squence else []
    items = {}    # item -> count for extensions that open a new element
    _items = {}   # item -> count for extensions of the last element

    for s in S:
        # class 1: the first element contains every item of the pattern's
        # last element; the items after last_e[-1] can join that element.
        if last_e and all(item in s[0] for item in last_e):
            index = s[0].index(last_e[-1])
            for item in s[0][index + 1:]:
                _items[item] = _items.get(item, 0) + 1

        # class 2: the first element carries the placeholder; everything
        # after its first position can join the pattern's last element, and
        # that element is left out of the class 3 count.
        if PLACE_HOLDER in s[0]:
            for item in s[0][1:]:
                _items[item] = _items.get(item, 0) + 1
            s = s[1:]

        # class 3: each distinct item of the remaining elements counts once
        # per sequence.
        counted = set()
        for element in s:
            for item in element:
                if item not in counted:
                    counted.add(item)
                    items[item] = items.get(item, 0) + 1

    # dict.items() exists on both Python 2 and 3 (iteritems() is 2-only).
    f_list = [SquencePattern([[PLACE_HOLDER, k]], v)
              for k, v in _items.items()
              if v >= threshold]
    f_list.extend(SquencePattern([[k]], v)
                  for k, v in items.items()
                  if v >= threshold)
    return sorted(f_list, key=lambda p: p.support)
    


def build_projected_database(S, pattern):
    """
    Project the database S on the last element of pattern.

    S is assumed to be already projected on the prefix of pattern, so only
    the pattern's last element is needed. For each sequence the first
    matching element is located; what follows the pattern's last item is
    kept, with PLACE_HOLDER standing in for the matched part of a partially
    consumed element. Sequences with nothing left are dropped.
    """
    last_element = pattern.squence[-1]
    last_item = last_element[-1]
    multi_item_tail = len(last_element) > 1

    projected = []
    for sequence in S:
        suffix = []
        for position, element in enumerate(sequence):
            if PLACE_HOLDER in element:
                # A placeholder element can only continue a multi-item tail.
                matches = multi_item_tail and last_item in element
            else:
                matches = all(item in element for item in last_element)
            if not matches:
                continue

            item_at = element.index(last_item)
            if item_at == len(element) - 1:
                # The element is fully consumed: keep what comes after it.
                suffix = sequence[position + 1:]
            else:
                # Keep the rest of the element, marking the consumed item.
                suffix = sequence[position:]
                remainder = element[item_at:]
                remainder[0] = PLACE_HOLDER
                suffix[0] = remainder
            break
        if suffix:
            projected.append(suffix)

    return projected


def print_patterns(patterns):
    """Print one line per pattern showing its sequence and its support."""
    template = "pattern:{0}, support:{1}"
    for found in patterns:
        line = template.format(found.squence, found.support)
        print(line)



In [3]:
if __name__ == "__main__":

    # Mine both datasets with the same series of minimum supports.
    for load in (input, input_new):
        S = createInitSet(load())

        for min_support in [2, 4, 8, 16, 32, 64]:
            # Parenthesised single-argument print behaves the same on
            # Python 2 and Python 3.
            print("\nmin_support = %d: " % min_support)
            # The empty root pattern starts with the largest available
            # support so the first real support always wins in min();
            # sys.maxsize exists on both Python 2 and 3 (sys.maxint does not).
            patterns = prefixSpan(SquencePattern([], sys.maxsize), S, min_support)
            print_patterns(patterns)


min_support = 2: 
pattern:[['14570.0']], support:2
pattern:[['14644.0']], support:2
pattern:[['22501.0']], support:2
pattern:[['30803.0']], support:2
pattern:[['15021.0']], support:2
pattern:[['14838.0']], support:2
pattern:[['14932.0']], support:2
pattern:[['14141.0']], support:2
pattern:[['14141.0', '30380.0']], support:2
pattern:[['11056.0']], support:2
pattern:[['10100.0']], support:2
pattern:[['10100.0', '22008.0']], support:2
pattern:[['14101.0']], support:2
pattern:[['11044.0']], support:2
pattern:[['14286.0']], support:2
pattern:[['22851.0']], support:2
pattern:[['14772.0']], support:2
pattern:[['14772.0', '25120.0']], support:2
pattern:[['14074.0']], support:2
pattern:[['22713.0']], support:2
pattern:[['22511.0']], support:2
pattern:[['21700.0']], support:2
pattern:[['10114.0']], support:2
pattern:[['21012.0']], support:2
pattern:[['14596.0']], support:2
pattern:[['32403.0']], support:2
pattern:[['21000.0']], support:2
pattern:[['11210.0']], support:2
pattern:[['34120.0']], s

pattern:[['10150.0']], support:26
pattern:[['10150.0', '10439.0']], support:2
pattern:[['10150.0', '10439.0', '22008.0']], support:2
pattern:[['10150.0', '10439.0', '22008.0', '24010.0']], support:2
pattern:[['10150.0', '10439.0', '22008.0', '23113.0']], support:2
pattern:[['10150.0', '10439.0', '22008.0', '23113.0', '24010.0']], support:2
pattern:[['10150.0', '10439.0', '15113.0']], support:2
pattern:[['10150.0', '10439.0', '15113.0', '22008.0']], support:2
pattern:[['10150.0', '10439.0', '15113.0', '22008.0', '24010.0']], support:2
pattern:[['10150.0', '10439.0', '15113.0', '22008.0', '23113.0']], support:2
pattern:[['10150.0', '10439.0', '15113.0', '22008.0', '23113.0', '24010.0']], support:2
pattern:[['10150.0', '10439.0', '15113.0', '24010.0']], support:2
pattern:[['10150.0', '10439.0', '15113.0', '15502.0']], support:2
pattern:[['10150.0', '10439.0', '15113.0', '15502.0', '22008.0']], support:2
pattern:[['10150.0', '10439.0', '15113.0', '15502.0', '22008.0', '24010.0']], support:

pattern:[['15110.0', '15200.0', '22102.0']], support:2
pattern:[['15110.0', '15200.0', '22102.0', '23110.0']], support:2
pattern:[['15110.0', '15200.0', '22102.0', '23110.0', '23113.0']], support:2
pattern:[['15110.0', '15200.0', '22102.0', '23113.0']], support:2
pattern:[['15110.0', '15200.0', '22100.0']], support:2
pattern:[['15110.0', '15200.0', '23113.0']], support:2
pattern:[['15110.0', '15200.0', '23110.0']], support:3
pattern:[['15110.0', '15200.0', '23110.0', '23132.0']], support:2
pattern:[['15110.0', '15200.0', '23110.0', '23113.0']], support:2
pattern:[['15110.0', '15200.0', '22008.0']], support:3
pattern:[['15110.0', '15200.0', '30380.0']], support:3
pattern:[['15110.0', '15200.0', '23132.0']], support:3
pattern:[['15110.0', '27002.0']], support:11
pattern:[['15110.0', '27002.0', '27300.0']], support:2
pattern:[['15110.0', '27002.0', '30380.0']], support:2
pattern:[['15110.0', '23113.0']], support:12
pattern:[['15110.0', '23113.0', '23132.0']], support:2
pattern:[['15110.0'

pattern:[['14834.0']], support:4
pattern:[['14072.0']], support:4
pattern:[['14836.0']], support:4
pattern:[['15116.0']], support:4
pattern:[['22604.0']], support:4
pattern:[['14700.0']], support:4
pattern:[['10136.0']], support:4
pattern:[['14811.0']], support:4
pattern:[['30323.0']], support:4
pattern:[['15104.0']], support:4
pattern:[['24111.0']], support:4
pattern:[['15433.0']], support:4
pattern:[['34511.0']], support:4
pattern:[['14861.0']], support:4
pattern:[['15022.0']], support:4
pattern:[['10008.0']], support:4
pattern:[['14841.0']], support:4
pattern:[['15012.0']], support:4
pattern:[['14839.0']], support:4
pattern:[['22840.0']], support:4
pattern:[['14112.0']], support:4
pattern:[['30320.0']], support:4
pattern:[['15111.0']], support:4
pattern:[['14243.0']], support:4
pattern:[['32143.0']], support:4
pattern:[['25405.0']], support:4
pattern:[['34121.0']], support:4
pattern:[['15029.0']], support:4
pattern:[['14636.0']], support:4
pattern:[['14903.0']], support:4
pattern:[[

pattern:[['14830.0']], support:8
pattern:[['22701.0']], support:8
pattern:[['14054.0']], support:8
pattern:[['10152.0']], support:8
pattern:[['14710.0']], support:8
pattern:[['30321.0']], support:8
pattern:[['25110.0']], support:8
pattern:[['24403.0']], support:8
pattern:[['14020.0']], support:8
pattern:[['14513.0']], support:8
pattern:[['14132.0']], support:8
pattern:[['14090.0']], support:8
pattern:[['23112.0']], support:9
pattern:[['14905.0']], support:9
pattern:[['10422.0']], support:9
pattern:[['22171.0']], support:9
pattern:[['24011.0']], support:9
pattern:[['14073.0']], support:9
pattern:[['15235.0']], support:9
pattern:[['22007.0']], support:9
pattern:[['14121.0']], support:10
pattern:[['15503.0']], support:10
pattern:[['14052.0']], support:10
pattern:[['14060.0']], support:10
pattern:[['11220.0']], support:10
pattern:[['10001.0']], support:10
pattern:[['27240.0']], support:10
pattern:[['25100.0']], support:10
pattern:[['15113.0']], support:10
pattern:[['15502.0']], support:10



min_support = 2: 
pattern:[['14600.0']], support:2
pattern:[['20101.0']], support:2
pattern:[['20101.0', '20110.0']], support:2
pattern:[['14644.0']], support:2
pattern:[['14926.0']], support:2
pattern:[['34150.0']], support:2
pattern:[['10400.0']], support:2
pattern:[['15100.0']], support:2
pattern:[['15100.0', '22102.0']], support:2
pattern:[['15100.0', '22102.0', '30380.0']], support:2
pattern:[['15100.0', '30380.0']], support:2
pattern:[['15100.0', '15200.0']], support:2
pattern:[['15100.0', '15200.0', '22102.0']], support:2
pattern:[['15100.0', '15200.0', '22102.0', '30380.0']], support:2
pattern:[['15100.0', '15200.0', '30380.0']], support:2
pattern:[['11121.0']], support:2
pattern:[['30803.0']], support:2
pattern:[['14113.0']], support:2
pattern:[['14141.0']], support:2
pattern:[['14862.0']], support:2
pattern:[['14862.0', '30380.0']], support:2
pattern:[['15005.0']], support:2
pattern:[['22715.0']], support:2
pattern:[['20202.0']], support:2
pattern:[['14133.0']], support:2
pa

pattern:[['14241.0', '30380.0']], support:7
pattern:[['14800.0']], support:18
pattern:[['14800.0', '14802.0']], support:2
pattern:[['14800.0', '14802.0', '30380.0']], support:2
pattern:[['14800.0', '22701.0']], support:2
pattern:[['14800.0', '25101.0']], support:2
pattern:[['14800.0', '22172.0']], support:2
pattern:[['14800.0', '22172.0', '25120.0']], support:2
pattern:[['14800.0', '22172.0', '25120.0', '30380.0']], support:2
pattern:[['14800.0', '22172.0', '30380.0']], support:2
pattern:[['14800.0', '15114.0']], support:3
pattern:[['14800.0', '15114.0', '25120.0']], support:2
pattern:[['14800.0', '15114.0', '25120.0', '27410.0']], support:2
pattern:[['14800.0', '15114.0', '27410.0']], support:2
pattern:[['14800.0', '15114.0', '30380.0']], support:2
pattern:[['14800.0', '25120.0']], support:3
pattern:[['14800.0', '25120.0', '27410.0']], support:2
pattern:[['14800.0', '25120.0', '30380.0']], support:2
pattern:[['14800.0', '22102.0']], support:3
pattern:[['14800.0', '22102.0', '24010.0']

pattern:[['15111.0', '27410.0']], support:5
pattern:[['15111.0', '27410.0', '30380.0']], support:5
pattern:[['15111.0', '15119.0']], support:9
pattern:[['15111.0', '15119.0', '15202.0']], support:2
pattern:[['15111.0', '15119.0', '30380.0']], support:3
pattern:[['15111.0', '15130.0']], support:10
pattern:[['15111.0', '15130.0', '22007.0']], support:2
pattern:[['15111.0', '15130.0', '22007.0', '30380.0']], support:2
pattern:[['15111.0', '15130.0', '22008.0']], support:2
pattern:[['15111.0', '15130.0', '22008.0', '27410.0']], support:2
pattern:[['15111.0', '15130.0', '22008.0', '27410.0', '30380.0']], support:2
pattern:[['15111.0', '15130.0', '22008.0', '30380.0']], support:2
pattern:[['15111.0', '15130.0', '27410.0']], support:4
pattern:[['15111.0', '15130.0', '27410.0', '30380.0']], support:4
pattern:[['15111.0', '15130.0', '30380.0']], support:8
pattern:[['15111.0', '30380.0']], support:16
pattern:[['22170.0']], support:35
pattern:[['22170.0', '27300.0']], support:2
pattern:[['22170.0

pattern:[['22101.0', '22111.0', '24101.0']], support:2
pattern:[['22101.0', '22111.0', '24101.0', '30380.0']], support:2
pattern:[['22101.0', '22111.0', '23132.0']], support:2
pattern:[['22101.0', '22111.0', '22132.0']], support:2
pattern:[['22101.0', '22111.0', '22132.0', '30380.0']], support:2
pattern:[['22101.0', '22111.0', '27100.0']], support:2
pattern:[['22101.0', '22111.0', '27300.0']], support:3
pattern:[['22101.0', '22111.0', '27300.0', '30380.0']], support:3
pattern:[['22101.0', '22111.0', '22172.0']], support:3
pattern:[['22101.0', '22111.0', '22172.0', '27300.0']], support:2
pattern:[['22101.0', '22111.0', '22172.0', '27300.0', '30380.0']], support:2
pattern:[['22101.0', '22111.0', '22172.0', '30380.0']], support:3
pattern:[['22101.0', '22111.0', '22130.0']], support:4
pattern:[['22101.0', '22111.0', '22130.0', '30380.0']], support:3
pattern:[['22101.0', '22111.0', '30380.0']], support:9
pattern:[['22101.0', '30380.0']], support:18
pattern:[['22101.0', '22102.0']], support:

pattern:[['15115.0', '22172.0']], support:2
pattern:[['15115.0', '25120.0']], support:3
pattern:[['15115.0', '25120.0', '27000.0']], support:2
pattern:[['15115.0', '25120.0', '30380.0']], support:2
pattern:[['15115.0', '27300.0']], support:3
pattern:[['15115.0', '27300.0', '30380.0']], support:3
pattern:[['15115.0', '15140.0']], support:3
pattern:[['15115.0', '15140.0', '30380.0']], support:2
pattern:[['15115.0', '32416.0']], support:3
pattern:[['15115.0', '22102.0']], support:3
pattern:[['15115.0', '22102.0', '30380.0']], support:3
pattern:[['15115.0', '15202.0']], support:3
pattern:[['15115.0', '15202.0', '22008.0']], support:2
pattern:[['15115.0', '15202.0', '22008.0', '30380.0']], support:2
pattern:[['15115.0', '15202.0', '30380.0']], support:2
pattern:[['15115.0', '22020.0']], support:3
pattern:[['15115.0', '22020.0', '30380.0']], support:2
pattern:[['15115.0', '15232.0']], support:3
pattern:[['15115.0', '25111.0']], support:3
pattern:[['15115.0', '25111.0', '30380.0']], support:2

pattern:[['15110.0', '22008.0', '22102.0']], support:4
pattern:[['15110.0', '22008.0', '22102.0', '22111.0']], support:2
pattern:[['15110.0', '22008.0', '22102.0', '22111.0', '23132.0']], support:2
pattern:[['15110.0', '22008.0', '22102.0', '23132.0']], support:2
pattern:[['15110.0', '22008.0', '22102.0', '30380.0']], support:2
pattern:[['15110.0', '22008.0', '30380.0']], support:8
pattern:[['15110.0', '15115.0']], support:20
pattern:[['15110.0', '15115.0', '25120.0']], support:2
pattern:[['15110.0', '15115.0', '25120.0', '27000.0']], support:2
pattern:[['15110.0', '15115.0', '22000.0']], support:2
pattern:[['15110.0', '15115.0', '22000.0', '30380.0']], support:2
pattern:[['15110.0', '15115.0', '15119.0']], support:2
pattern:[['15110.0', '15115.0', '22008.0']], support:2
pattern:[['15110.0', '15115.0', '22008.0', '27410.0']], support:2
pattern:[['15110.0', '15115.0', '22036.0']], support:2
pattern:[['15110.0', '15115.0', '22036.0', '30380.0']], support:2
pattern:[['15110.0', '15115.0',

pattern:[['14850.0']], support:4
pattern:[['15410.0']], support:4
pattern:[['30522.0']], support:4
pattern:[['14932.0']], support:4
pattern:[['14094.0']], support:4
pattern:[['14401.0']], support:4
pattern:[['14111.0']], support:4
pattern:[['14772.0']], support:4
pattern:[['15112.0']], support:4
pattern:[['22801.0']], support:4
pattern:[['11103.0']], support:4
pattern:[['15419.0']], support:4
pattern:[['30348.0']], support:4
pattern:[['25110.0']], support:4
pattern:[['11111.0']], support:4
pattern:[['14280.0']], support:4
pattern:[['24111.0']], support:4
pattern:[['21000.0']], support:4
pattern:[['34220.0']], support:4
pattern:[['27210.0']], support:4
pattern:[['34204.0']], support:4
pattern:[['15131.0']], support:4
pattern:[['14281.0']], support:4
pattern:[['10103.0']], support:4
pattern:[['14924.0']], support:4
pattern:[['14914.0']], support:4
pattern:[['14912.0']], support:4
pattern:[['21200.0']], support:4
pattern:[['14510.0']], support:4
pattern:[['14631.0']], support:4
pattern:[[

pattern:[['14917.0']], support:8
pattern:[['14074.0']], support:8
pattern:[['14552.0']], support:8
pattern:[['11222.0']], support:8
pattern:[['24401.0']], support:8
pattern:[['14043.0']], support:8
pattern:[['15459.0']], support:8
pattern:[['14531.0']], support:8
pattern:[['14843.0']], support:8
pattern:[['10135.0']], support:8
pattern:[['14839.0']], support:8
pattern:[['14075.0']], support:8
pattern:[['14593.0']], support:8
pattern:[['11221.0']], support:8
pattern:[['30365.0']], support:8
pattern:[['11104.0']], support:8
pattern:[['34121.0']], support:8
pattern:[['14261.0']], support:8
pattern:[['11522.0']], support:8
pattern:[['11200.0']], support:9
pattern:[['14407.0']], support:9
pattern:[['14801.0']], support:9
pattern:[['22110.0']], support:9
pattern:[['10136.0']], support:9
pattern:[['14596.0']], support:9
pattern:[['15605.0']], support:9
pattern:[['14956.0']], support:9
pattern:[['14072.0']], support:9
pattern:[['14521.0']], support:9
pattern:[['14100.0']], support:9
pattern:[[

pattern:[['15235.0']], support:16
pattern:[['40000.0']], support:16
pattern:[['14052.0']], support:17
pattern:[['10116.0']], support:17
pattern:[['11057.0']], support:17
pattern:[['14073.0']], support:17
pattern:[['14033.0']], support:17
pattern:[['22702.0']], support:17
pattern:[['15440.0']], support:17
pattern:[['22610.0']], support:18
pattern:[['14594.0']], support:18
pattern:[['14812.0']], support:18
pattern:[['14013.0']], support:18
pattern:[['22190.0']], support:18
pattern:[['14241.0']], support:18
pattern:[['14800.0']], support:18
pattern:[['15139.0']], support:18
pattern:[['14403.0']], support:19
pattern:[['11056.0']], support:19
pattern:[['14054.0']], support:19
pattern:[['22671.0']], support:19
pattern:[['20121.0']], support:19
pattern:[['25121.0']], support:19
pattern:[['15402.0']], support:19
pattern:[['10140.0']], support:20
pattern:[['14815.0']], support:20
pattern:[['15116.0']], support:20
pattern:[['10152.0']], support:20
pattern:[['22171.0']], support:21
pattern:[['142

pattern:[['24101.0']], support:67
pattern:[['14091.0']], support:67
pattern:[['22130.0']], support:67
pattern:[['14092.0']], support:71
pattern:[['25111.0']], support:71
pattern:[['27400.0']], support:80
pattern:[['15200.0']], support:81
pattern:[['15202.0']], support:84
pattern:[['22103.0']], support:102
pattern:[['27200.0']], support:105
pattern:[['25120.0']], support:109
pattern:[['23132.0']], support:112
pattern:[['22036.0']], support:113
pattern:[['15115.0']], support:114
pattern:[['10141.0']], support:116
pattern:[['22008.0']], support:117
pattern:[['15114.0']], support:119
pattern:[['24010.0']], support:121
pattern:[['22111.0']], support:122
pattern:[['27002.0']], support:129
pattern:[['25101.0']], support:135
pattern:[['27300.0']], support:155
pattern:[['15130.0']], support:168
pattern:[['15130.0', '30380.0']], support:73
pattern:[['22102.0']], support:215
pattern:[['22102.0', '30380.0']], support:71
pattern:[['27410.0']], support:223
pattern:[['27410.0', '30380.0']], support:6