In [1]:
# PrefixSpan

import sys
import pandas as pd
import numpy as np

PLACE_HOLDER = '_'

def _load_transactions(csv_path, date_column):
    """
    Read a trade CSV and turn it into a uid x bndno table.

    csv_path    -- path of the CSV file; it must provide the columns
                   'uid', 'vipno', 'bndno' and `date_column`
    date_column -- name of the column holding the sale date/time

    Returns a DataFrame indexed by 'uid' with one column per 'bndno'
    (built by pivot_table with its default mean aggregation over the
    helper column 'value', which is a copy of 'bndno'); cells with no
    purchase are filled with 0.
    """
    data = pd.read_csv(csv_path, usecols=['uid', 'vipno', date_column, 'bndno'])
    data['timestamp'] = pd.to_datetime(data[date_column])

    # order the rows by customer, then chronologically
    data.sort_values(['vipno', 'timestamp'], ascending=[True, True], inplace=True)

    # rank rows inside each vipno group; rank 1 is the latest timestamp
    data['rank'] = data['timestamp'].groupby(data['vipno']).rank(ascending=False, method='first')

    # keep, per vipno, the rows whose rank is within 60% of the group's
    # largest rank, i.e. the most recent 60% of that customer's rows.
    # transform('max') broadcasts each group's maximum back onto its rows,
    # so a single boolean mask replaces a per-group Python lambda.
    # NOTE(review): if the earliest 60% was intended, the rank direction
    # above should be ascending -- confirm with the author.
    group_max_rank = data.groupby('vipno')['rank'].transform('max')
    kept = data[data['rank'] <= (0.6 * group_max_rank)]

    # only uid and bndno are needed from here on
    data_set = kept.drop(['rank', 'timestamp', date_column, 'vipno'], axis=1).reset_index(drop=True)

    # merge by uid: 'value' duplicates bndno so the pivot has a value to aggregate
    data_set['value'] = data_set['bndno']
    data_set = data_set.pivot_table(data_set, index=['uid'], columns=['bndno'])

    return data_set.fillna(0)


def input():
    """
    Load 'trade.csv' (date column 'sldat') as a uid x bndno table.

    The name shadows the builtin input(); it is kept because callers in
    this file use it.
    """
    return _load_transactions('trade.csv', 'sldat')


def input_new():
    """
    Load 'trade_new.csv' (date column 'sldatime') as a uid x bndno table.
    """
    return _load_transactions('trade_new.csv', 'sldatime')

def createInitSet(data_set):
    """
    Convert a pivoted table into the sequence database used by prefixSpan.

    data_set -- DataFrame whose rows are transactions; a cell equal to 0
                means "item absent"

    Returns a list with one sequence per row. Each sequence holds a single
    element (itemset): the string form of every non-zero cell of that row,
    in column order. A row made only of zeros yields [[]].
    """
    # .values replaces DataFrame.as_matrix(), which newer pandas no longer has
    return [[[str(x) for x in row if x != 0.0]] for row in data_set.values]

In [2]:
class SquencePattern(object):
    """
    A sequential pattern together with its support count.

    squence -- list of elements, each element being a list of items
    support -- support value attached to the pattern
    """

    def __init__(self, squence, support):
        # copy every element so the pattern never shares lists with its source
        self.squence = [list(s) for s in squence]
        self.support = support

    def __repr__(self):
        return "SquencePattern({0!r}, {1!r})".format(self.squence, self.support)

    def append(self, p):
        """
        Extend this pattern in place with pattern `p`.

        If the first element of `p` starts with PLACE_HOLDER, the remaining
        items of that element are merged into this pattern's last element;
        otherwise every element of `p` is added as a new element. The
        support becomes the smaller of the two supports.

        `p` is left untouched: its elements are copied, never modified or
        shared with this pattern.
        """
        if p.squence[0][0] == PLACE_HOLDER:
            # the placeholder sits at index 0, so dropping it is a slice
            self.squence[-1].extend(p.squence[0][1:])
            tail = p.squence[1:]
        else:
            tail = p.squence
        self.squence.extend(list(element) for element in tail)
        self.support = min(self.support, p.support)


def prefixSpan(pattern, S, threshold):
    """
    Recursively grow `pattern` over the projected database `S`.

    pattern   -- SquencePattern used as the current prefix
    S         -- database of sequences projected on that prefix
    threshold -- minimum support an extension needs to be kept

    Returns the list of patterns found: each grown pattern is followed
    directly by the patterns obtained by growing it further.
    """
    found = []
    for extension in frequent_items(S, pattern, threshold):
        # work on a copy so the caller's prefix stays unchanged
        grown = SquencePattern(pattern.squence, pattern.support)
        grown.append(extension)
        found.append(grown)

        # recurse on the sequences that still contain the grown prefix
        projected = build_projected_database(S, grown)
        found.extend(prefixSpan(grown, projected, threshold))
    return found


def frequent_items(S, pattern, threshold):
    """
    Find the single-item extensions of `pattern` that are frequent in `S`.

    S         -- database of sequences (each a list of elements, each
                 element a list of items) projected on `pattern`
    pattern   -- SquencePattern used as the current prefix
    threshold -- minimum count an item needs to be reported

    Returns a list of one-item SquencePattern objects sorted by ascending
    support. Items that extend the last element of `pattern` are returned
    as [[PLACE_HOLDER, item]]; items that open a new element are returned
    as [[item]]. An empty or missing database yields [].
    """
    if S is None or len(S) == 0:
        return []

    last_e = pattern.squence[-1] if pattern.squence else []

    items = {}   # item -> number of sequences in which it can open a new element
    _items = {}  # item -> number of times it can extend the pattern's last element

    for s in S:
        if not s:
            # an empty sequence contributes nothing and has no first element
            continue
        head = s[0]

        # class 1: the first element holds the whole last element of the
        # pattern, so the items after its last item can extend that element
        if last_e and all(item in head for item in last_e):
            index = head.index(last_e[-1])
            for item in head[index + 1:]:
                _items[item] = _items.get(item, 0) + 1

        # class 2: the first element is a projected remainder (it carries the
        # placeholder); its other items extend the last element, and the
        # element is then left out of the class 3 count
        if PLACE_HOLDER in head:
            for item in head[1:]:
                _items[item] = _items.get(item, 0) + 1
            s = s[1:]

        # class 3: every distinct item of the remaining elements can open a
        # new element; count it once per sequence
        counted = set()
        for element in s:
            for item in element:
                if item not in counted:
                    counted.add(item)
                    items[item] = items.get(item, 0) + 1

    # .items() works on both Python 2 and Python 3 dictionaries
    f_list = [SquencePattern([[PLACE_HOLDER, k]], v)
              for k, v in _items.items()
              if v >= threshold]
    f_list.extend([SquencePattern([[k]], v)
                   for k, v in items.items()
                   if v >= threshold])
    return sorted(f_list, key=lambda p: p.support)
    


def build_projected_database(S, pattern):
    """
    suppose S is projected database base on pattern's prefix,
    so we only need to use the last element in pattern to
    build projected database

    S       -- list of sequences (each a list of elements, each element
               a list of items)
    pattern -- object whose `squence` attribute is a non-empty list of
               non-empty elements

    Returns a new list holding, for every sequence that contains the last
    element of the pattern, what follows the first match. When items remain
    after the matched item inside the matching element, that element is kept
    with PLACE_HOLDER standing in for the matched item. Sequences with
    nothing left are dropped. The input sequences are not modified.
    """
    last_e = pattern.squence[-1]
    last_item = last_e[-1]
    multi_item_tail = len(last_e) > 1

    p_S = []
    for s in S:
        p_s = []
        # enumerate supplies the element's position directly; the first
        # matching element is always the first one equal to itself
        for e_index, element in enumerate(s):
            if PLACE_HOLDER in element:
                # a projected remainder can only continue a multi-item element
                is_prefix = multi_item_tail and last_item in element
            else:
                is_prefix = all(item in element for item in last_e)
            if not is_prefix:
                continue

            i_index = element.index(last_item)
            if i_index == len(element) - 1:
                # matched item closes the element: keep what comes after it
                p_s = s[e_index + 1:]
            else:
                # keep the rest of the element, placeholder marking the match
                p_s = s[e_index:]
                e = element[i_index:]
                e[0] = PLACE_HOLDER
                p_s[0] = e
            break
        if p_s:
            p_S.append(p_s)

    return p_S


def print_patterns(patterns):
    """
    Print one line per pattern showing its `squence` and its `support`.
    """
    line_format = "pattern:{0}, support:{1}"
    for found in patterns:
        print(line_format.format(found.squence, found.support))



In [3]:
if __name__ == "__main__":

    # Mine both data sets, one after the other, at every support threshold.
    # Each data set is loaded once and reused for all thresholds.
    for load in (input, input_new):
        S = createInitSet(load())

        for min_support in [2, 4, 8, 16, 32, 64]:
            # print(...) with a single argument behaves the same on Python 2 and 3
            print("\nmin_support = %d: " % min_support)
            # the empty start pattern gets a huge support so that min() in
            # SquencePattern.append always keeps the support of the appended item;
            # sys.maxsize exists on both Python 2 and Python 3
            patterns = prefixSpan(SquencePattern([], sys.maxsize), S, min_support)
            print_patterns(patterns)


min_support = 2: 
pattern:[['11008.0']], support:2
pattern:[['34060.0']], support:2
pattern:[['14830.0']], support:2
pattern:[['14556.0']], support:2
pattern:[['34039.0']], support:2
pattern:[['14834.0']], support:2
pattern:[['14558.0']], support:2
pattern:[['14445.0']], support:2
pattern:[['14167.0']], support:2
pattern:[['14121.0']], support:2
pattern:[['14076.0']], support:2
pattern:[['11131.0']], support:2
pattern:[['11119.0']], support:2
pattern:[['10005.0']], support:2
pattern:[['14123.0']], support:2
pattern:[['14123.0', '14362.0']], support:2
pattern:[['11327.0']], support:2
pattern:[['10154.0']], support:2
pattern:[['14246.0']], support:2
pattern:[['30727.0']], support:2
pattern:[['14010.0']], support:2
pattern:[['14010.0', '30248.0']], support:2
pattern:[['14703.0']], support:2
pattern:[['14703.0', '14753.0']], support:2
pattern:[['14020.0']], support:2
pattern:[['14736.0']], support:2
pattern:[['14784.0']], support:2
pattern:[['14784.0', '15094.0']], support:2
pattern:[['14

pattern:[['14838.0']], support:4
pattern:[['11048.0']], support:4
pattern:[['11204.0']], support:4
pattern:[['10198.0']], support:4
pattern:[['14805.0']], support:4
pattern:[['14319.0']], support:4
pattern:[['14258.0']], support:4
pattern:[['11224.0']], support:4
pattern:[['14357.0']], support:4
pattern:[['34214.0']], support:4
pattern:[['14345.0']], support:4
pattern:[['10199.0']], support:4
pattern:[['15026.0']], support:4
pattern:[['15028.0']], support:4
pattern:[['10710.0']], support:4
pattern:[['14224.0']], support:4
pattern:[['14208.0']], support:4
pattern:[['15078.0']], support:4
pattern:[['14126.0']], support:4
pattern:[['14041.0']], support:4
pattern:[['11344.0']], support:4
pattern:[['14341.0']], support:4
pattern:[['14318.0']], support:4
pattern:[['14035.0']], support:4
pattern:[['14005.0']], support:4
pattern:[['15073.0']], support:4
pattern:[['14268.0']], support:4
pattern:[['14350.0']], support:4
pattern:[['30192.0']], support:4
pattern:[['14393.0']], support:4
pattern:[[


min_support = 2: 
pattern:[['10679.0']], support:2
pattern:[['14179.0']], support:2
pattern:[['14179.0', '30248.0']], support:2
pattern:[['14307.0']], support:2
pattern:[['30750.0']], support:2
pattern:[['30049.0']], support:2
pattern:[['30059.0']], support:2
pattern:[['14826.0']], support:2
pattern:[['10198.0']], support:2
pattern:[['11345.0']], support:2
pattern:[['14070.0']], support:2
pattern:[['14746.0']], support:2
pattern:[['14518.0']], support:2
pattern:[['14518.0', '30248.0']], support:2
pattern:[['14117.0']], support:2
pattern:[['14420.0']], support:2
pattern:[['14321.0']], support:2
pattern:[['14770.0']], support:2
pattern:[['14770.0', '30248.0']], support:2
pattern:[['14726.0']], support:2
pattern:[['11165.0']], support:2
pattern:[['14728.0']], support:2
pattern:[['11260.0']], support:2
pattern:[['15649.0']], support:2
pattern:[['10120.0']], support:2
pattern:[['14724.0']], support:2
pattern:[['14724.0', '30248.0']], support:2
pattern:[['14724.0', '15012.0']], support:2
pa

pattern:[['14281.0', '30170.0']], support:2
pattern:[['14281.0', '15590.0']], support:2
pattern:[['14281.0', '15038.0']], support:2
pattern:[['14281.0', '14362.0']], support:2
pattern:[['14281.0', '14362.0', '30248.0']], support:2
pattern:[['14281.0', '14362.0', '15094.0']], support:2
pattern:[['14281.0', '14362.0', '15094.0', '30248.0']], support:2
pattern:[['14281.0', '14475.0']], support:3
pattern:[['14281.0', '14475.0', '30248.0']], support:3
pattern:[['14281.0', '14838.0']], support:3
pattern:[['14281.0', '14838.0', '15094.0']], support:2
pattern:[['14281.0', '14838.0', '15094.0', '30248.0']], support:2
pattern:[['14281.0', '14838.0', '30248.0']], support:2
pattern:[['14281.0', '15631.0']], support:3
pattern:[['14281.0', '15052.0']], support:4
pattern:[['14281.0', '15052.0', '30248.0']], support:3
pattern:[['14281.0', '15012.0']], support:5
pattern:[['14281.0', '15012.0', '30248.0']], support:2
pattern:[['14281.0', '15012.0', '15094.0']], support:3
pattern:[['14281.0', '15092.0']]

pattern:[['15012.0', '30192.0']], support:2
pattern:[['15012.0', '30833.0']], support:2
pattern:[['15012.0', '15063.0']], support:2
pattern:[['15012.0', '15045.0']], support:3
pattern:[['15012.0', '15066.0']], support:4
pattern:[['15012.0', '15066.0', '30248.0']], support:2
pattern:[['15012.0', '15067.0']], support:4
pattern:[['15012.0', '15067.0', '30248.0']], support:3
pattern:[['15012.0', '15026.0']], support:4
pattern:[['15012.0', '15038.0']], support:4
pattern:[['15012.0', '30170.0']], support:5
pattern:[['15012.0', '30170.0', '30248.0']], support:4
pattern:[['15012.0', '15092.0']], support:5
pattern:[['15012.0', '15092.0', '30248.0']], support:3
pattern:[['15012.0', '15590.0']], support:6
pattern:[['15012.0', '15590.0', '30248.0']], support:5
pattern:[['15012.0', '15631.0']], support:10
pattern:[['15012.0', '15052.0']], support:14
pattern:[['15012.0', '15052.0', '15094.0']], support:4
pattern:[['15012.0', '15052.0', '30248.0']], support:6
pattern:[['15012.0', '15039.0']], support

pattern:[['14167.0']], support:8
pattern:[['14121.0']], support:8
pattern:[['11094.0']], support:8
pattern:[['11327.0']], support:8
pattern:[['14752.0']], support:8
pattern:[['14110.0']], support:8
pattern:[['11106.0']], support:8
pattern:[['14067.0']], support:8
pattern:[['14444.0']], support:8
pattern:[['10161.0']], support:8
pattern:[['14245.0']], support:8
pattern:[['15632.0']], support:8
pattern:[['14395.0']], support:8
pattern:[['14652.0']], support:8
pattern:[['14145.0']], support:9
pattern:[['14445.0']], support:9
pattern:[['11123.0']], support:9
pattern:[['11224.0']], support:9
pattern:[['14357.0']], support:9
pattern:[['14443.0']], support:9
pattern:[['10199.0']], support:9
pattern:[['15074.0']], support:9
pattern:[['10652.0']], support:9
pattern:[['14651.0']], support:9
pattern:[['31027.0']], support:9
pattern:[['14102.0']], support:9
pattern:[['14152.0']], support:9
pattern:[['14440.0']], support:9
pattern:[['14440.0', '30248.0']], support:8
pattern:[['14071.0']], support:9