In [1]:
# PrefixSpan

import sys
import pandas as pd
import numpy as np

# Marker item used by the PrefixSpan routines in this file: when a sequence is
# projected and the matched item is not the last item of its element, the
# remainder of that element is kept with the matched item replaced by this
# marker. A pattern whose first element starts with it is merged into the
# previous element when appended to another pattern.
PLACE_HOLDER = '_'

def input():
    """Load 'trade.csv' and build a uid-by-pluno purchase table.

    NOTE(review): this name shadows the builtin ``input``; it is kept because
    the script entry point calls the loader by this name.

    The steps, as written below: read four columns, parse ``sldat`` into a
    timestamp, rank every ``vipno``'s rows by timestamp (newest row gets
    rank 1), keep the rows whose rank is at most 60% of the group's highest
    rank, then pivot so that each ``uid`` is a row and each ``pluno`` a column.

    Returns:
        The pivoted DataFrame with missing cells replaced by 0.
    """

    # file input: only these four columns are read from the CSV
    data = pd.read_csv('trade.csv', usecols=['uid', 'vipno', 'sldat', 'pluno'])
    data['timestamp'] = pd.to_datetime(data['sldat'])

    # sort by customer, then chronologically (both ascending)
    data.sort_values(['vipno','timestamp'],ascending=[1,1],inplace=True) 

    # rank rows inside each vipno group; ascending=0 ranks the latest
    # timestamp as 1, and method='first' breaks ties by row order
    data['rank'] = data['timestamp'].groupby(data['vipno']).rank(ascending=0,method='first')

    # keep, per group, the rows with rank <= 60% of the group's maximum rank,
    # i.e. the most recent part of every customer's history
    grouped = data.groupby(['vipno'], as_index = True).apply(lambda x: x[x['rank'] <= (0.6 * x['rank'].max())])

    # drop helper/grouping columns and flatten the index left by groupby.apply
    # NOTE(review): dropping 'vipno' assumes apply() keeps the grouping column
    # in the result, which depends on the pandas version - confirm.
    data_set = grouped.drop(['rank', 'timestamp', 'sldat', 'vipno'], axis=1).reset_index(drop=True)

    # merge by uid: 'value' duplicates 'pluno', presumably so that a filled
    # cell of the pivot carries the item code itself - TODO confirm
    data_set['value'] = data_set['pluno']
    # NOTE(review): the frame itself is passed as pivot_table's first
    # positional argument (``values``); this looks like it was meant to be
    # values='value' - verify against the pandas version in use.
    data_set = data_set.pivot_table(data_set, index=['uid'], columns=['pluno'])

    # cells with no purchase become 0
    return data_set.fillna(0)

def input_new():
    """Load 'trade_new.csv' and build a uid-by-pluno purchase table.

    Same pipeline as ``input`` except for the file name and the name of the
    time column (``sldatime`` here): read four columns, parse the time column
    into a timestamp, rank every ``vipno``'s rows by timestamp (newest row
    gets rank 1), keep the rows whose rank is at most 60% of the group's
    highest rank, then pivot so that each ``uid`` is a row and each ``pluno``
    a column.

    Returns:
        The pivoted DataFrame with missing cells replaced by 0.
    """

    # file input: only these four columns are read from the CSV
    data = pd.read_csv('trade_new.csv', usecols=['uid', 'vipno', 'sldatime', 'pluno'])
    data['timestamp'] = pd.to_datetime(data['sldatime'])

    # sort by customer, then chronologically (both ascending)
    data.sort_values(['vipno','timestamp'],ascending=[1,1],inplace=True) 

    # rank rows inside each vipno group; ascending=0 ranks the latest
    # timestamp as 1, and method='first' breaks ties by row order
    data['rank'] = data['timestamp'].groupby(data['vipno']).rank(ascending=0,method='first')

    # keep, per group, the rows with rank <= 60% of the group's maximum rank,
    # i.e. the most recent part of every customer's history
    grouped = data.groupby(['vipno'], as_index = True).apply(lambda x: x[x['rank'] <= (0.6 * x['rank'].max())])

    # drop helper/grouping columns and flatten the index left by groupby.apply
    # NOTE(review): dropping 'vipno' assumes apply() keeps the grouping column
    # in the result, which depends on the pandas version - confirm.
    data_set = grouped.drop(['rank', 'timestamp', 'sldatime', 'vipno'], axis=1).reset_index(drop=True)

    # merge by uid: 'value' duplicates 'pluno', presumably so that a filled
    # cell of the pivot carries the item code itself - TODO confirm
    data_set['value'] = data_set['pluno']
    # NOTE(review): the frame itself is passed as pivot_table's first
    # positional argument (``values``); this looks like it was meant to be
    # values='value' - verify against the pandas version in use.
    data_set = data_set.pivot_table(data_set, index=['uid'], columns=['pluno'])

    # cells with no purchase become 0
    return data_set.fillna(0)

def createInitSet(data_set):
    """Convert a pivoted purchase table into PrefixSpan input sequences.

    Every row of *data_set* becomes one sequence holding a single element:
    the list of that row's non-zero cell values, each converted with ``str``.
    A row made only of zeros yields a sequence with one empty element.

    Args:
        data_set: a pandas DataFrame (anything exposing a 2-D ``.values``).

    Returns:
        A list of sequences, i.e. a list of ``[[item, item, ...]]`` lists.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; ``.values`` returns the
    # same 2-D array and is available in both old and current releases.
    data_array = data_set.values

    data_dok = []
    for row in data_array:
        # clear all 0: zero cells mean "item not present" and are skipped
        element = [str(x) for x in row if x != 0.0]
        data_dok.append([element])

    return data_dok


In [2]:
class SquencePattern:
    """A sequential pattern together with its support count.

    ``squence`` is a list of elements and each element is a list of items;
    ``support`` is the count associated with the pattern.
    """

    def __init__(self, squence, support):
        """Store a per-element copy of *squence* and the given *support*."""
        # Copy every element so that growing this pattern never touches the
        # lists owned by the caller.
        self.squence = [list(element) for element in squence]
        self.support = support

    def append(self, p):
        """Extend this pattern in place with pattern *p*.

        If the first element of *p* starts with PLACE_HOLDER, the remaining
        items of that element are merged into this pattern's last element and
        any further elements of *p* are appended after it. Otherwise all
        elements of *p* are appended as new elements. The support becomes the
        smaller of the two supports.

        *p* is not modified and no list is shared between the two patterns.
        """
        # Work on copies: the previous version removed the placeholder from
        # p's own first element and reused p's inner lists.
        incoming = [list(element) for element in p.squence]

        if incoming and incoming[0] and incoming[0][0] == PLACE_HOLDER:
            # Placeholder sits at index 0, so dropping it is a slice.
            self.squence[-1].extend(incoming[0][1:])
            self.squence.extend(incoming[1:])
        else:
            self.squence.extend(incoming)

        self.support = min(self.support, p.support)


def prefixSpan(pattern, S, threshold):
    """Recursively collect every frequent pattern that extends *pattern*.

    For each frequent extension reported by ``frequent_items`` a grown copy of
    *pattern* is recorded, *S* is projected on it, and the search continues on
    the projected database.

    Args:
        pattern: the SquencePattern used as current prefix.
        S: the (projected) sequence database for that prefix.
        threshold: minimum support an extension must reach.

    Returns:
        A list of SquencePattern objects in depth-first discovery order.
    """
    found = []

    for extension in frequent_items(S, pattern, threshold):
        # Grow a fresh copy of the prefix by one frequent item.
        grown = SquencePattern(pattern.squence, pattern.support)
        grown.append(extension)
        found.append(grown)

        # Recurse on the database projected on the grown prefix.
        projected = build_projected_database(S, grown)
        found += prefixSpan(grown, projected, threshold)

    return found


def frequent_items(S, pattern, threshold):
    """Return the frequent one-item extensions of *pattern* within *S*.

    Two kinds of extension are counted:

    * items that can join the last element of *pattern* (reported as
      ``[[PLACE_HOLDER, item]]``), found in the first element of a sequence
      either after the pattern's last item or after a leading PLACE_HOLDER;
    * items that can start a new element (reported as ``[[item]]``), counted
      at most once per sequence.

    Args:
        S: sequence database, a list of sequences (lists of item lists).
        pattern: the SquencePattern used as current prefix.
        threshold: minimum count an item needs to be reported.

    Returns:
        A list of SquencePattern objects sorted by ascending support; an
        empty list when *S* is None or empty.
    """
    if not S:
        return []

    last_e = pattern.squence[-1] if pattern.squence else []

    items = {}   # item -> number of sequences where it can open a new element
    _items = {}  # item -> count as an extension of the pattern's last element

    for s in S:
        if not s:
            # Nothing to count in an empty sequence.
            continue
        first = s[0]

        # class 1: the first element contains the whole last element of the
        # pattern; every item after the pattern's last item can extend it.
        if last_e and all(item in first for item in last_e):
            index = first.index(last_e[-1])
            for item in first[index + 1:]:
                _items[item] = _items.get(item, 0) + 1

        # class 2: the first element is the placeholder-led remainder of a
        # matched element; its items extend the pattern's last element and it
        # is excluded from the new-element count below.
        if PLACE_HOLDER in first:
            for item in first[1:]:
                _items[item] = _items.get(item, 0) + 1
            s = s[1:]

        # class 3: items that can start a new element, once per sequence.
        # A set keeps the membership test constant-time.
        counted = set()
        for element in s:
            for item in element:
                if item not in counted:
                    counted.add(item)
                    items[item] = items.get(item, 0) + 1

    # dict.items() works on Python 2 and 3 (iteritems() is Python 2 only).
    f_list = [SquencePattern([[PLACE_HOLDER, k]], v)
              for k, v in _items.items()
              if v >= threshold]
    f_list.extend(SquencePattern([[k]], v)
                  for k, v in items.items()
                  if v >= threshold)

    return sorted(f_list, key=lambda p: p.support)
    


def build_projected_database(S, pattern):
    """Project the sequence database *S* on the last element of *pattern*.

    S is assumed to already be the projected database of the pattern's
    prefix, so only the last element of the pattern has to be matched.

    For every sequence the first matching element is located:

    * an element without PLACE_HOLDER matches when it contains every item of
      the pattern's last element;
    * an element with PLACE_HOLDER matches when it contains the pattern's
      last item and the pattern's last element has more than one item.

    If the matched item is the last item of its element, the projection is
    everything after that element. Otherwise the projection starts with the
    remainder of the element, with the matched item replaced by PLACE_HOLDER.
    Sequences whose projection is empty are dropped.

    Args:
        S: list of sequences (each a list of item lists); not modified.
        pattern: SquencePattern with at least one non-empty element.

    Returns:
        The list of projected sequences.
    """
    last_e = pattern.squence[-1]
    last_item = last_e[-1]
    multi_item_tail = len(last_e) > 1

    p_S = []
    for s in S:
        p_s = []
        # enumerate gives the element position directly, no rescan needed.
        for e_index, element in enumerate(s):
            if PLACE_HOLDER in element:
                is_prefix = multi_item_tail and last_item in element
            else:
                is_prefix = all(item in element for item in last_e)

            if not is_prefix:
                continue

            i_index = element.index(last_item)
            if i_index == len(element) - 1:
                # Matched item closes the element: keep what follows it.
                p_s = s[e_index + 1:]
            else:
                # Keep the rest of the element, marking the consumed part.
                p_s = s[e_index:]
                e = element[i_index:]
                e[0] = PLACE_HOLDER
                p_s[0] = e
            break  # only the first matching element is used

        if p_s:
            p_S.append(p_s)

    return p_S


def print_patterns(patterns):
    """Print one line per pattern showing its sequence and its support."""
    template = "pattern:{0}, support:{1}"
    for found in patterns:
        line = template.format(found.squence, found.support)
        print(line)



In [3]:
if __name__ == "__main__":

    # Mine both data sets, one after the other, with the same sweep of
    # minimum-support thresholds.
    for loader in (input, input_new):
        S = createInitSet(loader())

        for min_support in [2, 4, 8, 16, 32, 64]:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("\nmin_support = %d: " % min_support)
            # The empty start pattern gets a huge support so that min() in
            # SquencePattern.append always takes the extension's support;
            # sys.maxsize exists on both Python 2 and 3 (sys.maxint does not).
            patterns = prefixSpan(SquencePattern([], sys.maxsize), S, min_support)
            print_patterns(patterns)


min_support = 2: 
pattern:[['11521012.0']], support:2
pattern:[['10113005.0']], support:2
pattern:[['10113005.0', '27000582.0']], support:2
pattern:[['27400118.0']], support:2
pattern:[['14091007.0']], support:2
pattern:[['14091009.0']], support:2
pattern:[['14081006.0']], support:2
pattern:[['14839016.0']], support:2
pattern:[['27002559.0']], support:2
pattern:[['22040157.0']], support:2
pattern:[['14132170.0']], support:2
pattern:[['14200040.0']], support:2
pattern:[['14200040.0', '30380002.0']], support:2
pattern:[['10501076.0']], support:2
pattern:[['15202020.0']], support:2
pattern:[['15209007.0']], support:2
pattern:[['14822002.0']], support:2
pattern:[['14847018.0']], support:2
pattern:[['14831007.0']], support:2
pattern:[['14831007.0', '25120036.0']], support:2
pattern:[['14831007.0', '14900007.0']], support:2
pattern:[['14831007.0', '14900007.0', '25120036.0']], support:2
pattern:[['20110014.0']], support:2
pattern:[['14015066.0']], support:2
pattern:[['14092042.0']], support

pattern:[['27300273.0', '30380003.0']], support:2
pattern:[['27300273.0', '27410007.0']], support:2
pattern:[['27300273.0', '30380002.0']], support:3
pattern:[['27300273.0', '27410000.0']], support:3
pattern:[['27300273.0', '27410005.0']], support:5
pattern:[['25111048.0']], support:60
pattern:[['25111048.0', '27300274.0']], support:2
pattern:[['25111048.0', '27100542.0']], support:2
pattern:[['25111048.0', '25120016.0']], support:2
pattern:[['25111048.0', '30380001.0']], support:2
pattern:[['25111048.0', '27200170.0']], support:3
pattern:[['25111048.0', '27200170.0', '30380002.0']], support:3
pattern:[['25111048.0', '27000573.0']], support:3
pattern:[['25111048.0', '27000573.0', '30380003.0']], support:2
pattern:[['25111048.0', '27410003.0']], support:3
pattern:[['25111048.0', '27410003.0', '30380002.0']], support:2
pattern:[['25111048.0', '27000576.0']], support:3
pattern:[['25111048.0', '27410001.0']], support:3
pattern:[['25111048.0', '27410001.0', '30380003.0']], support:2
pattern

pattern:[['23113019.0']], support:4
pattern:[['22103003.0']], support:4
pattern:[['24011018.0']], support:4
pattern:[['23112032.0']], support:4
pattern:[['40000700.0']], support:4
pattern:[['14750041.0']], support:4
pattern:[['22103008.0']], support:4
pattern:[['15113004.0']], support:4
pattern:[['22005000.0']], support:4
pattern:[['15503022.0']], support:4
pattern:[['22604000.0']], support:4
pattern:[['22630009.0']], support:4
pattern:[['14020005.0']], support:4
pattern:[['27002440.0']], support:4
pattern:[['22120005.0']], support:4
pattern:[['14802055.0']], support:4
pattern:[['27410007.0']], support:4
pattern:[['11110050.0']], support:4
pattern:[['14091028.0']], support:4
pattern:[['30323001.0']], support:4
pattern:[['10130010.0']], support:4
pattern:[['22172008.0']], support:4
pattern:[['10150004.0']], support:4
pattern:[['10130006.0']], support:4
pattern:[['15115000.0']], support:4
pattern:[['15022003.0']], support:4
pattern:[['22840002.0']], support:4
pattern:[['22111006.0']], su

pattern:[['14860017.0']], support:16
pattern:[['27410001.0']], support:16
pattern:[['27002555.0']], support:16
pattern:[['23113027.0']], support:16
pattern:[['22601000.0']], support:16
pattern:[['22030005.0']], support:16
pattern:[['30380001.0']], support:17
pattern:[['25101011.0']], support:17
pattern:[['22002239.0']], support:17
pattern:[['23134003.0']], support:18
pattern:[['22100010.0']], support:18
pattern:[['27400855.0']], support:18
pattern:[['22103005.0']], support:18
pattern:[['15114015.0']], support:18
pattern:[['15110001.0']], support:20
pattern:[['24101006.0']], support:21
pattern:[['15200007.0']], support:21
pattern:[['25101046.0']], support:22
pattern:[['27000573.0']], support:22
pattern:[['15110071.0']], support:22
pattern:[['23132061.0']], support:23
pattern:[['15130006.0']], support:23
pattern:[['22111004.0']], support:27
pattern:[['27100542.0']], support:29
pattern:[['23131002.0']], support:29
pattern:[['22500022.0']], support:29
pattern:[['25120036.0']], support:30
p

pattern:[['10300019.0', '15115030.0']], support:2
pattern:[['10300019.0', '15115030.0', '15130035.0']], support:2
pattern:[['10300019.0', '30380002.0']], support:2
pattern:[['10300019.0', '30380003.0']], support:2
pattern:[['10300019.0', '15130035.0']], support:2
pattern:[['24000388.0']], support:5
pattern:[['24000388.0', '24011096.0']], support:2
pattern:[['15115030.0']], support:6
pattern:[['15115030.0', '27410000.0']], support:2
pattern:[['15115030.0', '15119007.0']], support:2
pattern:[['15115030.0', '15119007.0', '30380002.0']], support:2
pattern:[['15115030.0', '15130035.0']], support:3
pattern:[['15115030.0', '15130035.0', '30380002.0']], support:2
pattern:[['15115030.0', '15130035.0', '27410000.0']], support:2
pattern:[['15115030.0', '15115031.0']], support:4
pattern:[['15115030.0', '15115031.0', '15130035.0']], support:2
pattern:[['15115030.0', '15115031.0', '15130035.0', '30380002.0']], support:2
pattern:[['15115030.0', '15115031.0', '15119007.0']], support:2
pattern:[['15115

pattern:[['22008021.0']], support:30
pattern:[['22008021.0', '22102014.0']], support:2
pattern:[['22008021.0', '22034000.0']], support:2
pattern:[['22008021.0', '27410003.0']], support:2
pattern:[['22008021.0', '27410003.0', '30380003.0']], support:2
pattern:[['22008021.0', '22034004.0']], support:2
pattern:[['22008021.0', '27002555.0']], support:2
pattern:[['22008021.0', '22021260.0']], support:2
pattern:[['22008021.0', '23110009.0']], support:3
pattern:[['22008021.0', '22036000.0']], support:4
pattern:[['22008021.0', '22036000.0', '30380003.0']], support:3
pattern:[['22008021.0', '30380002.0']], support:4
pattern:[['22008021.0', '30380003.0']], support:10
pattern:[['22701014.0']], support:30
pattern:[['22701014.0', '27400855.0']], support:2
pattern:[['22701014.0', '25101044.0']], support:2
pattern:[['22701014.0', '27410000.0']], support:2
pattern:[['22701014.0', '27410000.0', '30380002.0']], support:2
pattern:[['22701014.0', '27300274.0']], support:2
pattern:[['22701014.0', '27100542

pattern:[['22701003.0']], support:4
pattern:[['14092014.0']], support:4
pattern:[['22040157.0']], support:4
pattern:[['14822000.0']], support:4
pattern:[['22701009.0']], support:4
pattern:[['10152012.0']], support:4
pattern:[['14083004.0']], support:4
pattern:[['15110020.0']], support:4
pattern:[['15110010.0']], support:4
pattern:[['15130007.0']], support:4
pattern:[['14900008.0']], support:4
pattern:[['14000010.0']], support:4
pattern:[['14750057.0']], support:4
pattern:[['14091073.0']], support:4
pattern:[['14092067.0']], support:4
pattern:[['14092067.0', '30380002.0']], support:4
pattern:[['10119147.0']], support:4
pattern:[['20130000.0']], support:4
pattern:[['11300130.0']], support:4
pattern:[['22130018.0']], support:4
pattern:[['27000037.0']], support:4
pattern:[['22005006.0']], support:4
pattern:[['14073041.0']], support:4
pattern:[['15503022.0']], support:4
pattern:[['10310051.0']], support:4
pattern:[['11500027.0']], support:4
pattern:[['22021005.0']], support:4
pattern:[['103

pattern:[['14831007.0']], support:8
pattern:[['10141025.0']], support:8
pattern:[['22120005.0']], support:8
pattern:[['14130054.0']], support:8
pattern:[['22013022.0']], support:8
pattern:[['11056085.0']], support:8
pattern:[['27400708.0']], support:8
pattern:[['14013011.0']], support:8
pattern:[['15231006.0']], support:8
pattern:[['27002440.0']], support:8
pattern:[['14082017.0']], support:8
pattern:[['22008011.0']], support:8
pattern:[['27002560.0']], support:8
pattern:[['15115001.0']], support:8
pattern:[['14000001.0']], support:8
pattern:[['22130011.0']], support:8
pattern:[['22630010.0']], support:8
pattern:[['15130013.0']], support:8
pattern:[['15116001.0']], support:8
pattern:[['15110030.0']], support:9
pattern:[['22001001.0']], support:9
pattern:[['25121003.0']], support:9
pattern:[['10150004.0']], support:9
pattern:[['15115028.0']], support:9
pattern:[['15113000.0']], support:9
pattern:[['15110033.0']], support:9
pattern:[['11110004.0']], support:9
pattern:[['22030008.0']], su

pattern:[['27410002.0']], support:21
pattern:[['27200938.0']], support:21
pattern:[['22100000.0']], support:21
pattern:[['15202044.0']], support:21
pattern:[['22101000.0']], support:22
pattern:[['15115003.0']], support:22
pattern:[['22002239.0']], support:22
pattern:[['22020006.0']], support:23
pattern:[['27400855.0']], support:23
pattern:[['15232001.0']], support:23
pattern:[['15130009.0']], support:24
pattern:[['22111012.0']], support:25
pattern:[['22030011.0']], support:25
pattern:[['22103001.0']], support:26
pattern:[['24101006.0']], support:26
pattern:[['22103005.0']], support:26
pattern:[['15130035.0']], support:26
pattern:[['22008019.0']], support:26
pattern:[['27000574.0']], support:27
pattern:[['22102000.0']], support:27
pattern:[['22002240.0']], support:27
pattern:[['27410044.0']], support:27
pattern:[['21021043.0']], support:28
pattern:[['30380001.0']], support:28
pattern:[['22008020.0']], support:28
pattern:[['15114015.0']], support:29
pattern:[['22008021.0']], support:30
p