In [1]:
# -*- coding: utf-8 -*-

import sys
import pandas as pd
import numpy as np

PLACE_HOLDER = '_'

def input_train():

    # file input
    data = pd.read_csv('trade_new.csv', usecols=['uid', 'vipno', 'sldatime', 'pluno'])
    data['timestamp'] = pd.to_datetime(data['sldatime'])

    # sort 
    data.sort_values(['vipno','timestamp'],ascending=[1,1],inplace=True) 

    # make groups 
    data['rank'] = data['timestamp'].groupby(data['vipno']).rank(ascending=0,method='first')

    # take top 60% in every group
    grouped = data.groupby(['vipno'], as_index = True).apply(lambda x: x[x['rank'] <= (0.6 * x['rank'].max())])

    # convert
    data_set = grouped.drop(['rank', 'timestamp', 'sldatime', 'vipno'], axis=1).reset_index(drop=True)

    # merge by uid
    data_set['value'] = data_set['pluno']
    data_set = data_set.pivot_table(data_set, index=['uid'], columns=['pluno'])

    # add timestamp and vipno
    data_extra = grouped.drop(['rank', 'sldatime', 'pluno'], axis=1).drop_duplicates('uid').set_index('uid')
    data_set = pd.concat([data_set,data_extra], axis=1, join='inner').reset_index(drop=True).sort_values(['vipno','timestamp']).drop(['timestamp'], axis=1)

    return data_set.fillna(0)

def input_test():

    # file input
    data = pd.read_csv('trade_new.csv', usecols=['uid', 'vipno', 'sldatime', 'pluno'])
    data['timestamp'] = pd.to_datetime(data['sldatime'])

    # sort 
    data.sort_values(['vipno','timestamp'],ascending=[1,1],inplace=True) 

    # make groups 
    data['rank'] = data['timestamp'].groupby(data['vipno']).rank(ascending=0,method='first')

    # take top 60% in every group
    grouped = data.groupby(['vipno'], as_index = True).apply(lambda x: x[x['rank'] > (0.4 * x['rank'].max())])

    # convert
    data_set = grouped.drop(['rank', 'timestamp', 'sldatime', 'vipno'], axis=1).reset_index(drop=True)

    # merge by uid
    data_set['value'] = data_set['pluno']
    data_set = data_set.pivot_table(data_set, index=['uid'], columns=['pluno'])

    # add timestamp and vipno
    data_extra = grouped.drop(['rank', 'sldatime', 'pluno'], axis=1).drop_duplicates('uid').set_index('uid')
    data_set = pd.concat([data_set,data_extra], axis=1, join='inner').reset_index(drop=True).sort_values(['vipno','timestamp']).drop(['timestamp'], axis=1)

    return data_set.fillna(0)

def createInitSet(data_set):  

    # merge by vipno
    data_dok = []
    last_vipno = 0
    s = []
    data_array = data_set.drop(['vipno'], axis=1).as_matrix()
    vipno_array = data_set['vipno'].as_matrix()

    for i in range(0, data_set.shape[0]):
        if last_vipno == 0:
            last_vipno = vipno_array[i]
        elif last_vipno == vipno_array[i]:
            s.append([str(x) for x in data_array[i] if x != 0.0])
        else:
            data_dok.append(s)
            s = []
            s.append([str(x) for x in data_array[i] if x != 0.0])
            last_vipno = vipno_array[i]

    return data_dok

In [2]:
class SquencePattern:
    # init
    def __init__(self, squence, support):
        self.squence = []
        for s in squence:
            self.squence.append(list(s))
        self.support = support

    # add
    def append(self, p):
        if p.squence[0][0] == PLACE_HOLDER:
            first_e = p.squence[0]
            first_e.remove(PLACE_HOLDER)
            self.squence[-1].extend(first_e)
            self.squence.extend(p.squence[1:])
        else:
            self.squence.extend(p.squence)
        self.support = min(self.support, p.support)


def prefixSpan(pattern, S, threshold):
    patterns = []
    f_list = frequent_items(S, pattern, threshold)
	
    for i in f_list:
        # make patterns array
        p = SquencePattern(pattern.squence, pattern.support)
        p.append(i)
        patterns.append(p)
        
        # build a 'db' for query
        p_S = build_projected_database(S, p)
        p_patterns = prefixSpan(p, p_S, threshold)
        # grow
        patterns.extend(p_patterns)

    return patterns


def frequent_items(S, pattern, threshold):
    items = {}
    _items = {}
    f_list = []
    if S is None or len(S) == 0:
        return []

    if len(pattern.squence) != 0:
        last_e = pattern.squence[-1]
    else:
        last_e = []
    for s in S:
        #class 1
        is_prefix = True
        for item in last_e:
            if item not in s[0]:
                is_prefix = False
                break
        if is_prefix and len(last_e) > 0:
            index = s[0].index(last_e[-1])
            if index < len(s[0]) - 1:
                for item in s[0][index + 1:]:
                    if item in _items:
                        _items[item] += 1
                    else:
                        _items[item] = 1

        #class 2
        if PLACE_HOLDER in s[0]:
            for item in s[0][1:]:
                if item in _items:
                    _items[item] += 1
                else:
                    _items[item] = 1
            s = s[1:]

        #class 3
        counted = []
        for element in s:
            for item in element:
                if item not in counted:
                    counted.append(item)
                    if item in items:
                        items[item] += 1
                    else:
                        items[item] = 1

    f_list.extend([SquencePattern([[PLACE_HOLDER, k]], v)
                    for k, v in _items.iteritems()
                    if v >= threshold])
    f_list.extend([SquencePattern([[k]], v)
                   for k, v in items.iteritems()
                   if v >= threshold])
    sorted_list = sorted(f_list, key=lambda p: p.support)
    return sorted_list  
    


def build_projected_database(S, pattern):
    """
    suppose S is projected database base on pattern's prefix,
    so we only need to use the last element in pattern to
    build projected database
    """
    p_S = []
    last_e = pattern.squence[-1]
    last_item = last_e[-1]
    for s in S:
        p_s = []
        for element in s:
            is_prefix = False
            if PLACE_HOLDER in element:
                if last_item in element and len(pattern.squence[-1]) > 1:
                    is_prefix = True
            else:
                is_prefix = True
                for item in last_e:
                    if item not in element:
                        is_prefix = False
                        break

            if is_prefix:
                e_index = s.index(element)
                i_index = element.index(last_item)
                if i_index == len(element) - 1:
                    p_s = s[e_index + 1:]
                else:
                    p_s = s[e_index:]
                    index = element.index(last_item)
                    e = element[i_index:]
                    e[0] = PLACE_HOLDER
                    p_s[0] = e
                break
        if len(p_s) != 0:
            p_S.append(p_s)

    return p_S


def print_patterns(patterns):
    for p in patterns:
        print("pattern:{0}, support:{1}".format(p.squence, p.support))



In [3]:
def predict(patterns, data):

    results = []

    # use FP which has a sequenece longer than 1
    pattern_array = []
    for p in patterns:
        if len(p.squence) > 1:
            pattern_array.append([p.squence, p.support])

    pattern_list = pd.DataFrame(pattern_array, columns=['squence','support']).sort_values('support', ascending=0)

    # use the latest bought item to match a best(?) pattern and record the result
    # it's slow with 2 'for's
    for row in data:
        latest =  row[-1]
        best_pattern = [] 

        for index, row in pattern_list.iterrows():
            # just take the first one to improve performance
            if ((row[0])[0])[0] in latest:
                best_pattern = row[0]
                break
        if best_pattern == []:
            best_pattern.append(latest)
            best_pattern.append(latest)
        results.append(best_pattern[1] if best_pattern != [] else [])

    return results

def validate(result, validate_set):

    accuracy = 0
    count = 0.0
    total = len(result)
    next_buy_list = []

    # check if result set has item in the validate_set
    for record in range(0, total):
        for item in result[record]:
            if item in (validate_set[record])[0] and result[record] != []:
                count += 1
                break

    accuracy = count/total
    return accuracy


In [4]:
if __name__ == "__main__":
    
    S = createInitSet(input_train())

    train_patterns = prefixSpan(SquencePattern([], sys.maxint), S, 4)

    results = predict(train_patterns, S)

    print "Predict result:\n"

    for result in results:
        print result 

    validate_set = createInitSet(input_test())

    print "Accuracy of the prediction:",  validate(results, validate_set)*100, "%"

Predict result:

['30380003.0']
['10450048.0']
['30380003.0']
['30380003.0']
['22034001.0']
['30380003.0']
['10401049.0', '14900005.0']
['30380003.0']
['15110001.0']
['30380003.0']
['30380003.0']
['10136007.0', '10200035.0', '14075007.0']
['30380003.0']
['22102005.0']
['30380003.0']
['30380003.0']
['30380003.0']
['30380003.0']
['14836013.0', '27300275.0']
['11104009.0', '14121043.0', '14830022.0', '14912001.0', '15115010.0', '15200008.0', '27410005.0']
['30380003.0']
['15200007.0']
['30380003.0']
['22036000.0']
['30380003.0']
['14080020.0', '14121010.0', '14836018.0']
['10136001.0', '10136010.0', '15140008.0']
['10008002.0', '10150004.0', '10151002.0', '10435042.0', '14092014.0', '14092015.0', '14092017.0', '14722015.0', '14733015.0', '15000024.0']
['30380003.0']
['30710041.0', '30710046.0']
['24000370.0']
['30380003.0']
['30380003.0']
['30380003.0']
['30380003.0']
['30380003.0']
['27000582.0']
['30380003.0']
['30380003.0']
['22008009.0', '22008022.0']
['30380003.0']
['30380003.0']
['3

Accuracy of the prediction: 11.3871635611 %
