In [1]:
from pandas import *
import numpy as np
from datetime import datetime, timedelta

In [2]:
# helpers
def format_purchase_data(coupon_detail_train):
    """Group purchased coupon IDs by user.

    coupon_detail_train: DataFrame with 'USER_ID_hash' and 'COUPON_ID_hash'
        columns, one row per purchase.

    Returns a dict mapping each user ID to the list of coupon IDs on that
    user's rows, in row order (duplicates are kept).
    """
    user_purchase = {}
    # Walk the two columns positionally instead of doing a label lookup per
    # row: this avoids the removed `.ix` indexer and is unaffected by a
    # non-unique index (e.g. after concatenating or filtering frames).
    pairs = zip(coupon_detail_train['USER_ID_hash'],
                coupon_detail_train['COUPON_ID_hash'])
    for user_id, coupon_id in pairs:
        user_purchase.setdefault(user_id, []).append(coupon_id)
    return user_purchase

def get_accuracy(pred, actual):
    '''
    Print and return the fraction of predictions that match exactly.

    pred, actual: dicts mapping a key to an iterable of items (the format
        produced by format_purchase_data). A prediction counts as correct
        only when its key is present in `actual` and both sides contain the
        same set of items (order and duplicates are ignored).

    Returns the accuracy as a float; 0.0 when `pred` is empty.
    '''
    n = len(pred)
    c = 0
    for k, v_pred in pred.items():
        v_actual = actual.get(k)
        if v_actual is not None and set(v_actual) == set(v_pred):
            c += 1
    # Guard the empty case: dividing by n == 0 used to raise.
    acc = float(c) / n if n else 0.0
    # Single-argument parenthesised print is valid on Python 2 and 3 alike.
    print('Total predictions: %d' % n)
    print('Correct predictions: %d' % c)
    print('Accuracy: %.4f' % acc)
    return acc

In [3]:
# load data
# Every path is relative to the notebook's working directory.

# users; parse_dates=[0] asks pandas to parse the first column as dates
users = read_csv('user_list.csv', parse_dates=[0])

# coupon lists
lists_train = read_csv('coupon_list_train.csv')
lists_test = read_csv('coupon_list_test.csv')

# coupon areas
area_train = read_csv('coupon_area_train.csv')
area_test = read_csv('coupon_area_test.csv')

# visit log
visit_train = read_csv('coupon_visit_train.csv')

# purchase detail, split into a local train / hold-out pair on I_DATE.
# NOTE(review): I_DATE is not date-parsed on load, so this is presumably a
# lexicographic string comparison -- confirm the column is ISO formatted.
coupon_detail_train = read_csv('coupon_detail_train.csv')
coupon_detail_train_train = coupon_detail_train.loc[coupon_detail_train['I_DATE'] < '2012-06-16']
coupon_detail_train_test = coupon_detail_train.loc[coupon_detail_train['I_DATE'] >= '2012-06-16']

# the same two splits as {user id: [coupon ids]} dicts
coupon_detail_train_train_fmt = format_purchase_data(coupon_detail_train_train)
coupon_detail_train_test_fmt = format_purchase_data(coupon_detail_train_test)

In [4]:
# normalize features of coupons
# Numeric columns that get a scaled '<name>_norm' copy.
num_vars = [
    'PRICE_RATE',
    'CATALOG_PRICE',
    'DISPPERIOD',
    'VALIDPERIOD',
]
# Usable-day flag columns, used as-is.
bin_vars = [
    'USABLE_DATE_MON',
    'USABLE_DATE_TUE',
    'USABLE_DATE_WED',
    'USABLE_DATE_THU',
    'USABLE_DATE_FRI',
    'USABLE_DATE_SAT',
    'USABLE_DATE_SUN',
    'USABLE_DATE_HOLIDAY',
    'USABLE_DATE_BEFORE_HOLIDAY',
]
# Scale each numeric column by its standard deviation (no centering) so the
# columns contribute on a comparable scale to the similarity computation.
for var in num_vars:
    lists_train[var + '_norm'] = lists_train[var].astype(float) / np.std(lists_train[var])

# Full feature set fed to the similarity functions.
fit_vars = bin_vars + [x + '_norm' for x in num_vars]

# Missing feature values are treated as 0. `.loc` replaces the `.ix`
# indexer, which was deprecated and later removed from pandas.
lists_train.loc[:, fit_vars] = lists_train.loc[:, fit_vars].fillna(0)

In [5]:
# calculate distance
def _mod(x):
    return np.power(np.power(x, 2).sum(), .5)

def cosine(x, y):
    """Cosine similarity of x and y: their dot product over the product of lengths.

    x, y: array-likes of numbers with matching shape.
    There is no guard against a zero-length vector (zero denominator).
    """
    vec_a = np.array(x)
    vec_b = np.array(y)

    dot = (vec_a * vec_b).sum()
    lengths = _mod(vec_a) * _mod(vec_b)
    return dot / lengths
    
    
def calc_sim(x, y, sim_func=cosine):
    '''
    Score how similar two feature vectors are.

    x, y: vector of numeric features
    sim_func: callable taking (x, y) and returning the score; cosine by default
    '''
    return sim_func(x, y)

def get_top_items(x, X, topn=None, sim_func=cosine):
    '''
    Rank the rows of X by similarity to x and return the best `topn`.

    x: vector of numeric features for the query item
    X: sequence of rows; in each row the last element is the item ID and
       everything before it is that item's feature vector
    topn: number of items to return; defaults to len(X) - 1. Values <= 0
          yield an empty list.
    sim_func: similarity function passed through to calc_sim

    Returns a list of (item ID, similarity) tuples in ascending order of
    similarity (the most similar item is last). Rows whose similarity is
    NaN are left out.
    '''
    if topn is None:
        topn = len(X) - 1

    # Keep IDs and scores in two parallel lists so that dropping a NaN row
    # cannot shift the scores out of step with the IDs (indexing back into
    # X by score position paired scores with the wrong rows).
    item_ids = []
    sims = []
    for row in X:
        sim = calc_sim(x, row[:-1], sim_func=sim_func)  # last element is the ID
        if np.isnan(sim):
            continue
        item_ids.append(row[-1])
        sims.append(sim)

    order = np.argsort(sims)
    # Explicit start index: a plain [-topn:] slice returns everything when
    # topn is 0.
    start = max(len(order) - topn, 0)
    return [(item_ids[i], sims[i]) for i in order[start:]]

In [6]:
# Similarity of the first two coupons; `.loc` replaces the removed `.ix` indexer.
calc_sim(lists_train.loc[0, fit_vars], lists_train.loc[1, fit_vars])

0.96263281306158266

In [12]:
# Ten coupons most similar to coupon `idx`, excluding the coupon itself.
# `.loc` replaces the removed `.ix` indexer.
idx = 100
a = get_top_items(lists_train.loc[idx, fit_vars], lists_train.drop(idx).loc[:, fit_vars + ['COUPON_ID_hash']].values, topn=10)
a

[('4fe8384eaa70c88b673d33ac0e842f45', 0.99248322354488394),
 ('50c1741fe466fb183cdefe63cf3ee8f3', 0.99248322354488394),
 ('d1e91d549b5262d9acd8cb3d3f7c8405', 0.99302897810978308),
 ('d2f902c5482df9bcee095929fca87c86', 0.99477506731066645),
 ('1c00074b1310d491cb80a4edeb2019fe', 0.99673905652384898),
 ('8296fc321680ccfca77284d20bc1251e', 0.99821537425465701),
 ('f4b8c261bf98436f067e9d0f90172d25', 0.99830098111535792),
 ('61ad0253ea426a710219be23b3dc4ac6', 0.99869526288058608),
 ('5facc2c8bcb02e93e59c81685d8ea81c', 0.99990196609775461),
 ('1ede1e9d563ca676c356af05911686fc', 0.99993289886369718)]