In [214]:
from pandas import *
import numpy as np
from scipy.sparse import csr_matrix, coo_matrix

In [102]:
# helpers
def trans_sparse_matrix(coupon_detail_train, users, lists):
    '''
    Build a sparse purchase-count matrix with coupons as rows and users
    as columns.

    coupon_detail_train: DataFrame of purchases with COUPON_ID_hash and
        USER_ID_hash columns (one row per purchase).
    users: DataFrame with a USER_ID_hash column; column j of the result
        corresponds to the j-th row of ``users``.
    lists: DataFrame with a COUPON_ID_hash column; row i of the result
        corresponds to the i-th row of ``lists``.

    Returns a scipy CSR matrix of shape (len(lists), len(users)) whose
    entry (i, j) is the number of purchase rows for that coupon/user pair.
    Purchase rows whose coupon or user id is not present in ``lists`` /
    ``users`` are ignored.
    '''
    # map each id to its position in the corresponding table
    # (on duplicate ids the last position wins)
    users_hashtable = {x: i for i, x in enumerate(users.USER_ID_hash)}
    # use the ``lists`` argument, not a notebook-level global
    coupon_hashtable = {x: i for i, x in enumerate(lists.COUPON_ID_hash)}

    # ids missing from the lookup tables become NaN here
    row = coupon_detail_train.COUPON_ID_hash.map(coupon_hashtable)
    column = coupon_detail_train.USER_ID_hash.map(users_hashtable)

    # keep only purchases whose coupon AND user are both known
    known = (row.notnull() & column.notnull()).values
    row = row.values[known].astype(int)
    column = column.values[known].astype(int)

    # duplicate (row, column) pairs are summed when converting to CSR
    mat = coo_matrix(
        (np.ones(len(row)), (row, column)),
        shape=(len(lists), len(users)),
    )
    return mat.tocsr()

def get_accuracy(pred, actual):
    '''
    Print and return the fraction of predictions that exactly match.

    pred, actual: dicts mapping a key to a collection of items.
    A prediction counts as correct when the key exists in ``actual`` and
    both collections contain the same set of items (order and duplicates
    are ignored). Keys missing from ``actual`` count as incorrect.

    Returns the accuracy as a float; 0.0 when ``pred`` is empty.
    '''
    n = len(pred)
    c = 0
    for k, v_pred in pred.items():
        v_actual = actual.get(k)
        if v_actual is not None and set(v_actual) == set(v_pred):
            c += 1
    # guard against division by zero when there are no predictions
    acc = float(c) / n if n else 0.0
    print('Total predictions: %d' % n)
    print('Correct predictions: %d' % c)
    print('Accuracy: %.4f' % acc)
    return acc

# calculate distance
def _mod(x):
    return np.power(np.power(x, 2).sum(), .5)

def cosine(x, y):
    '''
    Cosine similarity of x and y: their dot product divided by the
    product of their lengths. Inputs are converted with np.array first.
    '''
    vec_a, vec_b = np.array(x), np.array(y)
    dot_product = (vec_a * vec_b).sum()
    length_product = _mod(vec_a) * _mod(vec_b)
    return dot_product / length_product
    
def calc_sim(x, y, sim_func=cosine):
    '''
    Similarity of two feature vectors.

    x, y: vector of numeric features
    sim_func: callable applied as sim_func(x, y); defaults to cosine
    '''
    return sim_func(x, y)

def get_top_items(x, X, topn=None, sim_func=cosine):
    '''
    Return the ``topn`` rows of X most similar to x.

    x: vector of numeric features
    X: 2-D array whose rows hold the features followed by an ID in the
       last column (the ID is stripped before computing similarity)
    topn: number of items to return; defaults to len(X) - 1
    sim_func: similarity function passed through to calc_sim

    Returns a list of (ID, similarity) tuples in ascending order of
    similarity (most similar last). Rows whose similarity is NaN are
    excluded. An empty list is returned when topn <= 0.
    '''
    if topn is None:
        topn = len(X) - 1
    # ``arr[-0:]`` would select everything, so handle this case explicitly
    if topn <= 0:
        return []

    # keep the original row index next to every valid similarity so that
    # skipped (NaN) rows cannot shift the lookup into X
    valid_idx = []
    sim_arr = []
    for i, _x in enumerate(X):
        sim = calc_sim(x, _x[:-1], sim_func=sim_func)  # drop trailing ID
        if not np.isnan(sim):
            valid_idx.append(i)
            sim_arr.append(sim)

    ret = []
    for j in np.argsort(sim_arr)[-topn:]:
        ret.append((X[valid_idx[j], -1], sim_arr[j]))
    return ret

def predict(f_test, F_train, U_train, topn=10, threshold=0.5):
    '''
    Score every user for one test coupon by averaging the user vectors
    of the most similar training coupons.

    f_test: feature vector of the test coupon
    F_train: 2-D array of training coupon features, one row per coupon
    U_train: coupon x user matrix whose rows align with F_train
    topn: number of most similar training coupons to average over
    threshold: currently unused; kept for interface compatibility

    Returns the result of ``U_train[top_rows].sum(axis=0)`` divided by the
    number of rows actually selected.
    '''
    # similarity of test coupon C_i and every train coupon C_j
    S = np.array([calc_sim(f_test, f_train) for f_train in F_train],
                 dtype=float)

    # ignore coupons with undefined similarity; NaN sorts last in argsort
    # and would otherwise end up ranked as "most similar" after reversing
    valid = np.flatnonzero(~np.isnan(S))

    # indices of the top n most similar train coupons, best first
    T_idx = valid[np.argsort(S[valid])[::-1][:max(topn, 0)]]

    # average user vector over the coupons actually selected
    U = U_train[T_idx].sum(axis=0) / float(max(len(T_idx), 1))

    return U
        

In [3]:
# Load every input table from CSV files in the working directory.

# user master table; column 0 is parsed as dates
users = read_csv('user_list.csv', parse_dates=[0])

# coupon master tables (train / test)
lists_train, lists_test = (
    read_csv('coupon_list_train.csv'),
    read_csv('coupon_list_test.csv'),
)

# purchase log for the training period
coupon_detail_train = read_csv('coupon_detail_train.csv')

# coupon area tables (train / test)
area_train, area_test = (
    read_csv('coupon_area_train.csv'),
    read_csv('coupon_area_test.csv'),
)

# visit log for the training period
visit_train = read_csv('coupon_visit_train.csv')

In [4]:
# normalize features of coupons
# numeric columns that are rescaled before computing similarities
num_vars = [
    'PRICE_RATE',
    'CATALOG_PRICE',
    'DISPPERIOD',
    'VALIDPERIOD',
]
# 0/1 flag columns that are used as-is
bin_vars = [
    'USABLE_DATE_MON',
    'USABLE_DATE_TUE',
    'USABLE_DATE_WED',
    'USABLE_DATE_THU',
    'USABLE_DATE_FRI',
    'USABLE_DATE_SAT',
    'USABLE_DATE_SUN',
    'USABLE_DATE_HOLIDAY',
    'USABLE_DATE_BEFORE_HOLIDAY',
]
for var in num_vars:
    # Scale train AND test by the training standard deviation: test
    # coupons are compared directly against train coupons, so both must
    # live on the same scale.
    train_std = np.std(lists_train[var])
    lists_train[var + '_norm'] = lists_train[var].astype(float) / train_std
    lists_test[var + '_norm'] = lists_test[var].astype(float) / train_std

# columns that make up a coupon's feature vector
fit_vars = bin_vars + [x + '_norm' for x in num_vars]

# missing feature values are treated as 0
# (plain column selection instead of the removed ``.ix`` indexer)
lists_train[fit_vars] = lists_train[fit_vars].fillna(0)
lists_test[fit_vars] = lists_test[fit_vars].fillna(0)

### Training the data

In [131]:
# coupon x user purchase matrix for the training coupons
Y = trans_sparse_matrix(coupon_detail_train, users, lists_train)
U = Y
# feature matrices (``.loc`` replaces the removed ``.ix`` indexer)
F = lists_train.loc[:, fit_vars].values
# extract all test features once instead of one DataFrame lookup per row
F_test = lists_test.loc[:, fit_vars].values

# one user-score vector per test coupon, in lists_test row order
U_pred = []
for i, f_test in zip(lists_test.index, F_test):
    u_pred = predict(f_test, F, U).A[0]

    U_pred.append(u_pred)

    # progress indicator every 10 coupons
    if i > 0 and (i % 10) == 0:
        print(i)


10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300


In [195]:
# np.array(U_pred) is (n_test_coupons, n_users); after the transpose the
# first axis is the user and the second the test coupon.
OUTPUT = (np.array(U_pred).T > 0.5) * 1
# rows of OUTPUT are (user position, coupon position) pairs
OUTPUT = np.array(np.where(OUTPUT)).T

# positional lookups, independent of the DataFrame index labels
user_hashes = users.USER_ID_hash.values
coupon_hashes = lists_test.COUPON_ID_hash.values

# user id -> list of predicted coupon ids
pred = {}
for row in OUTPUT:
    # row[0] indexes users, row[1] indexes test coupons
    user_id, coupon_id = user_hashes[row[0]], coupon_hashes[row[1]]
    if pred.get(user_id) is None:
        pred[user_id] = []
    pred[user_id].append(coupon_id)

# text mode: only str values are written
with open('solution.csv', 'w') as f:
    f.write('USER_ID_hash,PURCHASED_COUPONS\n')
    for user_id in users.USER_ID_hash.tolist():
        coupon_ids = pred.get(user_id)
        # The header declares exactly two columns, so coupons are joined
        # with spaces (not commas), and every row ends with a newline.
        coupons = ' '.join(coupon_ids) if coupon_ids is not None else ''
        f.write(user_id + ',' + coupons + '\n')