Multi-Stack Ensemble
===

In [3]:
from operator import itemgetter
import itertools
import numpy as np
import pandas as pd
import imp
mt = imp.load_source('metrics', '../evaluation_metric/metrics.py')

In [4]:
# Delimiters used by the CSV files: tab between columns, comma between item ids.
sep_char = "\t"
split_char = ","

test_set_path = '../../Dataset/test_users.csv'


def _split_item_ids(cell):
    """Expand one 'items' cell into a Series of integer ids (empty Series for '')."""
    if cell == '':
        return pd.Series([])
    return pd.Series([int(token) for token in str(cell).split(split_char)])


# Rows are indexed by the first CSV column; each item id of the 'items' column
# ends up in its own positional column.
test_set = pd.read_csv(test_set_path, sep=sep_char, index_col=0)
test_set = test_set['items'].apply(_split_item_ids)

In [5]:
def _expand_items(cell):
    """Turn a comma-separated 'items' cell into a Series of ints (empty for '')."""
    if cell == '':
        return pd.Series([])
    return pd.Series([int(token) for token in str(cell).split(split_char)])


# Load the rank-averaging result; missing item lists become empty strings so that
# they expand to empty rows instead of failing the int conversion.
idf = pd.read_csv("results/rank_averaging.csv", sep=sep_char, index_col=0).fillna('')
idf.columns = ['items']
idf = idf.sort_index()
idf = idf['items'].apply(_expand_items)

# Third argument is presumably the cut-off used by the metric -- TODO confirm in metrics.py.
score = mt.challenge_score(idf, test_set, 30)

print(score)

113649.378005


Parser
---
Parse each submission file into `(user, [(item, rank, value)])` records, initialising the value assigned to each item to 0.

In [46]:
def parse(sc, path):
    """Read a tab-separated submission file into an RDD of ``(key, items)`` pairs.

    The first line of the file is taken as the header; every line equal to it
    is dropped, as is every row whose first field is empty. The first field is
    converted to ``int`` and the second one is expanded by ``setup_items``.
    """
    lines = sc.textFile(path)
    header_line = lines.first()

    def is_not_header(line):
        return line != header_line

    def has_key(fields):
        return fields[0] != ''

    def to_record(fields):
        return (int(fields[0]), setup_items(fields[1]))

    data_lines = lines.filter(is_not_header)
    rows = data_lines.map(lambda line: line.split("\t"))
    keyed_rows = rows.filter(has_key)

    return keyed_rows.map(to_record)

def setup_items(items):
    """Convert a comma-separated id string into ``(item, rank, 0)`` triples.

    Empty tokens are skipped and do not consume a rank; ranks start at 1 and
    follow the order of the remaining tokens. The third element is a
    placeholder value of 0.
    """
    tokens = [token for token in items.split(',') if token != '']
    return [(int(token), rank, 0) for rank, token in enumerate(tokens, 1)]

In [48]:
# Parse every submission file used by the stacked ensemble into an RDD.
# NOTE(review): `sc` is not created in the visible cells -- presumably the
# SparkContext supplied by the kernel; confirm.

# Placeholders that are still disabled (their paths point at a directory, not a file).
#interaction_rdd = parse(sc, "./submissions/")
#impression_rdd = parse(sc, "./submissions/")

# Submissions combined in the "Collaborative Filtering" level-1 stack.
cf_intint_rdd = parse(sc, "../../Submissions/ens_subs/cf_int_int_18.9k.csv")
cf_intimp_rdd = parse(sc, "../../Submissions/ens_subs/cf_int_imp_22.4k.csv")
# NOTE(review): this file name (imp_imp_int_int ... avg) does not follow the
# int/imp pattern of the variable name -- confirm it is the intended submission.
cf_impint_rdd = parse(sc, "../../Submissions/ens_subs/imp_imp_int_int_33.2k_avg.csv")
cf_impimp_rdd = parse(sc, "../../Submissions/ens_subs/cf_imp_imp_31k.csv")
cf_i_intint_rdd = parse(sc, "../../Submissions/ens_subs/true_10k_CF_ITEM_greater_3_shr=1_knn=750_22k.csv")
cf_i_impimp_rdd = parse(sc, "../../Submissions/ens_subs/cf_item_imp_imp_20k.csv")

# Submissions combined in the "Content-Based" level-1 stack.
cb_idf_rdd = parse(sc, "../../Submissions/ens_subs/idf_45k.csv")
cb_concept_rdd = parse(sc, "../../Submissions/ens_subs/tf_ervin_37k.csv")

# Still disabled as well; the level-3 cells reference `baseline_rdd`.
#baseline_rdd = parse(sc, "./submissions/")

Scorer
===
Assigns to each item a value based on the chosen algorithm.

In [49]:
def linear_scorer(sc, rdd, score, decay):
    """Score every row of ``rdd`` with the linear rule of ``linear_calculator``.

    ``score`` is the value given to this learner and ``decay`` is the amount
    subtracted per rank position. ``sc`` is accepted but not used. Returns a
    new RDD with the same keys and the re-scored item lists.
    """
    def score_row(row):
        return (row[0], linear_calculator(row[1], score, decay))

    return rdd.map(score_row)
    
def linear_calculator(tuples, score, decay):
    """Give each item the value ``score - rank * decay``.

    ``tuples`` holds either ``(item, rank, value)`` triples, as produced by
    ``setup_items``, or ``(item, value)`` pairs, as produced by ``ensemble``.
    A pair carries no rank, and its second element is an aggregated score
    that must not be used as one. ``ensemble`` returns its items sorted
    best-first, so the 1-based list position is used as the rank instead.

    Returns a list of ``(item, rank, value)`` triples in input order.
    """
    scored_tuples = []

    for position, entry in enumerate(tuples, 1):
        # Triples bring their own rank; pairs fall back to their position.
        rank = entry[1] if len(entry) > 2 else position
        scored_tuples.append((entry[0], rank, score - rank * decay))

    return scored_tuples

def evaluation_scorer(sc, rdd, score):
    """Assign a value based on the score obtained on the online leaderboard and the evaluation metric.

    The item total is computed once by ``calculate_number_of_items`` and then
    handed, together with ``score``, to ``evaluation_calculator`` for every row.
    Returns a new RDD with the same keys and the re-scored item lists.
    """
    total_items = calculate_number_of_items(sc, rdd)

    def score_row(row):
        return (row[0], evaluation_calculator(row[1], score, total_items))

    return rdd.map(score_row)

def calculate_number_of_items(sc, rdd):
    """Return the total number of items over all rows of ``rdd``.

    ``rdd`` holds ``(key, items)`` rows where ``items`` is a list, which is
    the shape ``evaluation_scorer`` passes in. The previous body re-parsed the
    rows as raw text lines and relied on the undefined names ``header`` and
    ``get_items``, so it could not run on such rows.

    ``sc`` is accepted for signature compatibility and is not used.
    """
    # Sum on the cluster instead of collecting every length to the driver.
    return rdd.map(lambda row: len(row[1])).sum()
    
def evaluation_calculator(tuples, score, number_of_items):
    """Give every item the same value ``score * 1000 / number_of_items``.

    The first two elements of each input tuple are kept unchanged and the
    computed value is placed third. Returns a list in input order.
    """
    # Nothing to score; returning early also avoids dividing by a zero total.
    if not tuples:
        return []

    # float() forces true division: under Python 2, int * 1000 / int would
    # silently truncate the value.
    value = float(score) * 1000 / number_of_items

    return [(t[0], t[1], value) for t in tuples]

Ensemble
==

In [50]:
def ensemble(sc, rdd_list, top_n=30):
    """Merge several scored RDDs into one ranked list of items per key.

    Each RDD in ``rdd_list`` holds ``(key, [(item, rank, value), ...])`` rows.
    The values of the same ``(key, item)`` are summed across all inputs, and
    the ``top_n`` items with the highest total are kept for every key.

    Parameters:
        sc: context used to create the initial empty RDD.
        rdd_list: iterable of scored RDDs.
        top_n: number of items kept per key (default 30, the previous fixed
            cut-off).

    Returns an RDD of ``(key, [(item, total_value), ...])`` sorted by total
    value, highest first; equal totals are ordered by ascending item so the
    result does not depend on the order in which the rows were merged.
    """
    combination_rdd = sc.emptyRDD()

    for rdd in rdd_list:
        # Only item and value are needed; the rank (index 1) is not used here.
        combination_rdd = combination_rdd.union(
            rdd.map(lambda row: (row[0], [(entry[0], entry[2]) for entry in row[1]])))

    def best_items(row):
        ranked = sorted(row[1], key=lambda pair: (-pair[1], pair[0]))
        return (row[0], ranked[:top_n])

    ensemble_rdd = (combination_rdd
                    # One record per (key, item) occurrence ...
                    .flatMap(lambda row: [((row[0], pair[0]), pair[1]) for pair in row[1]])
                    # ... summed over all learners ...
                    .reduceByKey(lambda x, y: x + y)
                    # ... and gathered back into one list per key.
                    .map(lambda kv: (kv[0][0], [(kv[0][1], kv[1])]))
                    .reduceByKey(lambda x, y: x + y)
                    .map(best_items)
                    )

    return ensemble_rdd

In [51]:
def create_linear_combination(values_pool, decays_pool, number_of_elements):
    """Enumerate candidate settings for ``number_of_elements`` learners.

    Every ``(value, decay)`` pairing of the two pools is built first, values
    varying slowest; the result is an iterator over all
    ``number_of_elements``-sized combinations of those pairs.

    NOTE(review): ``itertools.combinations`` never repeats a pair inside one
    setting and never yields reorderings of the same pairs, so this is not a
    full grid over the learners -- confirm that this is intended.
    """
    pairs = list(itertools.product(values_pool, decays_pool))
    return itertools.combinations(pairs, number_of_elements)

Stack Ensembler - Level 1
===
Collaborative Filtering
---

In [54]:
# Sanity check: ensemble one submission with itself, scored identically twice.
# NOTE(review): `test_rdd` is not assigned in any visible cell -- confirm where it comes from.
scored_twice = [
    linear_scorer(sc, test_rdd, 2, 0.001),
    linear_scorer(sc, test_rdd, 2, 0.001),
]
a = ensemble(sc, scored_twice)

In [55]:
# Show the first ensembled row as a quick check of the output shape.
print(a.take(1))

[(458752, [(2403671, 3.998), (893826, 3.996), (2597393, 3.994), (532879, 3.992), (453182, 3.99), (415805, 3.988), (1047607, 3.986), (2175903, 3.984), (2405192, 3.982), (1379165, 3.98), (1183681, 3.978), (1239344, 3.976), (2048576, 3.974), (2800343, 3.972), (410886, 3.97), (1934282, 3.968), (219673, 3.966), (1244205, 3.964), (1260758, 3.962), (2019239, 3.96), (2458094, 3.958), (312640, 3.956), (471663, 3.954), (1384457, 3.952), (998555, 3.95), (1709395, 3.948), (359374, 3.946), (2360067, 3.944), (1487482, 3.942), (1703136, 3.94)])]


In [58]:
# Small demo pools to inspect what create_linear_combination yields.
values_pool = [0, 1]
decays_pool = [0.001, 0.0015]

# The third argument is the number of submissions combined at this level of the stack.
combinations = create_linear_combination(values_pool, decays_pool, 2)
for combination in combinations:
    print(combination)

((0, 0.001), (0, 0.0015))
((0, 0.001), (1, 0.001))
((0, 0.001), (1, 0.0015))
((0, 0.0015), (1, 0.001))
((0, 0.0015), (1, 0.0015))
((1, 0.001), (1, 0.0015))


In [None]:
# 
#  LINEAR
#

# Search over (value, decay) settings for the six collaborative-filtering submissions.
# Each entry appended to searched_list is (score, combination, ensemble RDD).
searched_list = []

values_pool = [0,1,2]
decays_pool = [0.001,0.0015,0.002]

# The third parameter is the number of submissions that we have in this level of the stack
combinations = create_linear_combination(values_pool, decays_pool, 6)

for combination in combinations:
    # combination[i] is the (value, decay) pair applied to the i-th submission.
    lin_stack_1_1 = ensemble(sc, [
            linear_scorer(sc, cf_intint_rdd, combination[0][0], combination[0][1]),
            linear_scorer(sc, cf_intimp_rdd, combination[1][0], combination[1][1]),
            linear_scorer(sc, cf_impint_rdd, combination[2][0], combination[2][1]),
            linear_scorer(sc, cf_impimp_rdd, combination[3][0], combination[3][1]),
            linear_scorer(sc, cf_i_intint_rdd, combination[4][0], combination[4][1]),
            linear_scorer(sc, cf_i_impimp_rdd, combination[5][0], combination[5][1])
        ])
    
    # Evaluate the ensemble
    # TODO: the right-hand side is missing -- this cell does not parse until the
    # scoring expression for lin_stack_1_1 is filled in.
    lin_score = 
    searched_list.append((lin_score, combination, lin_stack_1_1))

# Print every evaluated combination, highest score first.
# itemgetter(0) selects the score stored first in each (score, combination, ensemble)
# entry; the previous key, itemgetter(x[0]), referenced an undefined name `x`.
for s in sorted(searched_list, key=itemgetter(0), reverse=True):
    print(str(s) + " ;")

In [None]:
# 
#  EVALUATION SCORE
#

# Ensemble of the six collaborative-filtering submissions, scored with evaluation_scorer.
# NOTE(review): evaluation_scorer is defined as (sc, rdd, score) but is called here
# with the RDD only -- the `sc` and `score` arguments still have to be supplied.
eval_stack_1_1 = ensemble(sc, [
            evaluation_scorer(cf_intint_rdd, ),
            evaluation_scorer(cf_intimp_rdd, ),
            evaluation_scorer(cf_impint_rdd, ),
            evaluation_scorer(cf_impimp_rdd, ),
            evaluation_scorer(cf_i_intint_rdd, ),
            evaluation_scorer(cf_i_impimp_rdd, )
        ])
    
# Evaluate the ensemble
# TODO: the right-hand side is missing -- this cell does not parse until the
# scoring expression for eval_stack_1_1 is filled in.
eval_score = 
print "Evaluation score stack lv 1.1: " + str(eval_score)

In [None]:
# Keep whichever level-1.1 ensemble scored higher: the best candidate of the linear
# search or the evaluation-score ensemble (a tie goes to the evaluation-score one).
# The previous key, itemgetter(x[0]), referenced an undefined name `x`; the score is
# the first element of each searched_list entry, hence itemgetter(0).
best_linear = max(searched_list, key=itemgetter(0))
best_stack_1_1 = best_linear[2] if best_linear[0] > eval_score else eval_stack_1_1

Content-Based
---

In [None]:
# 
#  LINEAR
#

# Search over (value, decay) settings for the two content-based submissions.
# Each entry appended to searched_list is (score, combination, ensemble RDD).
searched_list = []

values_pool = [0,1]
decays_pool = [0.001,0.0015,0.002]

# The third parameter is the number of submissions that we have in this level of the stack
combinations = create_linear_combination(values_pool, decays_pool, 2)

for combination in combinations:
    # combination[i] is the (value, decay) pair applied to the i-th submission.
    lin_stack_1_2 = ensemble(sc, [
            linear_scorer(sc, cb_idf_rdd, combination[0][0], combination[0][1]),
            linear_scorer(sc, cb_concept_rdd, combination[1][0], combination[1][1])
        ])
    
    # Evaluate the ensemble
    # TODO: the right-hand side is missing -- this cell does not parse until the
    # scoring expression for lin_stack_1_2 is filled in.
    lin_score = 
    searched_list.append((lin_score, combination, lin_stack_1_2))
    
# Print every evaluated combination, highest score first.
# itemgetter(0) selects the score stored first in each (score, combination, ensemble)
# entry; the previous key, itemgetter(x[0]), referenced an undefined name `x`.
for s in sorted(searched_list, key=itemgetter(0), reverse=True):
    print(str(s) + " ;")

In [None]:
# 
#  EVALUATION SCORE
#

# Ensemble of the two content-based submissions, scored with evaluation_scorer.
# NOTE(review): evaluation_scorer is defined as (sc, rdd, score) but is called here
# with the RDD only -- the `sc` and `score` arguments still have to be supplied.
eval_stack_1_2 = ensemble(sc, [
            evaluation_scorer(cb_idf_rdd, ),
            evaluation_scorer(cb_concept_rdd, )
        ])
    
# Evaluate the ensemble
# TODO: the right-hand side is missing -- this cell does not parse until the
# scoring expression for eval_stack_1_2 is filled in.
eval_score = 
print "Evaluation score stack lv 1.2: " + str(eval_score)

In [None]:
# Keep whichever level-1.2 ensemble scored higher: the best candidate of the linear
# search or the evaluation-score ensemble (a tie goes to the evaluation-score one).
# The previous key, itemgetter(x[0]), referenced an undefined name `x`; the score is
# the first element of each searched_list entry, hence itemgetter(0).
best_linear = max(searched_list, key=itemgetter(0))
best_stack_1_2 = best_linear[2] if best_linear[0] > eval_score else eval_stack_1_2

Stack Ensembler - Level 2
===
Collaborative + Content
---

In [None]:
# 
#  LINEAR
#

# Search over (value, decay) settings for combining the two level-1 winners.
# Each entry appended to searched_list is (score, combination, ensemble RDD).
searched_list = []

values_pool = [0,1]
decays_pool = [0.001,0.0015,0.002]

# The third parameter is the number of submissions that we have in this level of the stack
combinations = create_linear_combination(values_pool, decays_pool, 2)

for combination in combinations:
    # combination[i] is the (value, decay) pair applied to the i-th input.
    lin_stack_2 = ensemble(sc, [
            linear_scorer(sc, best_stack_1_1, combination[0][0], combination[0][1]),
            linear_scorer(sc, best_stack_1_2, combination[1][0], combination[1][1])
        ])
    
    # Evaluate the ensemble
    # TODO: the right-hand side is missing -- this cell does not parse until the
    # scoring expression for lin_stack_2 is filled in.
    lin_score = 
    searched_list.append((lin_score, combination, lin_stack_2))

In [None]:
# 
#  EVALUATION SCORE
#

# Ensemble of the two level-1 winners, scored with evaluation_scorer.
# NOTE(review): evaluation_scorer is defined as (sc, rdd, score) but is called here
# with the RDD only -- the `sc` and `score` arguments still have to be supplied.
stack_2 = ensemble(sc, [
            evaluation_scorer(best_stack_1_1, ),
            evaluation_scorer(best_stack_1_2, )
        ])
    
# Evaluate the ensemble
# TODO: the right-hand side is missing -- this cell does not parse until the
# scoring expression for stack_2 is filled in.
score = 
print "Evaluation score stack lv 2: " + str(score)

In [None]:
# Keep whichever level-2 ensemble scored higher: the best candidate of the linear
# search or the evaluation-score ensemble (a tie goes to the evaluation-score one).
# The level-2 evaluation cell stores its results in `stack_2` and `score`; the
# previous expression read `eval_stack_2` (never assigned) and `eval_score` (still
# holding a level-1 score), and its sort key referenced an undefined name `x`.
best_linear = max(searched_list, key=itemgetter(0))
best_stack_2 = best_linear[2] if best_linear[0] > score else stack_2

Stack Ensembler - Level 3
===
Int + Imp + Level_2 + Baseline
---

In [None]:
# 
#  LINEAR
#

# Search over (value, decay) settings for the four level-3 inputs.
# Each entry appended to searched_list is (score, combination, ensemble RDD).
# NOTE(review): interactions_rdd, impressions_rdd and baseline_rdd are not assigned
# in the visible cells (the parse calls near the top are commented out and use the
# singular names interaction_rdd / impression_rdd) -- confirm before running.
searched_list = []

values_pool = [0,1,2]
decays_pool = [0.001,0.0015,0.002]

# The third parameter is the number of submissions that we have in this level of the stack
combinations = create_linear_combination(values_pool, decays_pool, 4)

for combination in combinations:
    # NOTE(review): the second and third list entries below have no trailing comma,
    # which is a syntax error on its own.
    lin_stack_3 = ensemble(sc, [
            linear_scorer(sc, interactions_rdd, combination[0][0], combination[0][1]),
            linear_scorer(sc, impressions_rdd, combination[1][0], combination[1][1])
            linear_scorer(sc, best_stack_2, combination[2][0], combination[2][1])
            linear_scorer(sc, baseline_rdd, combination[3][0], combination[3][1])
        ])
    
    # Evaluate the ensemble
    # TODO: the right-hand side is missing -- this cell does not parse until the
    # scoring expression for lin_stack_3 is filled in.
    lin_score = 
    searched_list.append((lin_score, combination, lin_stack_3))

In [None]:
# 
#  EVALUATION SCORE
#

# Ensemble of the four level-3 inputs, scored with evaluation_scorer.
# NOTE(review): evaluation_scorer is defined as (sc, rdd, score) but is called here
# with the RDD only -- the `sc` and `score` arguments still have to be supplied.
# NOTE(review): interactions_rdd, impressions_rdd and baseline_rdd are not assigned
# in the visible cells -- confirm before running.
stack_3 = ensemble(sc, [
            evaluation_scorer(interactions_rdd, ),
            evaluation_scorer(impressions_rdd, ),
            evaluation_scorer(best_stack_2, ),
            evaluation_scorer(baseline_rdd, )
        ])
    
# Evaluate the ensemble
# TODO: the right-hand side is missing -- this cell does not parse until the
# scoring expression for stack_3 is filled in.
score = 
print "Evaluation score stack lv 3: " + str(score)

In [None]:
# Keep whichever level-3 ensemble scored higher: the best candidate of the linear
# search or the evaluation-score ensemble (a tie goes to the evaluation-score one).
# The level-3 evaluation cell stores its results in `stack_3` and `score`; the
# previous expression read `eval_stack_3` (never assigned) and `eval_score` (still
# holding a level-1 score), and its sort key referenced an undefined name `x`.
best_linear = max(searched_list, key=itemgetter(0))
best_stack_3 = best_linear[2] if best_linear[0] > score else stack_3

Write Recommendation
---