In [1]:
import numpy as np
from collections import Counter

In [2]:
data = []

with open('train.txt', 'r') as f:
    for line in f:
        tokens = line.strip().split(' ')
        data.append(tokens)

In [3]:
print data[42]

['942', '691']


In [4]:
dataset = []

for order in data:
    for i in xrange(len(order)):
        dataset.append({
            'data': order[:i] + order[(i + 1):],
            'target': order[i]
        })

In [5]:
len(dataset)

240165

In [6]:
dataset[42]

{'data': ['580', '30', '804', '654', '25'], 'target': '743'}

In [7]:
universe = set()
for order in data:
    universe.update(order)

In [8]:
len(universe)

992

In [9]:
universe_list = list(universe)

In [10]:
def neg_sample(dataset_obj, neg_samples_count=5):
    samples = list(np.random.choice(universe_list, neg_samples_count))
    samples.append(dataset_obj['target'])
    res = dict(dataset_obj)
    res['canidates'] = samples
    return res

In [11]:
train_dataset, test_dataset = dataset[:200000], dataset[200000:]

In [12]:
dataset_for_learning = map(lambda o: neg_sample(o, 5), train_dataset)

In [13]:
dataset_for_testing = map(lambda o: neg_sample(o, 50), test_dataset)

In [14]:
dataset_for_learning[42]

{'canidates': ['233', '123', '934', '404', '898', '743'],
 'data': ['580', '30', '804', '654', '25'],
 'target': '743'}

In [15]:
dataset_for_testing[42]

{'canidates': ['237',
  '995',
  '127',
  '80',
  '431',
  '404',
  '329',
  '665',
  '57',
  '593',
  '73',
  '859',
  '569',
  '176',
  '401',
  '390',
  '807',
  '280',
  '932',
  '536',
  '314',
  '775',
  '387',
  '943',
  '993',
  '264',
  '596',
  '274',
  '599',
  '542',
  '146',
  '700',
  '221',
  '668',
  '284',
  '299',
  '683',
  '385',
  '442',
  '479',
  '381',
  '28',
  '778',
  '769',
  '205',
  '31',
  '405',
  '67',
  '8',
  '286',
  '508'],
 'data': ['377', '836', '256', '759', '254', '543'],
 'target': '508'}

In [16]:
with open('dataset_train', 'w') as f:
    for obj in dataset_for_learning:
        for candidate in obj['canidates']:
            f.write('{} |Data {} |Candidate {}\n'.format(
                int(candidate == obj['target']) * 2 - 1, 
                ' '.join(obj['data']),
                candidate
            ))

In [17]:
with open('dataset_test', 'w') as f:
    for obj in dataset_for_testing:
        for candidate in obj['canidates']:
            f.write('{} |Data {} |Candidate {}\n'.format(
                int(candidate == obj['target']) * 2 - 1, 
                ' '.join(obj['data']),
                candidate
            ))

In [18]:
!rm dataset_train.cache
!vw -d dataset_train  -c --passes 10 -f vw.model --binary --quiet

In [19]:
!vw -i vw.model -t dataset_test -p dataset_test.out --quiet

In [20]:
recommendations = []
with open('dataset_test.out', 'r') as f:
    for obj in dataset_for_testing:
        rec = []
        for candidate in obj['canidates']:
            rec.append((candidate, float(f.readline()), candidate == obj['target'])) 
        recommendations.append(sorted(rec, key=lambda x: -x[1]))

In [21]:
def estimate_quality(recommendations):
    recalls_at_k = np.zeros(100)
    count = 0
    for rec in recommendations:
        hitted = False
        count += 1
        for i, (candidate, prediction, true_relevance) in enumerate(rec):
            hitted |= true_relevance
            recalls_at_k[i] += hitted
    for pos, val in enumerate(recalls_at_k * 1. / count):
        print pos + 1, round(val, 3)

In [22]:
estimate_quality(recommendations)

1 0.063
2 0.116
3 0.164
4 0.207
5 0.247
6 0.284
7 0.32
8 0.353
9 0.384
10 0.413
11 0.441
12 0.466
13 0.493
14 0.516
15 0.54
16 0.563
17 0.586
18 0.608
19 0.629
20 0.649
21 0.668
22 0.686
23 0.705
24 0.723
25 0.74
26 0.757
27 0.772
28 0.788
29 0.804
30 0.818
31 0.832
32 0.844
33 0.857
34 0.87
35 0.882
36 0.893
37 0.904
38 0.915
39 0.927
40 0.937
41 0.947
42 0.956
43 0.965
44 0.974
45 0.98
46 0.986
47 0.991
48 0.994
49 0.997
50 0.999
51 1.0
52 0.0
53 0.0
54 0.0
55 0.0
56 0.0
57 0.0
58 0.0
59 0.0
60 0.0
61 0.0
62 0.0
63 0.0
64 0.0
65 0.0
66 0.0
67 0.0
68 0.0
69 0.0
70 0.0
71 0.0
72 0.0
73 0.0
74 0.0
75 0.0
76 0.0
77 0.0
78 0.0
79 0.0
80 0.0
81 0.0
82 0.0
83 0.0
84 0.0
85 0.0
86 0.0
87 0.0
88 0.0
89 0.0
90 0.0
91 0.0
92 0.0
93 0.0
94 0.0
95 0.0
96 0.0
97 0.0
98 0.0
99 0.0
100 0.0


In [23]:
!rm dataset_train.cache
!vw -d dataset_train  -c --passes 2 -f vw.model -q DC --quiet --binary

In [24]:
!vw -i vw.model -t dataset_test -p dataset_test.out --quiet

In [25]:
recommendations = []
with open('dataset_test.out', 'r') as f:
    for obj in dataset_for_testing:
        rec = []
        for candidate in obj['canidates']:
            rec.append((candidate, float(f.readline()), candidate == obj['target'])) 
        recommendations.append(sorted(rec, key=lambda x: -x[1]))

In [26]:
estimate_quality(recommendations)

1 0.158
2 0.231
3 0.282
4 0.325
5 0.363
6 0.395
7 0.425
8 0.452
9 0.478
10 0.504
11 0.527
12 0.549
13 0.571
14 0.591
15 0.612
16 0.63
17 0.647
18 0.664
19 0.679
20 0.695
21 0.71
22 0.723
23 0.738
24 0.751
25 0.763
26 0.774
27 0.784
28 0.793
29 0.802
30 0.809
31 0.816
32 0.823
33 0.829
34 0.835
35 0.84
36 0.843
37 0.848
38 0.851
39 0.854
40 0.856
41 0.858
42 0.859
43 0.86
44 0.861
45 0.862
46 0.863
47 0.863
48 0.863
49 0.864
50 0.864
51 1.0
52 0.0
53 0.0
54 0.0
55 0.0
56 0.0
57 0.0
58 0.0
59 0.0
60 0.0
61 0.0
62 0.0
63 0.0
64 0.0
65 0.0
66 0.0
67 0.0
68 0.0
69 0.0
70 0.0
71 0.0
72 0.0
73 0.0
74 0.0
75 0.0
76 0.0
77 0.0
78 0.0
79 0.0
80 0.0
81 0.0
82 0.0
83 0.0
84 0.0
85 0.0
86 0.0
87 0.0
88 0.0
89 0.0
90 0.0
91 0.0
92 0.0
93 0.0
94 0.0
95 0.0
96 0.0
97 0.0
98 0.0
99 0.0
100 0.0


In [27]:
!rm dataset_train.cache
!vw -d dataset_train  -c --passes 10 -f vw.model -q DC --quiet --binary

In [28]:
!vw -i vw.model -t dataset_test -p dataset_test.out --quiet

In [29]:
recommendations = []
with open('dataset_test.out', 'r') as f:
    for obj in dataset_for_testing:
        rec = []
        for candidate in obj['canidates']:
            rec.append((candidate, float(f.readline()), candidate == obj['target'])) 
        recommendations.append(sorted(rec, key=lambda x: -x[1]))

In [30]:
estimate_quality(recommendations)

1 0.144
2 0.216
3 0.265
4 0.308
5 0.345
6 0.378
7 0.408
8 0.436
9 0.462
10 0.487
11 0.511
12 0.532
13 0.554
14 0.573
15 0.593
16 0.612
17 0.63
18 0.646
19 0.662
20 0.676
21 0.689
22 0.702
23 0.713
24 0.723
25 0.732
26 0.741
27 0.749
28 0.756
29 0.762
30 0.768
31 0.773
32 0.778
33 0.781
34 0.785
35 0.787
36 0.789
37 0.791
38 0.793
39 0.794
40 0.795
41 0.796
42 0.796
43 0.797
44 0.798
45 0.798
46 0.798
47 0.799
48 0.799
49 0.8
50 0.8
51 1.0
52 0.0
53 0.0
54 0.0
55 0.0
56 0.0
57 0.0
58 0.0
59 0.0
60 0.0
61 0.0
62 0.0
63 0.0
64 0.0
65 0.0
66 0.0
67 0.0
68 0.0
69 0.0
70 0.0
71 0.0
72 0.0
73 0.0
74 0.0
75 0.0
76 0.0
77 0.0
78 0.0
79 0.0
80 0.0
81 0.0
82 0.0
83 0.0
84 0.0
85 0.0
86 0.0
87 0.0
88 0.0
89 0.0
90 0.0
91 0.0
92 0.0
93 0.0
94 0.0
95 0.0
96 0.0
97 0.0
98 0.0
99 0.0
100 0.0
