In [35]:
import pandas as pd
import numpy as np
import math

from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

## Create the enrollment matrix (train)

In [36]:
# Read the training enrollment records into a DataFrame.
train_df = pd.read_csv(
    './train/enroll_train.csv',
)

In [37]:
# Preview the first rows to see which columns were loaded.
train_df.head()

Unnamed: 0.1,Unnamed: 0,enrollment_id,username,course_id,user,course
0,389,1124,61ysO1Pg5Q6bsn73gkzoQt72GbhUHgNd,AXUJZGmZ0xaYSWazu8RQ1G5c76ECT1Kd,0,4
1,437,1248,T6EWQ7jVP89JlHzj41KN1gEtGYevpSQM,DPnLzkJJqOOPRJfBxIHbQEERiYHu5ila,1,1
2,563,1585,k3YRNSEUtBfnCzCI2ffYAi1qqoCc9d8W,DPnLzkJJqOOPRJfBxIHbQEERiYHu5ila,2,1
3,619,1736,UaSh8dwx8gnofYbXlmpqj9uYDbpJ89pX,DPnLzkJJqOOPRJfBxIHbQEERiYHu5ila,3,1
4,628,1758,yZ9FndAKqVsKwZtO1B2JJJblefdbhZ02,AXUJZGmZ0xaYSWazu8RQ1G5c76ECT1Kd,4,4


In [38]:
# Distinct values of the `user` column in the training data.
users = train_df['user'].unique()
# Peek at the first ten ids.
users[:10]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [39]:
# Distinct values of the `course` column, sorted ascending.
courses = np.sort(train_df['course'].unique())
print(courses)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


In [40]:
# Start the matrix frame with a single `user` column, one row per distinct user.
new_df = pd.DataFrame({'user': users})
new_df.head()

Unnamed: 0,user
0,0
1,1
2,2
3,3
4,4


In [41]:
# Column labels for the matrix: the sorted course ids as a plain list.
items_columns = list(courses)
len(items_columns), items_columns[:10]

(20, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [42]:
# Build one 0/1 indicator vector per course, aligned position-by-position with
# `users`: items_value[c][i] is 1 when users[i] has at least one row in
# train_df for course c, otherwise 0.
items_value = {}
for c in courses:
    # Select the users enrolled in this course once, then test membership for
    # all users in a single vectorized call. This avoids rescanning the whole
    # frame for every (user, course) pair.
    enrolled_users = train_df.loc[train_df.course == c, 'user'].values
    items_value[c] = np.isin(users, enrolled_users).astype(int)

In [43]:
# Attach each course's indicator vector to the frame as a column labelled
# with the course id.
for course_id in items_columns:
    new_df[course_id] = items_value[course_id]

In [44]:
# Preview the user x course indicator matrix.
new_df.head()

Unnamed: 0,user,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,0,0,0,0,1,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,0,1,1,0,...,1,1,1,0,0,0,0,0,0,1
2,2,0,1,0,1,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
3,3,0,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,4,0,0,0,0,1,1,0,0,1,...,1,0,0,0,0,0,0,0,0,0


In [46]:
# Spot-check the indicator column for course 0 on the first 30 rows.
new_df[0].head(30)

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    1
11    0
12    1
13    1
14    1
15    1
16    1
17    0
18    1
19    0
20    0
21    1
22    0
23    1
24    1
25    1
26    1
27    0
28    1
29    0
Name: 0, dtype: int64

In [47]:
# Persist the training matrix without writing the row index.
new_df.to_csv(path_or_buf='enroll_matrix_train.csv', index=False)

## Create the enrollment matrix (test)

In [48]:
# Read the test enrollment records into a DataFrame.
test_df = pd.read_csv(
    './test/enroll_test.csv',
)

In [49]:
# Preview the first rows to see which columns were loaded.
test_df.head()

Unnamed: 0.1,Unnamed: 0,enrollment_id,username,course_id,user,course
0,225,1974,viuKt4GgrX0rYL5EpHoxJbsiC9TblSuk,81UZtt1JJwBFYMj5u38WNKCSVA4IJSDv,8,0
1,235,2092,5IM8g7YMOYo5qH7Yu3x31T3JkVOH9iqe,81UZtt1JJwBFYMj5u38WNKCSVA4IJSDv,62,0
2,240,2129,zeUi2UnQZPPF6bRQXw2s8LZ41ZOdqRAa,81UZtt1JJwBFYMj5u38WNKCSVA4IJSDv,27,0
3,301,2559,HbDaABEakQmE8DQEoFoIphu2SH13rFRy,5Gyp41oLVo7Gg7vF4vpmggWP5MU70QO6,16,12
4,329,2843,g3902agYIRHzac78MKVZlvljLMXXNpQ7,5Gyp41oLVo7Gg7vF4vpmggWP5MU70QO6,70,12


In [50]:
# Distinct values of the `user` column in the test data.
# NOTE: this rebinds `users`, replacing the value computed from train_df.
users = test_df['user'].unique()
users[:10]

array([  8,  62,  27,  16,  70,  19,  65,  17,  40, 142])

In [51]:
# Distinct values of the `course` column in the test data, sorted ascending.
courses = np.sort(test_df['course'].unique())
courses[:10]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [52]:
# Start the test matrix frame with a single `user` column, one row per
# distinct test user.
new_df = pd.DataFrame({'user': users})
new_df.head()

Unnamed: 0,user
0,8
1,62
2,27
3,16
4,70


In [53]:
# Column labels for the test matrix: the sorted course ids as a plain list.
items_columns = list(courses)
len(items_columns), items_columns[:10]

(20, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [54]:
# Build one 0/1 indicator vector per course, aligned position-by-position with
# `users`: items_value[c][i] is 1 when users[i] has at least one row in
# test_df for course c, otherwise 0.
items_value = {}
for c in courses:
    # Select the users enrolled in this course once, then test membership for
    # all users in a single vectorized call. This avoids rescanning the whole
    # frame for every (user, course) pair.
    enrolled_users = test_df.loc[test_df.course == c, 'user'].values
    items_value[c] = np.isin(users, enrolled_users).astype(int)

In [55]:
# Attach each course's indicator vector to the frame as a column labelled
# with the course id.
for course_id in items_columns:
    new_df[course_id] = items_value[course_id]

In [56]:
# Preview the user x course indicator matrix built from the test data.
new_df.head()

Unnamed: 0,user,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,8,1,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1,62,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,27,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,16,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
4,70,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0


In [59]:
# Spot-check the indicator column for course 0 on the first 30 rows.
new_df[0].head(30)

0     1
1     1
2     1
3     0
4     0
5     0
6     1
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
Name: 0, dtype: int64

In [60]:
# Persist the test matrix without writing the row index.
new_df.to_csv(path_or_buf='enroll_matrix_test.csv', index=False)