In [1]:
import pandas as pd
import numpy as np
import math

from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

## create enroll matrix (train)

In [2]:
# Total records
train_df = pd.read_csv('./train/enroll_train.csv')

In [3]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,enrollment_id,username,course_id,user,course
0,61413,170410,03LSJMJGCEPYScMXgYv3wZ9A4zZCWROs,V4tXq15GxHo2gaMpaJLZ3IGEkP949IbE,user0,course0
1,27413,76019,03LSJMJGCEPYScMXgYv3wZ9A4zZCWROs,AXUJZGmZ0xaYSWazu8RQ1G5c76ECT1Kd,user0,course1
2,27412,76018,03LSJMJGCEPYScMXgYv3wZ9A4zZCWROs,DPnLzkJJqOOPRJfBxIHbQEERiYHu5ila,user0,course2
3,45684,126800,03LSJMJGCEPYScMXgYv3wZ9A4zZCWROs,fbPkOYLVPtPgIt0MxizjfFJov3JbHyAi,user0,course3
4,27410,76015,03LSJMJGCEPYScMXgYv3wZ9A4zZCWROs,SpATywNh6bZuzm8s1ceuBUnMUAeoAHHw,user0,course4


In [4]:
users = train_df.user.unique()
users[:10]

array(['user0', 'user1', 'user2', 'user3', 'user4', 'user5', 'user6',
       'user7', 'user8', 'user9'], dtype=object)

In [5]:
courses = train_df.course.unique()
courses[:10]

array(['course0', 'course1', 'course2', 'course3', 'course4', 'course5',
       'course6', 'course7', 'course8', 'course9'], dtype=object)

In [6]:
new_df = pd.DataFrame(users, columns = ['user'])
new_df.head()

Unnamed: 0,user
0,user0
1,user1
2,user2
3,user3
4,user4


In [7]:
items_columns = [c for c in courses]
len(items_columns), items_columns[:10]

(39,
 ['course0',
  'course1',
  'course2',
  'course3',
  'course4',
  'course5',
  'course6',
  'course7',
  'course8',
  'course9'])

In [8]:
items_value = {}
for c in courses:
    items_value[c] = np.zeros(len(users), dtype=int)
    for i, u in enumerate(users):
        if ((train_df.user == u) & (train_df.course == c)).any():
            items_value[c][i] = 1

In [9]:
for col in items_columns:
    new_df[col] = items_value[col]

In [10]:
new_df.head()

Unnamed: 0,user,course0,course1,course2,course3,course4,course5,course6,course7,course8,...,course29,course30,course31,course32,course33,course34,course35,course36,course37,course38
0,user0,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,user1,0,0,0,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,user2,0,0,1,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,user3,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,user4,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
new_df['course0'][:30]

0     1
1     0
2     0
3     0
4     0
5     0
6     1
7     0
8     0
9     0
10    0
11    1
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    1
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
Name: course0, dtype: int64

In [12]:
new_df.to_csv('enroll_matrix_train.csv', index=False)

## create enroll matrix (test)

In [13]:
# Total records
test_df = pd.read_csv('./test/enroll_test.csv')

In [14]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,enrollment_id,username,course_id,user,course
0,25,193,tp9GLoYyljDV6mILQxrtrv1Iw4qcPl5Z,KHPw0gmg1Ad3V07TqRpyBzA8mRjj7mkt,user655,course11
1,38,334,uQjl8NGtIQkRfoe9edBiMgtYYmdkzVXR,DPnLzkJJqOOPRJfBxIHbQEERiYHu5ila,user660,course2
2,75,677,DBl87nF6WTgoCD6kgTHVE2oB45pDEn6y,DPnLzkJJqOOPRJfBxIHbQEERiYHu5ila,user160,course2
3,168,1424,VBP5PzVQEKJYFH0y5ffEek5WfP9EW7uc,AXUJZGmZ0xaYSWazu8RQ1G5c76ECT1Kd,user388,course1
4,173,1491,lCyf3TRKW0i3K1mBgKz23OONMdCWqNsI,7GRhBDsirIGkRZBtSMEzNTyDr2JQm4xx,user561,course29


In [15]:
users = test_df.user.unique()
users[:10]

array(['user655', 'user660', 'user160', 'user388', 'user561', 'user168',
       'user137', 'user672', 'user63', 'user718'], dtype=object)

In [16]:
courses = test_df.course.unique()
courses[:10]

array(['course11', 'course2', 'course1', 'course29', 'course22',
       'course23', 'course24', 'course3', 'course30', 'course37'],
      dtype=object)

In [17]:
new_df = pd.DataFrame(users, columns = ['user'])
new_df.head()

Unnamed: 0,user
0,user655
1,user660
2,user160
3,user388
4,user561


In [18]:
items_columns = [c for c in courses]
len(items_columns), items_columns[:10]

(39,
 ['course11',
  'course2',
  'course1',
  'course29',
  'course22',
  'course23',
  'course24',
  'course3',
  'course30',
  'course37'])

In [19]:
items_value = {}
for c in courses:
    items_value[c] = np.zeros(len(users), dtype=int)
    for i, u in enumerate(users):
        if ((train_df.user == u) & (train_df.course == c)).any():
            items_value[c][i] = 1

In [20]:
for col in items_columns:
    new_df[col] = items_value[col]

In [21]:
new_df.head()

Unnamed: 0,user,course11,course2,course1,course29,course22,course23,course24,course3,course30,...,course12,course19,course17,course5,course34,course36,course33,course13,course16,course38
0,user655,0,0,1,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
1,user660,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,user160,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,1,1
3,user388,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,user561,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [22]:
new_df['course0'][:30]

0     0
1     1
2     0
3     1
4     0
5     1
6     0
7     0
8     0
9     0
10    0
11    1
12    0
13    0
14    1
15    0
16    0
17    0
18    0
19    1
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
Name: course0, dtype: int64

In [23]:
new_df.to_csv('enroll_matrix_test.csv', index=False)