In [1]:
import pandas as pd
import numpy as np
import math

from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

## Create the user-by-course enrollment matrix (train)

In [2]:
# Load the training enrollment records from CSV.
train_df = pd.read_csv('./train/enroll_train.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
train_df.head()

Unnamed: 0.1,Unnamed: 0,enrollment_id,username,course_id,user,course
0,389,1124,61ysO1Pg5Q6bsn73gkzoQt72GbhUHgNd,AXUJZGmZ0xaYSWazu8RQ1G5c76ECT1Kd,user0,course4
1,437,1248,T6EWQ7jVP89JlHzj41KN1gEtGYevpSQM,DPnLzkJJqOOPRJfBxIHbQEERiYHu5ila,user1,course1
2,628,1758,yZ9FndAKqVsKwZtO1B2JJJblefdbhZ02,AXUJZGmZ0xaYSWazu8RQ1G5c76ECT1Kd,user2,course4
3,710,1967,viuKt4GgrX0rYL5EpHoxJbsiC9TblSuk,5Gyp41oLVo7Gg7vF4vpmggWP5MU70QO6,user3,course12
4,766,2109,HAAyQYNYY3ULmqhNC56bxv5FSwyMVoqP,5Gyp41oLVo7Gg7vF4vpmggWP5MU70QO6,user4,course12


In [4]:
# Distinct user labels from the training records (pandas returns them in
# order of first appearance); show the first ten as a sanity check.
users = train_df['user'].unique()
users[:10]

array(['user0', 'user1', 'user2', 'user3', 'user4', 'user5', 'user6',
       'user7', 'user8', 'user9'], dtype=object)

In [5]:
# Distinct course labels from the training records (order of first
# appearance); show the first ten as a sanity check.
courses = train_df['course'].unique()
courses[:10]

array(['course4', 'course1', 'course12', 'course0', 'course5', 'course11',
       'course10', 'course6', 'course7', 'course3'], dtype=object)

In [6]:
# Start the matrix as a one-column frame holding one row per distinct user;
# the per-course indicator columns are attached in a later cell.
new_df = pd.DataFrame({'user': users})
new_df.head()

Unnamed: 0,user
0,user0
1,user1
2,user2
3,user3
4,user4


In [7]:
# Plain Python list of the course labels; these are used as the column names
# of the enrollment matrix. Display the count and the first ten labels.
items_columns = list(courses)
len(items_columns), items_columns[:10]

(13,
 ['course4',
  'course1',
  'course12',
  'course0',
  'course5',
  'course11',
  'course10',
  'course6',
  'course7',
  'course3'])

In [8]:
# Build the 0/1 enrollment indicators for every (user, course) pair in a
# single pass over the data. Testing each pair with a boolean mask over the
# whole frame would cost O(len(users) * len(courses) * len(train_df)).
#
# crosstab counts the records per (user, course); reindex then restores the
# row order of `users` and the column order of `courses` (filling pairs that
# never occur with 0) so array positions line up with the rows of `new_df`.
enroll_counts = pd.crosstab(train_df['user'], train_df['course']).reindex(
    index=users, columns=courses, fill_value=0
)

# items_value maps course label -> int array with one entry per user
# (1 = at least one enrollment record for that pair, 0 = none).
items_value = {c: (enroll_counts[c].to_numpy() > 0).astype(int) for c in courses}

In [9]:
# Attach one 0/1 indicator column per course to the user frame (mutates
# new_df in place; re-running simply overwrites the same columns).
for col in items_columns:
    new_df[col] = items_value[col]

In [10]:
# Preview the assembled matrix: a `user` column followed by one column per course.
new_df.head()

Unnamed: 0,user,course4,course1,course12,course0,course5,course11,course10,course6,course7,course3,course9,course2,course8
0,user0,1,0,0,0,0,0,1,1,1,1,0,0,0
1,user1,0,1,1,0,0,1,1,1,1,0,0,0,0
2,user2,1,0,0,0,1,0,1,0,0,0,1,0,1
3,user3,0,1,1,0,0,1,1,0,0,0,0,0,1
4,user4,1,1,1,0,1,0,0,1,0,0,0,0,0


In [11]:
# Spot-check a single course column: indicator values for the first 30 users.
new_df['course0'].head(30)

0     0
1     0
2     0
3     0
4     0
5     1
6     0
7     1
8     0
9     0
10    0
11    1
12    1
13    0
14    1
15    1
16    0
17    1
18    0
19    0
20    0
21    1
22    0
23    0
24    0
25    1
26    0
27    1
28    0
29    0
Name: course0, dtype: int64

In [12]:
# Persist the training matrix; index=False keeps the row index out of the CSV.
new_df.to_csv('enroll_matrix_train.csv', index=False)

## Create the user-by-course enrollment matrix (test)

In [46]:
# Load the test enrollment records from CSV.
test_df = pd.read_csv('./test/enroll_test.csv')

In [47]:
# Preview the first rows to check which columns were loaded.
test_df.head()

Unnamed: 0.1,Unnamed: 0,enrollment_id,username,course_id,user,course
0,225,1974,viuKt4GgrX0rYL5EpHoxJbsiC9TblSuk,81UZtt1JJwBFYMj5u38WNKCSVA4IJSDv,user3,course0
1,329,2843,g3902agYIRHzac78MKVZlvljLMXXNpQ7,5Gyp41oLVo7Gg7vF4vpmggWP5MU70QO6,user16,course12
2,641,5311,SFbTZDEo0JTOG7VHfxeJz2kz3qSzlHq4,SpATywNh6bZuzm8s1ceuBUnMUAeoAHHw,user8,course5
3,890,7447,35E85h2ExBPeTd1VmxNX0GKo8krIMsCP,H2lDW05SyKnwntZ6Fora76aPAEswcMa5,user28,course3
4,1016,8738,yZ9FndAKqVsKwZtO1B2JJJblefdbhZ02,H2lDW05SyKnwntZ6Fora76aPAEswcMa5,user2,course3


In [48]:
# Distinct user labels from the test records (order of first appearance).
# This rebinds `users`, replacing the list taken from the training data.
users = test_df['user'].unique()
users[:10]

array(['user3', 'user16', 'user8', 'user28', 'user2', 'user0', 'user10',
       'user19', 'user20', 'user5'], dtype=object)

In [49]:
# Distinct course labels from the test records (order of first appearance).
# This rebinds `courses`, replacing the list taken from the training data.
# NOTE(review): the columns of the test matrix come from the test split only,
# so their number and order differ from the train matrix (the recorded
# outputs show 13 courses for train and 11 for test) — confirm that any
# downstream comparison aligns the columns first.
courses = test_df['course'].unique()
courses[:10]

array(['course0', 'course12', 'course5', 'course3', 'course8', 'course4',
       'course6', 'course2', 'course7', 'course11'], dtype=object)

In [50]:
# Start the test matrix as a one-column frame holding one row per distinct
# test user. This rebinds `new_df`; the per-course indicator columns are
# attached in a later cell.
new_df = pd.DataFrame({'user': users})
new_df.head()

Unnamed: 0,user
0,user3
1,user16
2,user8
3,user28
4,user2


In [51]:
# Plain Python list of the test course labels; these are used as the column
# names of the test matrix. Display the count and the first ten labels.
items_columns = list(courses)
len(items_columns), items_columns[:10]

(11,
 ['course0',
  'course12',
  'course5',
  'course3',
  'course8',
  'course4',
  'course6',
  'course2',
  'course7',
  'course11'])

In [52]:
# Build the 0/1 enrollment indicators for every (user, course) pair in a
# single pass over the data. Testing each pair with a boolean mask over the
# whole frame would cost O(len(users) * len(courses) * len(test_df)).
#
# crosstab counts the records per (user, course); reindex then restores the
# row order of `users` and the column order of `courses` (filling pairs that
# never occur with 0) so array positions line up with the rows of `new_df`.
enroll_counts = pd.crosstab(test_df['user'], test_df['course']).reindex(
    index=users, columns=courses, fill_value=0
)

# items_value maps course label -> int array with one entry per user
# (1 = at least one enrollment record for that pair, 0 = none).
items_value = {c: (enroll_counts[c].to_numpy() > 0).astype(int) for c in courses}

In [53]:
# Attach one 0/1 indicator column per course to the user frame (mutates
# new_df in place; re-running simply overwrites the same columns).
for col in items_columns:
    new_df[col] = items_value[col]

In [54]:
# Preview the assembled matrix: a `user` column followed by one column per course.
new_df.head()

Unnamed: 0,user,course0,course12,course5,course3,course8,course4,course6,course2,course7,course11,course10
0,user3,1,0,0,0,0,0,0,1,0,0,0
1,user16,0,1,1,0,0,0,0,0,0,0,0
2,user8,0,0,1,0,0,0,0,0,0,0,0
3,user28,0,0,0,1,0,0,0,0,0,0,0
4,user2,0,0,0,1,0,0,0,0,0,0,0


In [55]:
# Spot-check a single course column: indicator values for up to the first 30 users.
new_df['course0'].head(30)

0     1
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    1
21    0
22    0
23    0
24    0
25    0
26    0
27    0
Name: course0, dtype: int64

In [56]:
# Persist the test matrix; index=False keeps the row index out of the CSV.
new_df.to_csv('enroll_matrix_test.csv', index=False)