In [86]:
import pandas as pd
import numpy as np
import math

from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

## Create enrollment matrix

In [45]:
# Load the enrollment records from './test/enroll_test.csv'.
# NOTE(review): the variable is called train_df but the path points at a test file — confirm.
train_df = pd.read_csv('./test/enroll_test.csv')

In [46]:
# Preview the first rows to see which columns are available.
train_df.head()

Unnamed: 0.1,Unnamed: 0,enrollment_id,username,course_id,user,course
0,25,193,tp9GLoYyljDV6mILQxrtrv1Iw4qcPl5Z,KHPw0gmg1Ad3V07TqRpyBzA8mRjj7mkt,user655,course11
1,38,334,uQjl8NGtIQkRfoe9edBiMgtYYmdkzVXR,DPnLzkJJqOOPRJfBxIHbQEERiYHu5ila,user660,course2
2,75,677,DBl87nF6WTgoCD6kgTHVE2oB45pDEn6y,DPnLzkJJqOOPRJfBxIHbQEERiYHu5ila,user160,course2
3,168,1424,VBP5PzVQEKJYFH0y5ffEek5WfP9EW7uc,AXUJZGmZ0xaYSWazu8RQ1G5c76ECT1Kd,user388,course1
4,173,1491,lCyf3TRKW0i3K1mBgKz23OONMdCWqNsI,7GRhBDsirIGkRZBtSMEzNTyDr2JQm4xx,user561,course29


In [47]:
# Distinct user labels, in order of first appearance in train_df.
users = train_df['user'].unique()
users[:10]

array(['user655', 'user660', 'user160', 'user388', 'user561', 'user168',
       'user137', 'user672', 'user63', 'user718'], dtype=object)

In [48]:
# Distinct course labels, in order of first appearance in train_df.
courses = train_df['course'].unique()
courses[:10]

array(['course11', 'course2', 'course1', 'course29', 'course22',
       'course23', 'course24', 'course3', 'course30', 'course37'],
      dtype=object)

In [None]:
# Start the enrollment matrix with one row per distinct user.
new_df = pd.DataFrame({'user': users})
new_df.head()

In [None]:
# Course labels as a plain list; these become the matrix column names.
items_columns = list(courses)
len(items_columns), items_columns[:10]

In [None]:
# Gather every observed (user, course) pair once so that each membership test below
# is a set lookup instead of a boolean-mask scan over the whole enrollment table.
enrolled_pairs = set(zip(train_df['user'], train_df['course']))

# items_value maps each course to a 0/1 integer array aligned with `users`:
# entry i is 1 when users[i] has at least one enrollment row for that course.
items_value = {
    c: np.array([int((u, c) in enrolled_pairs) for u in users], dtype=int)
    for c in courses
}

In [None]:
# Attach one 0/1 column per course to the user frame.
for course_name in items_columns:
    enrolled_flags = items_value[course_name]
    new_df[course_name] = enrolled_flags

In [None]:
# Preview the user frame after the per-course columns have been added.
new_df.head()

In [None]:
# Spot-check the first 30 enrollment flags of the 'course0' column.
new_df['course0'][:30]

In [None]:
# Persist the enrollment matrix without writing the row index as a column.
new_df.to_csv('enroll_matrix_test.csv', index=False)

### Magnitude normalization

In [3]:
# Load a saved user-by-course enrollment matrix.
# NOTE(review): this reads 'enroll_matrix.csv', while the earlier save cell writes
# 'enroll_matrix_test.csv' — confirm which file is intended here.
data = pd.read_csv('enroll_matrix.csv')

In [4]:
# Preview the loaded matrix: a 'user' column followed by one column per course.
data.head()

Unnamed: 0,user,course0,course1,course2,course3,course4,course5,course6,course7,course8,...,course29,course30,course31,course32,course33,course34,course35,course36,course37,course38
0,user0,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,user1,0,0,0,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,user2,0,0,1,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,user3,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,user4,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Keep only the numeric course columns. `columns=` is used because passing the axis
# positionally (`drop('user', 1)`) is rejected by pandas 2.x.
data_items = data.drop(columns='user')

In [6]:
# Preview the course-only frame.
data_items.head()

Unnamed: 0,course0,course1,course2,course3,course4,course5,course6,course7,course8,course9,...,course29,course30,course31,course32,course33,course34,course35,course36,course37,course38
0,1,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Example: user0 is enrolled in 5 courses, so after normalization each of
# their non-zero entries becomes 1 / sqrt(5) = 0.4472135954999579
data_items.iloc[0].head()

course0    1
course1    1
course2    1
course3    1
course4    1
Name: 0, dtype: int64

In [9]:
#------------------------
# ITEM-ITEM CALCULATIONS
#------------------------

# As a first step, scale each user's row to a unit vector.

# magnitude = sqrt(x^2 + y^2 + z^2 + ...), one value per row (user)
magnitude = np.sqrt(np.square(data_items).sum(axis=1))

# unitvector = (x / magnitude, y / magnitude, z / magnitude, ...)
# A row with no enrollments has magnitude 0; dividing by 1 instead keeps that row
# at all zeros rather than producing NaN from 0 / 0.
data_items = data_items.divide(magnitude.where(magnitude > 0, 1.0), axis='index')

In [72]:
# Check the normalized values of the 'course0' column for the first rows.
data_items['course0'].head()

0    0.447214
1    0.000000
2    0.000000
3    0.000000
4    0.000000
Name: course0, dtype: float64

### Calculate similarity 
Sau khi normalize, tính toán similarity giữa các course

### C1: Dùng thư viện

In [11]:
def calculate_similarity(data_items):
    """Compute the pairwise cosine similarity between the columns of ``data_items``.

    The frame is converted to a SciPy CSR matrix and transposed so that each
    original column becomes a row before being passed to ``cosine_similarity``.

    Parameters
    ----------
    data_items : pandas.DataFrame
        Numeric frame; every column is treated as one vector.

    Returns
    -------
    pandas.DataFrame
        Square frame whose index and columns are both ``data_items.columns``;
        entry (a, b) is the cosine similarity between columns a and b.
    """
    column_labels = data_items.columns
    column_vectors = sparse.csr_matrix(data_items).transpose()
    return pd.DataFrame(
        data=cosine_similarity(column_vectors),
        index=column_labels,
        columns=column_labels,
    )

# Build the column-by-column (course-by-course) similarity matrix
data_matrix = calculate_similarity(data_items)

In [12]:
# Preview the similarity matrix; the diagonal is each course compared with itself.
data_matrix.head()

Unnamed: 0,course0,course1,course2,course3,course4,course5,course6,course7,course8,course9,...,course29,course30,course31,course32,course33,course34,course35,course36,course37,course38
course0,1.0,0.25543,0.228659,0.09987,0.30239,0.097112,0.134791,0.252821,0.100953,0.081259,...,0.117115,0.158593,0.04847,0.156078,0.005749,0.034251,0.102677,0.074796,0.128417,0.118733
course1,0.25543,1.0,0.362162,0.117666,0.270566,0.093264,0.111364,0.304821,0.188718,0.141184,...,0.145529,0.189447,0.063701,0.184,0.064464,0.04392,0.092525,0.046223,0.163288,0.148123
course2,0.228659,0.362162,1.0,0.109384,0.22377,0.13255,0.100953,0.26025,0.144728,0.114774,...,0.15019,0.127854,0.057839,0.157235,0.050331,0.035229,0.086854,0.054597,0.132103,0.125794
course3,0.09987,0.117666,0.109384,1.0,0.140048,0.057933,0.037545,0.154072,0.082974,0.158858,...,0.10638,0.060202,0.192617,0.04844,0.119077,0.116081,0.19658,0.131765,0.145645,0.133472
course4,0.30239,0.270566,0.22377,0.140048,1.0,0.086773,0.196754,0.273517,0.219904,0.151197,...,0.098708,0.193882,0.124539,0.180991,0.08102,0.065841,0.098127,0.14814,0.099005,0.115676


### C2: Tính toán 
Ví dụ: tính cosine similarity giữa hai course, course0 và course1

In [84]:
# Show the first 10 rows and first 10 columns of the normalized matrix.
data_items.iloc[:10, :10]

Unnamed: 0,course0,course1,course2,course3,course4,course5,course6,course7,course8,course9
0,0.447214,0.447214,0.447214,0.447214,0.447214,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.447214,0.447214,0.447214,0.447214,0.447214
2,0.0,0.0,0.408248,0.0,0.0,0.408248,0.0,0.408248,0.408248,0.0
3,0.0,0.0,0.0,0.0,0.0,0.408248,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.408248,0.0,0.0,0.0,0.0,0.0,0.408248
5,0.0,0.447214,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.408248,0.408248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.447214
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214


In [92]:
# Normalized column for course0 (one value per user row).
c0 = data_items['course0']

In [93]:
# Normalized column for course1 (one value per user row).
c1 = data_items['course1']

In [94]:
# Show the first five entries of each column and the number of rows.
c0[:5], c1[:5], len(c0)

(0    0.447214
 1    0.000000
 2    0.000000
 3    0.000000
 4    0.000000
 Name: course0, dtype: float64, 0    0.447214
 1    0.000000
 2    0.000000
 3    0.000000
 4    0.000000
 Name: course1, dtype: float64, 720)

In [95]:
# Manual cosine similarity of the two course columns: their dot product divided by
# the product of their Euclidean norms.
dot_01 = np.dot(c0, c1)
norm_c0 = math.sqrt(np.dot(c0, c0))
norm_c1 = math.sqrt(np.dot(c1, c1))
sim01 = dot_01 / (norm_c0 * norm_c1)
sim01

0.25542984243869366

In [70]:
# Element-wise squares of the first 10 course0 entries (the terms summed in the norm).
(c0**2)[:10]

0    0.200000
1    0.000000
2    0.000000
3    0.000000
4    0.000000
5    0.000000
6    0.166667
7    0.000000
8    0.000000
9    0.000000
Name: course0, dtype: float64

### Dự đoán