## Recommendation System Algorithm

### 1. Kết nối với CSDL và xử lý data

In [1]:
import numpy as np
import pandas as pd
import torch
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

In [2]:
# determine the supported device
def get_device():
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')
    return device

get_device()

device(type='cuda', index=0)

Kết nối với MongoDB

In [3]:
# uri = "mongodb+srv://<username>:<password>@cluster0.jmil5cr.mongodb.net/" # Thay user, pass của mình vào 
uri = "mongodb+srv://user1:test@cluster0.jmil5cr.mongodb.net/" 

# Connect to server
client = MongoClient(uri, server_api=ServerApi('1'))

# Check if the connection is successful
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


Kết nối với CSDL

In [4]:
db = client['dtu']

In [5]:
history = db["answered_questions"] # Collection contains the 100 most recent questions the player has played

Cấu trúc dữ liệu

In [6]:
pipeline = [{"$unwind": "$questions"}, 
            {"$project": {"_id": 0, 
                          "player": "$playerId._id", 
                          "question": "$questions._id", 
                          "player_major": "$playerId.major", 
                          "player_rank": "$playerId.rank", 
                          "question_diff": "$questions.difficulty",
                          "question_category": "$questions.category",
                          "time": "$questions.timeForAnswer",
                          "status": "$questions.status"}}]

In [7]:
# Get data from DB
data = history.aggregate(pipeline)

Chuyển thành DataFrame

In [8]:
df = pd.DataFrame(list(history.aggregate(pipeline)))

In [9]:
df.head()

Unnamed: 0,player,question,player_major,player_rank,question_diff,question_category,time,status
0,65fbfc409a31efcf7a3f9e6d,65fbfb83b5440169b33e0950,"[His, Physics, Geo, Literature]",8,1,Literature,23,1
1,65fbfc409a31efcf7a3f9e6d,65fbfb83b5440169b33df7ab,"[His, Physics, Geo, Literature]",8,4,Physics,8,1
2,65fbfc409a31efcf7a3f9e6d,65fbf56a4dba71a085a1e048,"[His, Physics, Geo, Literature]",8,5,Geo,12,0
3,65fbfc409a31efcf7a3f9e6d,65fbf56a4dba71a085a1e4c2,"[His, Physics, Geo, Literature]",8,2,Physics,22,1
4,65fbfc409a31efcf7a3f9e6d,65fbfb83b5440169b33e1019,"[His, Physics, Geo, Literature]",8,1,Literature,27,1


In [10]:
df_players_per_question = (
    df.groupby(["question"]).agg({"player": "nunique"}).reset_index()
)
df_players_per_question.columns = ["question", "num_of_players"]
df_players_per_question.head()

Unnamed: 0,question,num_of_players
0,65fbf56a4dba71a085a1d374,63
1,65fbf56a4dba71a085a1d375,61
2,65fbf56a4dba71a085a1d376,52
3,65fbf56a4dba71a085a1d377,46
4,65fbf56a4dba71a085a1d378,45


In [11]:
# Retrieve a list of questions that have more than n players (e.g: n = 2)
num_of_players_threshold = 2

mask = df_players_per_question["num_of_players"] >= num_of_players_threshold
valid_questions = set(df_players_per_question.loc[mask, "question"].tolist())

In [12]:
# filter invalid records
df_filter_ques = df[df["question"].isin(valid_questions)].copy()

In [13]:
df.shape

(1000000, 8)

In [14]:
df_filter_ques.shape

(1000000, 8)

### 2. Áp dụng thuật toán đề xuất

In [15]:
unique_players = df_filter_ques["player"].unique()
print(unique_players)
player_ids = dict(
    zip(unique_players, np.arange(unique_players.shape[0])))
print(player_ids)

unique_questions = df_filter_ques["question"].unique()
question_ids = dict(
    zip(unique_questions, np.arange(unique_questions.shape[0])))

df_filter_ques["player_id"] = df_filter_ques["player"].apply(
    lambda i: player_ids[i]
)
df_filter_ques["question_id"] = df_filter_ques["question"].apply(
    lambda i: question_ids[i]
)

[ObjectId('65fbfc409a31efcf7a3f9e6d') ObjectId('65fbfc409a31efcf7a3fb03a')
 ObjectId('65fbfc409a31efcf7a3fa76e') ...
 ObjectId('65fbfc409a31efcf7a3f9727') ObjectId('65fbfc409a31efcf7a3fb2d1')
 ObjectId('65fbfc409a31efcf7a3fad3c')]
{ObjectId('65fbfc409a31efcf7a3f9e6d'): 0, ObjectId('65fbfc409a31efcf7a3fb03a'): 1, ObjectId('65fbfc409a31efcf7a3fa76e'): 2, ObjectId('65fbfc409a31efcf7a3f9666'): 3, ObjectId('65fbfc409a31efcf7a3facf0'): 4, ObjectId('65fbfc409a31efcf7a3f9584'): 5, ObjectId('65fbfc409a31efcf7a3fa604'): 6, ObjectId('65fbfc409a31efcf7a3f948c'): 7, ObjectId('65fbfc409a31efcf7a3fae3c'): 8, ObjectId('65fbfc409a31efcf7a3f9b7d'): 9, ObjectId('65fbfc409a31efcf7a3f9d02'): 10, ObjectId('65fbfc409a31efcf7a3fa6b5'): 11, ObjectId('65fbfc409a31efcf7a3f93dc'): 12, ObjectId('65fbfc409a31efcf7a3fb085'): 13, ObjectId('65fbfc409a31efcf7a3f992c'): 14, ObjectId('65fbfc409a31efcf7a3faaff'): 15, ObjectId('65fbfc409a31efcf7a3fb43a'): 16, ObjectId('65fbfc409a31efcf7a3f9d81'): 17, ObjectId('65fbfc409a31

In [16]:
print("Number of players: ", len(player_ids))
print("Number of questions: ", len(question_ids))

Number of players:  6349
Number of questions:  20000


In [17]:
# Get player majors
unique_majors = df_filter_ques["player_major"].explode().unique()
unique_majors

array(['His', 'Physics', 'Geo', 'Literature', 'Math', 'Eng'], dtype=object)

In [18]:
# Get question categories
unique_categories = df_filter_ques['question_category'].unique()
unique_categories

array(['Literature', 'Physics', 'Geo', 'His', 'Math', 'Eng'], dtype=object)

Công thức tính "giá trị phù hợp":

$$rating = 0.2 \cdot performance + 0.3 \cdot similarity(player\_ rank, question\_ difficulty) + 0.5 \cdot similarity(player\_ major, question\_ category)$$

Công thức tính Performance:
$$performance = (1 - \frac{time\_ spent}{time\_ max}) * accuracy$$

In [19]:
def calculate_performance(time_spent, accuracy, difficulty): 
    time_max = 60 + 30 * difficulty
    return accuracy * (1 - time_spent/time_max)
performance = calculate_performance(df_filter_ques["time"], df_filter_ques["status"], df_filter_ques["question_diff"])

Các bước tính độ tương đồng giữa major và category: 
<br>
one hot encoding -> Encode vector (TF-IDF) -> Cosine similarity

In [20]:
# One hot encoding
def one_hot_encoding(df, column_name):
    encoded = pd.get_dummies(df[column_name].explode()) 
    encoded = encoded.groupby(encoded.index).sum()
    return encoded

one_hot_major = one_hot_encoding(df_filter_ques, "player_major") 
one_hot_category = one_hot_encoding(df_filter_ques, "question_category")

# Encode to vector using TF-IDF 
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=True, norm ='l2')
tfidf_major = transformer.fit_transform(one_hot_major.to_numpy().tolist()).toarray()
tfidf_category = transformer.fit_transform(one_hot_category.to_numpy().tolist()).toarray()

# Cosine similarity between player major and question category
sim_player_question = torch.nn.functional.cosine_similarity(torch.tensor(tfidf_major.astype(np.float32)).to(get_device()), 
                                                            torch.tensor(tfidf_category.astype(np.float32)).to(get_device()))
sim_player_question

tensor([0.5009, 0.5022, 0.4983,  ..., 0.5005, 0.5005, 0.5015], device='cuda:0')

Các bước tính độ tương đồng giữa player rank (0 -> 9) và question difficulty (1 -> 5): 
<br>
Normalize -> Absolute distance

In [21]:
# Tính similarity giữa player_rank (0-9) và question_diff (1-5)
MIN_RANK, MAX_RANK = 0, 9
MIN_DIFF, MAX_DIFF = 1, 5

def normalize(df, column_name, min_value, max_value):
    return df[column_name].apply(lambda x: (x - min_value) / (max_value - min_value))

# Normalize
normed_rank = normalize(df, "player_rank", MIN_RANK, MAX_RANK)
normed_diff = normalize(df, "question_diff", MIN_DIFF, MAX_DIFF)

max_value = pd.concat([normed_rank, normed_diff], axis=1).max(axis=1)

# Calculate similarity between player rank and question difficulty
sim_rank_diff = (1 - (normed_rank - normed_diff).abs() / max_value)
sim_rank_diff

0         0.000000
1         0.843750
2         0.888889
3         0.281250
4         0.000000
            ...   
999995    0.675000
999996    0.900000
999997    0.000000
999998    0.675000
999999    0.900000
Length: 1000000, dtype: float64

In [22]:
# Calculate rating
rating = (0.2 * performance) + (0.3 * sim_rank_diff)+ (0.5 * sim_player_question.cpu().numpy())
rating

0         0.399359
1         0.695349
2         0.515806
3         0.498822
4         0.390471
            ...   
999995    0.619179
999996    0.520098
999997    0.250258
999998    0.643869
999999    0.520740
Length: 1000000, dtype: float64

In [23]:
rating.name = "rating"

In [24]:
df_player_ques_rating = pd.concat([df_filter_ques[["player_id", "question_id"]], rating], axis=1)

Đối với những người chơi trả lời lại một câu hỏi đã trả lời trước đó thì ta sẽ lấy mean rating

In [25]:
df_player_ques_rating = df_player_ques_rating.groupby(["player_id", "question_id"]).agg({"rating": "mean"}).reset_index()

In [26]:
df_player_ques_rating.head()

Unnamed: 0,player_id,question_id,rating
0,0,0,0.399359
1,0,1,0.695349
2,0,2,0.515806
3,0,3,0.498822
4,0,4,0.390471


### 3. Turi framework

In [27]:
import turicreate as tc

In [28]:
df_player_ques_rating_test = df_player_ques_rating.head(500000) 
df_player_ques_rating_test

Unnamed: 0,player_id,question_id,rating
0,0,0,0.399359
1,0,1,0.695349
2,0,2,0.515806
3,0,3,0.498822
4,0,4,0.390471
...,...,...,...
499995,2736,12437,0.635973
499996,2736,12475,0.514979
499997,2736,12609,0.649306
499998,2736,12744,0.249639


In [29]:
actions = tc.SFrame(df_player_ques_rating_test)

In [30]:
# model = tc.recommender.create(actions, target="rating", user_id='player_id', item_id='question_id') 
model = tc.factorization_recommender.create(actions, target='rating', user_id='player_id', item_id='question_id')

In [31]:
with open('test.txt', 'w') as f:
    for user in df_player_ques_rating_test["player_id"].unique():
        recs = model.recommend(users=[user], k=5)
        f.write(str(recs) + '\n')
        # print(recs)