# BOJ에서 유저가이 푼 문제 목록 정리

In [3]:
import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import bottleneck as bn
import pickle
import time 
import random
import math
from sklearn.model_selection import train_test_split
from scipy import sparse

In [2]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

def add_ids(page_num : int, user_ids: list, sample_len : int = 5):
    data = requests.get(f'https://www.acmicpc.net/ranklist/{page_num}', headers=headers)
    soup = BeautifulSoup(data.text, 'html.parser')
    trs = soup.select('tbody > tr')

    user_len = len(trs) # 최대 100명 임
    
    if user_len < sample_len :
        sample_len = user_len

    sample_nums = random.sample(range(0,user_len),sample_len)

    for num in sample_nums:
        tr = trs[num]
        user_ids.append(tr.select_one('td:nth-child(2) > a').text)
        
        
def add_group_ids(group_num,page_num, user_ids : list):
    data = requests.get(f'https://www.acmicpc.net/school/ranklist/{group_num}/{page_num}', headers=headers)
    soup = BeautifulSoup(data.text, 'html.parser')
    trs = soup.select('tbody > tr')

    for tr in trs:
        user_ids.append(tr.select_one('td:nth-child(2) > a').text)

def add_to_user_problem_mat(idx, id, user_problem_mat : np.array):
    data = requests.get(f'https://www.acmicpc.net/user/{id}', headers=headers)
    soup = BeautifulSoup(data.text, 'html.parser')
    trs = soup.select('div.problem-list')

    for tr in trs:
        problem_nums = tr.select('a')
            
        for problem_num in problem_nums :

            problem_num = int(problem_num.text) - 1000
            #print(problem_num)
            try:
                user_problem_mat[idx,problem_num] = 1
            except:
                print("범위를 벗어난 문제 번호 : " + str(problem_num))
                
def gen_user_problem_mat(id, user_problem : dict, problem_num_set : set):
    data = requests.get(f'https://www.acmicpc.net/user/{id}', headers=headers)
    soup = BeautifulSoup(data.text, 'html.parser')
    trs = soup.select('div.problem-list')

    time.sleep(0.1)
    
    user_problem[id] = []
    
    for tr in trs:
        problem_nums = tr.select('a')
        for problem_num in problem_nums :
            problem_num = int(problem_num.text)
            #print(problem_num)
            user_problem[id].append(problem_num)
            problem_num_set.add(problem_num)
                
def add_to_tag_pb(pb_nums : str, tag_pb : dict, tag_name_to_id : dict, num_problem: int):    
    resp = requests.get(f"https://solved.ac/api/v3/problem/lookup?problemIds={pb_nums}", headers = {'Content-Type': 'application/json'})
    
    if resp.status_code != 200: 
        print(f'{resp.status_code} : {pb_num} - error!!!')
        return
    
    jsons = resp.json()
    #print(resp)
    time.sleep(1)
    
    for json in jsons:
        tags = json['tags']
        pb_num = json['problemId']
        tag_pb[pb_num] = []
        for tag in tags:
            for displayName in tag['displayNames']:
                if displayName['language'] == 'ko' :
                    tag_name = tag['displayNames'][0]['name']
                    break
            tag_pb[pb_num].append(tag_name)
            tag_name_to_id[tag_name] = tag['bojTagId']
    

In [3]:
st_pb_num = 1000
ed_pb_num = 27981
num_problem = ed_pb_num - st_pb_num + 1 # 1000 ~ 27981

# 태그 분석

In [4]:
# 태그-문제 매트릭 생성
tag_pb = {}
tag_name_to_id = {}
num_sample_pbs = num_problem
num_slice = 100
#rand_pbs = np.random.randint(st_pb_num, ed_pb_num, num_sample_pbs)
rand_pbs = list(range(st_pb_num,ed_pb_num+1))

In [5]:
for idx in tqdm(range(0,num_sample_pbs,num_slice)) :
    rand_pbs_str = ','.join(str(e) for e in rand_pbs[idx:idx+num_slice])
    add_to_tag_pb(rand_pbs_str, tag_pb, tag_name_to_id, num_problem)    

100%|████████████████████████████████████████████████████████████████████████████████| 270/270 [05:11<00:00,  1.15s/it]


In [8]:
tags = tag_name_to_id.keys()
print(len(tag_pb.keys()))
tag_pb_mat = {}
for pb in tag_pb.keys():
    tag_pb_mat[pb] = {tag:0 for tag in tags}
    for tag in tag_pb[pb]:
        tag_pb_mat[pb][tag] = 1    

26188


In [None]:
tag_list_all = np.array(list(tag_pb_mat.keys()))
np.save('./tag_list_all.npy',tag_list_all)

sparse_coo = sparse.coo_matrix(tag_problem_mat)
sparse.save_npz('./tag_problem_mat_all.npz', sparse_coo)

In [4]:
## 태그 분석 ##

# pd.set_option('display.max_columns', None) # 모든 열 출력
# pd.options.display.max_rows = 60 # 행 출력 초기화
# pd.options.display.max_columns = 20 # 열 출력 초기화

def get_ranking(mat, selected_tags, topk = 4):
    tag_corr_ranking = {}
    for tag in selected_tags:
        tag_corr_ranking[tag]= list(mat[tag].sort_values(ascending=False).index)[1:1+topk]
    return tag_corr_ranking

selected_tags = ['그리디 알고리즘', '다이나믹 프로그래밍', '브루트포스 알고리즘', '이분 탐색',
                '너비 우선 탐색', '깊이 우선 탐색', '데이크스트라', '플로이드–워셜', '비트마스킹', '분리 집합']

In [43]:
# tag_problem_mat = pd.read_csv('./dataset/tag_problem_mat_all.csv', index_col=0)
# tag_list_all = np.array(tag_problem_mat.index)
# np.save('./tag_list_all.npy',tag_list_all)
# prob_num_list_all = np.array(tag_problem_mat.columns)
# np.save('./prob_num_list_all.npy',prob_num_list_all)

In [None]:
tag_problem_mat = sparse.load_npz('./dataset/tag_problem_mat_all.npz').toarray()
tag_list_all = np.load('./tag_list_all.npy', allow_pickle=True)

In [24]:
tag_problem_mat_trans = tag_problem_mat.T
tag_problem_mat_trans = tag_problem_mat_trans[selected_tags]
tag_problem_mat = tag_problem_mat_trans.T

In [26]:
def get_problems_by_category(tag_name, tag_problem_mat):
    selected_probs_by_tag = []
    for tag_pro_row in tag_problem_mat:
        if tag_problem_mat[tag_pro_row][tag_name]:
            selected_probs_by_tag.append(int(tag_pro_row) - 1000)
    return selected_probs_by_tag

def set_tag_problem(tag_problem_mat):
    selected_probs_by_tags = {}
    idx_to_num = {}

    for tag in selected_tags:
        selected_probs_by_tags[tag] = get_problems_by_category(tag, tag_problem_mat)
        idx_to_num[tag] = dict(zip(range(len(selected_probs_by_tags[tag])), selected_probs_by_tags[tag]))

    return selected_probs_by_tags,idx_to_num

selected_probs_by_tags,idx_to_num = set_tag_problem(tag_problem_mat)

In [34]:
for key, value in selected_probs_by_tags.items():
    print(key + " - " + str(len(value)))

그리디 알고리즘 - 1841
다이나믹 프로그래밍 - 3140
브루트포스 알고리즘 - 1681
이분 탐색 - 930
너비 우선 탐색 - 803
깊이 우선 탐색 - 668
데이크스트라 - 446
플로이드–워셜 - 141
비트마스킹 - 565
분리 집합 - 355


In [16]:
pearson = tag_problem_mat_trans.corr()

In [17]:
pearson

Unnamed: 0,그리디 알고리즘,다이나믹 프로그래밍,브루트포스 알고리즘,이분 탐색,너비 우선 탐색,깊이 우선 탐색,데이크스트라,플로이드–워셜,비트마스킹,분리 집합
그리디 알고리즘,1.0,0.005177,-0.01656,0.01745,-0.030713,0.011406,-0.020033,-0.01615,-0.004852,0.011681
다이나믹 프로그래밍,0.005177,1.0,-0.046789,0.000312,-0.024056,0.016332,0.000476,-0.004668,0.1693,-0.018872
브루트포스 알고리즘,-0.01656,-0.046789,1.0,-0.021632,0.003123,0.007037,-0.017615,0.008408,0.071561,-0.025313
이분 탐색,0.01745,0.000312,-0.021632,1.0,-0.006602,-0.014032,0.003447,-0.005659,-0.005772,0.032818
너비 우선 탐색,-0.030713,-0.024056,0.003123,-0.006602,1.0,0.269051,0.055335,0.023234,0.033043,0.017458
깊이 우선 탐색,0.011406,0.016332,0.007037,-0.014032,0.269051,1.0,-0.010064,0.017883,0.00098,0.066907
데이크스트라,-0.020033,0.000476,-0.017615,0.003447,0.055335,-0.010064,1.0,0.070975,0.017018,0.002435
플로이드–워셜,-0.01615,-0.004668,0.008408,-0.005659,0.023234,0.017883,0.070975,1.0,0.010623,-0.004113
비트마스킹,-0.004852,0.1693,0.071561,-0.005772,0.033043,0.00098,0.017018,0.010623,1.0,-0.012862
분리 집합,0.011681,-0.018872,-0.025313,0.032818,0.017458,0.066907,0.002435,-0.004113,-0.012862,1.0


In [22]:
get_ranking(pearson, selected_tags)

{'그리디 알고리즘': ['이분 탐색', '깊이 우선 탐색', '다이나믹 프로그래밍', '플로이드–워셜'],
 '다이나믹 프로그래밍': ['깊이 우선 탐색', '그리디 알고리즘', '이분 탐색', '데이크스트라'],
 '브루트포스 알고리즘': ['플로이드–워셜', '깊이 우선 탐색', '너비 우선 탐색', '그리디 알고리즘'],
 '이분 탐색': ['그리디 알고리즘', '데이크스트라', '다이나믹 프로그래밍', '플로이드–워셜'],
 '너비 우선 탐색': ['깊이 우선 탐색', '데이크스트라', '플로이드–워셜', '브루트포스 알고리즘'],
 '깊이 우선 탐색': ['너비 우선 탐색', '다이나믹 프로그래밍', '플로이드–워셜', '그리디 알고리즘'],
 '데이크스트라': ['플로이드–워셜', '너비 우선 탐색', '이분 탐색', '다이나믹 프로그래밍'],
 '플로이드–워셜': ['데이크스트라', '너비 우선 탐색', '깊이 우선 탐색', '브루트포스 알고리즘']}

In [62]:
intersect = tag_problem_mat.dot(tag_problem_mat_trans)
#intersect.to_csv("./tag_tag_mat.csv")

In [63]:
intersect

Unnamed: 0,그리디 알고리즘,다이나믹 프로그래밍,브루트포스 알고리즘,이분 탐색,너비 우선 탐색,깊이 우선 탐색,데이크스트라,플로이드–워셜,비트마스킹,분리 집합
그리디 알고리즘,1257,144,66,59,9,41,9,1,22,23
다이나믹 프로그래밍,144,2312,82,76,37,68,38,10,211,16
브루트포스 알고리즘,66,82,1289,26,43,44,8,9,82,3
이분 탐색,59,76,26,663,19,9,16,2,10,26
너비 우선 탐색,9,37,43,19,606,170,40,11,25,16
깊이 우선 탐색,41,68,44,9,170,480,7,8,8,31
데이크스트라,9,38,8,16,40,7,348,15,12,5
플로이드–워셜,1,10,9,2,11,8,15,119,6,1
비트마스킹,22,211,82,10,25,8,12,6,409,2
분리 집합,23,16,3,26,16,31,5,1,2,258


In [64]:
Jaccard = {}
for in_tag in selected_tags:
    Jaccard[in_tag] = {}
    for out_tag in selected_tags:
        union = intersect[in_tag][in_tag] + intersect[out_tag][out_tag] - intersect[in_tag][out_tag]
        Jaccard[in_tag][out_tag] = intersect[in_tag][out_tag] / union

In [65]:
Jaccard = pd.DataFrame(Jaccard)

In [66]:
Jaccard

Unnamed: 0,그리디 알고리즘,다이나믹 프로그래밍,브루트포스 알고리즘,이분 탐색,너비 우선 탐색,깊이 우선 탐색,데이크스트라,플로이드–워셜,비트마스킹,분리 집합
그리디 알고리즘,1.0,0.042044,0.026613,0.031703,0.004854,0.024175,0.005639,0.000727,0.013382,0.015416
다이나믹 프로그래밍,0.042044,1.0,0.023302,0.026216,0.012843,0.024963,0.014493,0.004131,0.084064,0.006265
브루트포스 알고리즘,0.026613,0.023302,1.0,0.013499,0.023218,0.025507,0.004911,0.006433,0.050743,0.001943
이분 탐색,0.031703,0.026216,0.013499,1.0,0.0152,0.007937,0.01608,0.002564,0.009416,0.02905
너비 우선 탐색,0.004854,0.012843,0.023218,0.0152,1.0,0.18559,0.043764,0.015406,0.025253,0.018868
깊이 우선 탐색,0.024175,0.024963,0.025507,0.007937,0.18559,1.0,0.008526,0.013536,0.009081,0.043847
데이크스트라,0.005639,0.014493,0.004911,0.01608,0.043764,0.008526,1.0,0.033186,0.016107,0.008319
플로이드–워셜,0.000727,0.004131,0.006433,0.002564,0.015406,0.013536,0.033186,1.0,0.011494,0.00266
비트마스킹,0.013382,0.084064,0.050743,0.009416,0.025253,0.009081,0.016107,0.011494,1.0,0.003008
분리 집합,0.015416,0.006265,0.001943,0.02905,0.018868,0.043847,0.008319,0.00266,0.003008,1.0


In [104]:
new_Jaccard = {}
for in_tag in selected_tags:
    new_Jaccard[in_tag] = {}
    for out_tag in selected_tags:
        new_Jaccard[in_tag][out_tag] = intersect[in_tag][out_tag] / intersect[in_tag][in_tag]

In [105]:
new_Jaccard = pd.DataFrame(new_Jaccard)

In [106]:
new_Jaccard.T

Unnamed: 0,그리디 알고리즘,다이나믹 프로그래밍,브루트포스 알고리즘,이분 탐색,너비 우선 탐색,깊이 우선 탐색,데이크스트라,플로이드–워셜,비트마스킹,분리 집합
그리디 알고리즘,1.0,0.114558,0.052506,0.046937,0.00716,0.032617,0.00716,0.000796,0.017502,0.018298
다이나믹 프로그래밍,0.062284,1.0,0.035467,0.032872,0.016003,0.029412,0.016436,0.004325,0.091263,0.00692
브루트포스 알고리즘,0.051202,0.063615,1.0,0.020171,0.033359,0.034135,0.006206,0.006982,0.063615,0.002327
이분 탐색,0.088989,0.11463,0.039216,1.0,0.028658,0.013575,0.024133,0.003017,0.015083,0.039216
너비 우선 탐색,0.014851,0.061056,0.070957,0.031353,1.0,0.280528,0.066007,0.018152,0.041254,0.026403
깊이 우선 탐색,0.085417,0.141667,0.091667,0.01875,0.354167,1.0,0.014583,0.016667,0.016667,0.064583
데이크스트라,0.025862,0.109195,0.022989,0.045977,0.114943,0.020115,1.0,0.043103,0.034483,0.014368
플로이드–워셜,0.008403,0.084034,0.07563,0.016807,0.092437,0.067227,0.12605,1.0,0.05042,0.008403
비트마스킹,0.05379,0.515892,0.200489,0.02445,0.061125,0.01956,0.02934,0.01467,1.0,0.00489
분리 집합,0.089147,0.062016,0.011628,0.100775,0.062016,0.120155,0.01938,0.003876,0.007752,1.0


In [97]:
jac = np.array(new_Jaccard)

In [98]:
np.fill_diagonal(jac,0)

In [100]:
np.max(jac, axis=1)

array([0.11455847, 0.09126298, 0.06361521, 0.11463047, 0.28052805,
       0.35416667, 0.11494253, 0.12605042, 0.51589242, 0.12015504])

In [107]:
get_ranking(new_Jaccard, selected_tags)

{'그리디 알고리즘': ['다이나믹 프로그래밍', '브루트포스 알고리즘', '이분 탐색', '깊이 우선 탐색'],
 '다이나믹 프로그래밍': ['비트마스킹', '그리디 알고리즘', '브루트포스 알고리즘', '이분 탐색'],
 '브루트포스 알고리즘': ['다이나믹 프로그래밍', '비트마스킹', '그리디 알고리즘', '깊이 우선 탐색'],
 '이분 탐색': ['다이나믹 프로그래밍', '그리디 알고리즘', '브루트포스 알고리즘', '분리 집합'],
 '너비 우선 탐색': ['깊이 우선 탐색', '브루트포스 알고리즘', '데이크스트라', '다이나믹 프로그래밍'],
 '깊이 우선 탐색': ['너비 우선 탐색', '다이나믹 프로그래밍', '브루트포스 알고리즘', '그리디 알고리즘'],
 '데이크스트라': ['너비 우선 탐색', '다이나믹 프로그래밍', '이분 탐색', '플로이드–워셜'],
 '플로이드–워셜': ['데이크스트라', '너비 우선 탐색', '다이나믹 프로그래밍', '브루트포스 알고리즘'],
 '비트마스킹': ['다이나믹 프로그래밍', '브루트포스 알고리즘', '너비 우선 탐색', '그리디 알고리즘'],
 '분리 집합': ['깊이 우선 탐색', '이분 탐색', '그리디 알고리즘', '다이나믹 프로그래밍']}

# 유저-문제 분석

In [11]:
from scipy import sparse
import scipy

In [4]:
## 유저 랭킹 페이지에서 선택 ##
page_st = 31
page_ed = 280

user_ids = []

for num in tqdm(range(page_st, page_ed+1)):
    add_ids(num, user_ids,10)

100%|████████████████████████████████████████████████████████████████████████████████| 250/250 [02:47<00:00,  1.49it/s]


In [5]:
start = time.time()

user_problem_mat = np.zeros((len(user_ids),num_problem), dtype=np.float16)

for idx, id in tqdm(enumerate(user_ids), position=0):
    add_to_user_problem_mat(idx, id, user_problem_mat)

end = time.time()

print(f"{end - start:.5f} sec")

67it [00:27,  2.20it/s]

범위를 벗어난 문제 번호 : 27014


79it [00:32,  2.24it/s]

범위를 벗어난 문제 번호 : 27014
범위를 벗어난 문제 번호 : 27015


100it [00:40,  2.72it/s]

범위를 벗어난 문제 번호 : 26986
범위를 벗어난 문제 번호 : 27014
범위를 벗어난 문제 번호 : 27015
범위를 벗어난 문제 번호 : 27017
범위를 벗어난 문제 번호 : 27018
범위를 벗어난 문제 번호 : 27019
범위를 벗어난 문제 번호 : 27016


107it [00:43,  2.65it/s]

범위를 벗어난 문제 번호 : 27014


138it [00:55,  2.61it/s]

범위를 벗어난 문제 번호 : 27014


156it [01:03,  1.93it/s]

범위를 벗어난 문제 번호 : 27014
범위를 벗어난 문제 번호 : 27015
범위를 벗어난 문제 번호 : 27016


202it [01:25,  2.41it/s]

범위를 벗어난 문제 번호 : 27014
범위를 벗어난 문제 번호 : 27015


206it [01:27,  2.17it/s]

범위를 벗어난 문제 번호 : 27014
범위를 벗어난 문제 번호 : 27015


345it [02:24,  2.34it/s]

범위를 벗어난 문제 번호 : 27014
범위를 벗어난 문제 번호 : 27015
범위를 벗어난 문제 번호 : 27017
범위를 벗어난 문제 번호 : 27018


357it [02:29,  2.35it/s]

범위를 벗어난 문제 번호 : 27014
범위를 벗어난 문제 번호 : 27015


418it [03:15,  2.54it/s]

범위를 벗어난 문제 번호 : 27014
범위를 벗어난 문제 번호 : 27015
범위를 벗어난 문제 번호 : 27017
범위를 벗어난 문제 번호 : 27018
범위를 벗어난 문제 번호 : 27016


422it [03:17,  2.46it/s]

범위를 벗어난 문제 번호 : 27015
범위를 벗어난 문제 번호 : 27017
범위를 벗어난 문제 번호 : 27018
범위를 벗어난 문제 번호 : 27019


643it [04:45,  2.90it/s]

범위를 벗어난 문제 번호 : 27014
범위를 벗어난 문제 번호 : 27015
범위를 벗어난 문제 번호 : 27016
범위를 벗어난 문제 번호 : 27017


695it [05:04,  2.73it/s]

범위를 벗어난 문제 번호 : 27014
범위를 벗어난 문제 번호 : 27015
범위를 벗어난 문제 번호 : 27017
범위를 벗어난 문제 번호 : 27018


743it [05:23,  2.42it/s]

범위를 벗어난 문제 번호 : 27014
범위를 벗어난 문제 번호 : 27015


746it [05:24,  2.41it/s]

범위를 벗어난 문제 번호 : 27014


868it [06:13,  2.69it/s]

범위를 벗어난 문제 번호 : 27014
범위를 벗어난 문제 번호 : 27015


1063it [07:31,  1.75it/s]

범위를 벗어난 문제 번호 : 27017


1382it [09:58,  2.22it/s]

범위를 벗어난 문제 번호 : 27014
범위를 벗어난 문제 번호 : 27015


1669it [12:23,  2.01it/s]

범위를 벗어난 문제 번호 : 27014


1727it [12:55,  2.10it/s]

범위를 벗어난 문제 번호 : 27014


1806it [13:37,  2.35it/s]

범위를 벗어난 문제 번호 : 27014
범위를 벗어난 문제 번호 : 27015


1844it [13:55,  2.33it/s]

범위를 벗어난 문제 번호 : 27014
범위를 벗어난 문제 번호 : 27015
범위를 벗어난 문제 번호 : 27017


1977it [15:03,  2.19it/s]

범위를 벗어난 문제 번호 : 27014


2063it [15:53,  1.72it/s]

범위를 벗어난 문제 번호 : 27014


2091it [16:12,  1.56it/s]

범위를 벗어난 문제 번호 : 27014


2228it [17:28,  2.05it/s]

범위를 벗어난 문제 번호 : 27014
범위를 벗어난 문제 번호 : 27018
범위를 벗어난 문제 번호 : 27015
범위를 벗어난 문제 번호 : 27017


2313it [18:08,  1.57it/s]

범위를 벗어난 문제 번호 : 27015
범위를 벗어난 문제 번호 : 27014
범위를 벗어난 문제 번호 : 27017
범위를 벗어난 문제 번호 : 27018


2500it [19:35,  2.13it/s]

1175.89724 sec





In [8]:
sparse_coo = sparse.coo_matrix(user_problem_mat)

In [13]:
sparse.save_npz('./user_problem_mat.npz', sparse_coo)

In [26]:
train_user_problem_mat, test_user_problem_mat = train_test_split(user_problem_mat,
                                                                        test_size=0.3, 
                                                                        shuffle=True, 
                                                                        random_state=1004)

In [27]:
# sparse matrix로 저장
sparse_coo_train = sparse.coo_matrix(train_user_problem_mat)
sparse_coo_test = sparse.coo_matrix(test_user_problem_mat)

sparse.save_npz('./train_user_problem_mat.npz', sparse_coo_train)
sparse.save_npz('./test_user_problem_mat.npz', sparse_coo_test)

In [78]:
# dataframe으로 저장
idx_to_id = {}
id_to_idx = {}
for i, id in enumerate(user_ids):
    idx_to_id[i] = id
    id_to_idx[id] = i
    
df_train = pd.DataFrame(train_user_problem_mat)
df_train.to_csv("./train_user_problem_mat.csv")

df_test = pd.DataFrame(test_user_problem_mat)
df_test.to_csv("./test_user_problem_mat.csv")
#df_rename = df.rename(index=idx_to_id) #행(row) 이름 바꾸기
#df_rename.to_csv("./test_user_problem_mat.csv")

In [10]:
test_user_problem_mat.shape

(750, 26982)

In [111]:
## 한 그룹 내에서 선택 ##
group_num = 302 # 서울시립대학교
page_st = 1
page_ed = 4

user_ids = []

for num in tqdm(range(page_st, page_ed+1)):
    add_group_ids(group_num,num, user_ids)

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.75it/s]


In [116]:
print(len(user_ids))

400


In [117]:
import math
import time

start = time.time()

user_problem = {}
problem_num_set = set([])

for id in tqdm(user_ids, position=0):
    gen_user_problem_mat(id, user_problem, problem_num_set)

end = time.time()

print(f"{end - start:.5f} sec")

100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [03:36<00:00,  1.85it/s]

216.28883 sec





In [118]:
user_problem_mat = {}
for user, problems in user_problem.items():
    user_problem_mat[user] = {num:0 for num in problem_num_set}
    for problem in problems:
        user_problem_mat[user][problem] = 1    

In [119]:
df = pd.DataFrame(user_problem_mat).T

In [120]:
df

Unnamed: 0,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,...,27924,27925,27930,27931,27939,27940,27941,27942,27943,27944
iknoom1107,1,1,1,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
young920503,1,1,1,1,1,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
ljwljw8541,1,1,1,1,1,0,0,0,1,1,...,0,0,1,1,1,0,0,0,0,0
powergee,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
vjerksen,1,1,1,1,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
kriss1607,1,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
mk020408,1,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
leejoon924,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
dlatldud11,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [121]:
df.to_csv("./uos_user_problem_mat.csv")

# EASE 사용

### 아무 아이디로 추천

In [4]:
# 모델 불러오기
with open('./saved_model/ease_model.p', 'rb') as file:
    ease = pickle.load(file)

In [None]:
ease = EASE(300)
ease.B = sparse.load_npz('./saved_model/ease_model.npz').toarray()

In [7]:
NUM_TOP_PROBLEMS = 10
user_id = 'sem1308'

In [12]:
user_problem = np.zeros([1, num_problem])
add_to_user_problem_mat(0, user_id, user_problem)

result = ease.getUsersRating(user_problem)

In [15]:
result[user_problem.nonzero()] = -np.inf
top_problems_by_user = bn.argpartition(-result, NUM_TOP_PROBLEMS, axis=1)[:, :NUM_TOP_PROBLEMS] # 값이 큰 10개 문제 고름
top_problems_by_user += 1000
print(top_problems_by_user)

[[ 2741 10952  1008  1966  2178  2742  1931 10818  1107  2606]]


In [50]:
# 태그로 추천

selected_tags = ['그리디 알고리즘', '다이나믹 프로그래밍', '브루트포스 알고리즘', '이분 탐색',
                '너비 우선 탐색', '깊이 우선 탐색', '데이크스트라', '플로이드–워셜', '비트마스킹', '분리 집합']

dataset_dir = './dataset'
tp_mat = sparse.load_npz(f'{dataset_dir}/tag_problem_mat_all.npz').toarray()
tag_list_all = np.load(f'{dataset_dir}/tag_list_all.npy', allow_pickle=True)
prob_num_list_all = np.load(f'{dataset_dir}/prob_num_list_all.npy', allow_pickle=True)
tag_problem_mat = pd.DataFrame(tp_mat,index = tag_list_all, columns=prob_num_list_all)

In [51]:
tag_problem_mat

Unnamed: 0,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,...,27972,27973,27974,27975,27976,27977,27978,27979,27980,27981
구현,1,1,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
사칙연산,1,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
수학,1,1,1,0,1,0,0,1,1,1,...,0,1,0,0,0,0,0,0,0,1
많은 조건 분기,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
기하학,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
레드-블랙 트리,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
탑 트리,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
데카르트 트리,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
다항식 보간법,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
tag_problem_mat = tag_problem_mat[selected_tags].T

KeyError: "None of [Index(['그리디 알고리즘', '다이나믹 프로그래밍', '브루트포스 알고리즘', '이분 탐색', '너비 우선 탐색', '깊이 우선 탐색',\n       '데이크스트라', '플로이드–워셜', '비트마스킹', '분리 집합'],\n      dtype='object')] are in the [columns]"

In [8]:
tag_problem_mat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26178,26179,26180,26181,26182,26183,26184,26185,26186,26187
그리디 알고리즘,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
다이나믹 프로그래밍,0,0,0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
브루트포스 알고리즘,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
이분 탐색,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
너비 우선 탐색,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
깊이 우선 탐색,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
데이크스트라,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
플로이드–워셜,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
비트마스킹,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
분리 집합,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [82]:
tag_name = '그리디 알고리즘'
selected_probs_by_tag = []
for tag_pro_row in tag_problem_mat:
    if tag_problem_mat[tag_pro_row][tag_name]:
        selected_probs_by_tag.append(int(tag_pro_row)-1000)

In [100]:
idx_to_num = dict(zip(range(len(selected_probs_by_tag)),selected_probs_by_tag))

In [120]:
result[user_problem.nonzero()] = -np.inf
result = np.expand_dims(result[0][selected_probs_by_tag],axis=0)
top_idx_by_user = bn.argpartition(-result, NUM_TOP_PROBLEMS, axis=1)[:, :NUM_TOP_PROBLEMS][0] # 값이 큰 10개 문제 고름
top_problems_by_user = np.array([idx_to_num[idx] for idx in top_idx_by_user])
top_problems_by_user += 1000

In [121]:
top_problems_by_user

array([ 1080,  1439, 11399,  1931,  2212,  2839,  1105,  1339,  1082,
       10775])

In [113]:
top_problems_by_user += 1000

In [83]:
result[user_problem.nonzero()] = -np.inf
sorted(result)

[array([       -inf,        -inf,        -inf, ..., -0.00350231,
        -0.00335413, -0.00162263])]

In [12]:
result.shape

(1, 26918)

In [60]:
# 유저가 잘 풀지 않은 유형의 문제 추천 - 이건 비교적 부정확한 듯?
result[user_problem.nonzero()] = np.inf
top_problems_by_user_reverse = bn.argpartition(result, NUM_TOP_PROBLEMS, axis=1)[:, :NUM_TOP_PROBLEMS]
top_problems_by_user_reverse += 1000
print(top_problems_by_user_reverse)

[[ 2331 11931 10699 10819  2003 12851 10867 11051 12837  1735]]
