In [2]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mplcursors # Used to create a cursor-interactive plot together with "%matplotlib notebook"
from sklearn.decomposition import NMF # Used to fit the Non-negative Matrix Factorization model
from sklearn.utils.extmath import randomized_svd # Used to compute the Singular Value Decomposition
from sklearn.manifold import TSNE # Used to fit the t-SNE manifold embedding

plt.style.use('ggplot') # Any other matplotlib style can be used here instead

# To only check the plots (static rendering), use this option instead:
# %matplotlib inline

# For interactive plots.
# With this option, a plot appears at the position where it was first drawn.
%matplotlib notebook

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides library warnings (e.g. solver convergence warnings) — confirm that is intended.
warnings.filterwarnings('ignore')

In [3]:
# Directory holding the remapped id files.
# NOTE(review): `dir` shadows the Python builtin of the same name; the name is kept
# here because other cells may rely on it.
dir = './remap/'

# Load the (userId, problemId) pairs, tag every pair with solve=1 and keep each
# distinct row only once.
df_ratings = (
    pd.read_csv(dir + 'userId_problemId_remap.csv', usecols=['userId', 'problemId'])
    .assign(solve=1)
    .drop_duplicates()
)

In [4]:
# Count the distinct users and the distinct problems.
n_users = df_ratings['userId'].unique().size
n_problems = df_ratings['problemId'].unique().size

n_users, n_problems

(2000, 5723)

In [5]:
# Show the descriptive statistics of the 'solve' column of the ratings.
df_ratings.loc[:, 'solve'].describe()

count    833388.0
mean          1.0
std           0.0
min           1.0
25%           1.0
50%           1.0
75%           1.0
max           1.0
Name: solve, dtype: float64

In [6]:
# Build the solve matrix: one row per problemId, one column per userId.
# (problem, user) pairs that are absent from df_ratings are filled with 0.
A = (
    df_ratings
    .pivot(index='problemId', columns='userId', values='solve')
    .fillna(0)
    .to_numpy()
)

In [7]:
# Preview the solve matrix A as a DataFrame.
pd.DataFrame(data=A)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
2,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5718,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5720,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5721,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## SVD Model

In [8]:
# Number of singular components kept in the truncated SVD.
k = 100

# random_state is pinned so the randomized solver returns the same factors on
# every run, independent of the installed scikit-learn's default seeding.
U, Sigma, VT = randomized_svd(A, n_components=k, n_iter='auto', random_state=0)

# Check that the factor matrices have the expected shapes.
print(U.shape, Sigma.shape, VT.shape)

# Rebuild the rank-k approximation. Scaling the columns of U by Sigma equals
# U @ np.diag(Sigma) without materialising the k-by-k diagonal matrix.
A_approx_svd = (U * Sigma) @ VT

# Check that the approximation has the expected shape.
print(A_approx_svd.shape)

(5723, 100) (100,) (100, 2000)
(5723, 2000)


In [9]:
# Preview the SVD reconstruction as a DataFrame.
pd.DataFrame(data=A_approx_svd)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,1.282908,0.982457,1.012734,0.210409,0.799908,0.700536,1.020860,1.270019,1.183272,0.844995,...,1.020233,0.804079,0.999843,0.234471,0.409834,1.155394,1.001244,0.989859,1.040696,0.609728
1,1.162254,0.917811,0.874251,0.090846,0.934184,0.608458,1.034947,1.072972,0.995421,0.465680,...,0.966441,0.806583,1.027665,0.142309,0.270925,1.219290,1.049448,0.939293,1.055428,0.337898
2,0.782888,0.705934,0.639680,0.076158,0.964215,0.940499,1.047680,0.241833,0.166008,0.090939,...,0.818318,0.086393,0.530721,-0.020026,0.034672,0.086814,0.903771,0.028993,0.428280,-0.007616
3,0.901320,0.959503,0.943408,0.196422,0.936846,0.721684,1.002532,0.748526,1.039079,0.874505,...,0.815739,0.533975,0.502565,0.316679,0.743295,0.690068,1.029553,0.634880,0.953322,-0.068640
4,0.427167,0.765989,0.526817,0.042648,0.503224,1.071760,0.456225,0.130979,0.365263,-0.006430,...,0.709526,0.039322,0.483935,-0.066048,-0.131062,-0.077609,0.016371,0.088309,0.220896,0.000723
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5718,0.051441,-0.001073,0.017357,0.005076,0.033870,0.000758,0.019908,-0.009021,0.017157,0.043919,...,0.025259,-0.016810,0.003443,0.024440,-0.000163,-0.045330,-0.005817,0.009730,-0.019410,0.007210
5719,0.023259,-0.003485,0.023122,-0.001141,0.010055,0.028461,0.012798,-0.017134,0.028003,-0.009291,...,-0.026556,0.024879,0.007238,0.009504,0.015687,-0.018428,-0.004872,0.030737,0.000136,-0.010049
5720,0.002039,-0.001532,-0.000214,0.002539,-0.000398,0.013730,0.003184,-0.004793,-0.009660,0.006475,...,-0.004732,0.004789,0.002894,0.000946,0.000853,-0.008926,0.007824,-0.001526,-0.002758,-0.006158
5721,-0.007944,0.000567,0.000433,0.003146,-0.012067,0.008116,0.008291,-0.011202,-0.009539,-0.003058,...,0.005308,-0.000183,0.007754,0.001780,-0.003955,0.000941,0.008443,-0.002872,-0.003231,0.002076


## NMF Model

In [10]:
# Number of NMF components.
k = 100

# NOTE(review): l1_ratio presumably only takes effect when a regularisation
# strength (alpha) is also set; none is passed here — confirm intent.
model_nmf = NMF(n_components=k, init='random', random_state=30, max_iter=100, l1_ratio=0.2)
model_nmf.fit(A)

# NOTE(review): transform() solves for W again after fit(); NMF also offers
# fit_transform() — confirm that the two-step form is intentional.
W = model_nmf.transform(A)
H = model_nmf.components_

# Check that the factor matrices have the expected shapes.
print(W.shape, H.shape)

A_approx_nmf = W @ H

# Check that the approximation has the expected shape.
print(A_approx_nmf.shape)

(5723, 100) (100, 2000)
(5723, 2000)


In [11]:
# Preview the NMF reconstruction as a DataFrame.
pd.DataFrame(data=A_approx_nmf)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,1.234905,1.009064,0.991519,0.227058,0.884733,0.708635,1.032982,1.264788,1.054260,0.620640,...,1.021434,0.863812,0.999083,0.273475,0.355025,1.083067,0.948448,0.949483,1.125569,0.558851
1,1.165478,1.029897,0.901042,0.117921,0.972153,0.716850,0.991197,1.044329,1.000846,0.331002,...,1.006412,0.859558,0.918994,0.139096,0.215827,1.192518,0.969082,0.815060,1.069942,0.269923
2,0.650699,0.906714,0.575303,0.060192,0.778371,0.912777,0.747917,0.261604,0.288642,0.094584,...,0.931594,0.167705,0.417756,0.031347,0.037876,0.202895,0.909500,0.062311,0.503359,0.008342
3,0.821452,0.936255,0.886502,0.165875,0.856148,0.781922,0.957815,0.823398,0.930054,0.803716,...,0.977258,0.619978,0.536405,0.400778,0.630744,0.897387,1.068478,0.673548,1.002710,0.120582
4,0.264714,0.620971,0.762412,0.025695,0.437070,1.357928,0.216951,0.068693,0.213221,0.045271,...,0.471023,0.149658,0.326184,0.012156,0.032437,0.059962,0.081512,0.000876,0.158456,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5718,0.009520,0.005785,0.026968,0.002345,0.002619,0.001614,0.022429,0.005318,0.022007,0.025987,...,0.007032,0.002132,0.003300,0.006352,0.003591,0.000608,0.012989,0.004667,0.001107,0.004408
5719,0.007226,0.000301,0.010274,0.000135,0.000161,0.000020,0.015418,0.002266,0.036186,0.018779,...,0.000033,0.003704,0.000665,0.002906,0.000284,0.000151,0.006510,0.005481,0.002249,0.000503
5720,0.001143,0.000496,0.005727,0.000005,0.001820,0.006173,0.002546,0.001614,0.001141,0.000286,...,0.000391,0.000195,0.001052,0.001169,0.000206,0.000130,0.001407,0.000000,0.002681,0.000376
5721,0.000644,0.002460,0.003805,0.000251,0.000370,0.001555,0.010912,0.000863,0.004367,0.008304,...,0.001666,0.001947,0.000272,0.000184,0.000695,0.000833,0.006671,0.000000,0.000832,0.000276


## Compute loss

In [12]:
def compute_error(actual, prediction):
    """Return the SSE and RMSE of `prediction` over the non-zero entries of `actual`.

    Positions where `actual` is 0 are excluded from the error.

    Args:
        actual: NumPy array of reference values.
        prediction: NumPy array that is indexed at the positions of the
            non-zero entries of `actual`.

    Returns:
        A tuple `(sse, rmse)`: the sum of squared errors and the root mean
        squared error over the selected entries. When `actual` has no
        non-zero entry the mean is taken over an empty selection, so the
        RMSE is NaN.
    """
    # Locate the non-zero entries once and reuse the index for both arrays.
    observed = actual.nonzero()

    # Squared residuals are computed once and shared by both metrics.
    squared_residuals = np.square(np.subtract(actual[observed], prediction[observed]))

    sse = np.sum(squared_residuals)
    rmse = np.sqrt(squared_residuals.mean())

    return sse, rmse

def compute_error_all(actual, prediction):
    """Return the SSE and RMSE of `prediction` over every entry of `actual`.

    Entries where `actual` is 0 are included in the error.

    Args:
        actual: Array-like of reference values.
        prediction: Array-like of predicted values, subtracted element-wise
            from `actual`.

    Returns:
        A tuple `(sse, rmse)`: the sum of squared errors and the root mean
        squared error over all entries.
    """
    # Squared residuals are computed once and shared by both metrics.
    squared_residuals = np.square(np.subtract(actual, prediction))

    sse = np.sum(squared_residuals)
    rmse = np.sqrt(squared_residuals.mean())

    return sse, rmse

In [13]:
# Evaluate each (metric, reconstruction) pair once and reuse both returned
# values, rather than recomputing the full-matrix error for SSE and RMSE separately.
svd_sse, svd_rmse = compute_error(A, A_approx_svd)
nmf_sse, nmf_rmse = compute_error(A, A_approx_nmf)
print(f"SVD Error(ignoring zero values): SSE = {svd_sse}, RMSE = {svd_rmse}")
print(f"NMF Error(ignoring zero values): SSE = {nmf_sse}, RMSE = {nmf_rmse}")

print('\n')

svd_sse_all, svd_rmse_all = compute_error_all(A, A_approx_svd)
nmf_sse_all, nmf_rmse_all = compute_error_all(A, A_approx_nmf)
print(f"SVD Error(including all zero values): SSE = {svd_sse_all}, RMSE = {svd_rmse_all}")
print(f"NMF Error(including all zero values): SSE = {nmf_sse_all}, RMSE = {nmf_rmse_all}")

SVD Error(ignoring zero values): SSE = 164907.8651033195, RMSE = 0.4448330669232854
NMF Error(ignoring zero values): SSE = 192826.18566987387, RMSE = 0.48101584643565326


SVD Error(including all zero values): SSE = 249651.0079229659, RMSE = 0.1476861574812087
NMF Error(including all zero values): SSE = 278869.4204541576, RMSE = 0.15608945932633245


### Grid search over the number of components (k)

In [16]:
# RMSE over the non-zero entries of A, keyed by the number of components k.
svd_rlt = {}
nmf_rlt = {}
for k in [50, 100, 150, 200]:
    print("grid search:", k)

    # Truncated SVD reconstruction. random_state is pinned so the randomized
    # solver gives the same factors on every run of the search.
    U, Sigma, VT = randomized_svd(A, n_components=k, n_iter='auto', random_state=0)
    # Scaling the columns of U by Sigma equals U @ np.diag(Sigma) without
    # building the k-by-k diagonal matrix.
    A_approx_svd = (U * Sigma) @ VT

    # NMF reconstruction with the same number of components.
    model_nmf = NMF(n_components=k, init='random', random_state=30, max_iter=100, l1_ratio=0.2).fit(A)
    W = model_nmf.transform(A)
    H = model_nmf.components_
    A_approx_nmf = W @ H

    # compute_error returns (sse, rmse); only the RMSE is recorded.
    _, c_svd = compute_error(A, A_approx_svd)
    _, c_nmf = compute_error(A, A_approx_nmf)

    svd_rlt[k] = c_svd
    nmf_rlt[k] = c_nmf

grid search: 50
grid search: 100
grid search: 150
grid search: 200


In [17]:
# RMSE (non-zero entries of A only) of the SVD reconstruction for each k tried in the grid search.
svd_rlt

{50: 0.487408669093533,
 100: 0.4448330669232854,
 150: 0.4101553898643726,
 200: 0.38098933426351533}

In [18]:
# RMSE (non-zero entries of A only) of the NMF reconstruction for each k tried in the grid search.
nmf_rlt

{50: 0.5067141914358917,
 100: 0.48101584643565326,
 150: 0.4620809186384405,
 200: 0.44641222876533365}

In [19]:
# Recompute the SVD with 200 components, the largest k in the grid search and
# the one with the lowest recorded SVD RMSE. random_state is pinned so the
# randomized solver is reproducible across runs.
U, Sigma, VT = randomized_svd(A, n_components=200, n_iter='auto', random_state=0)
# Scaling the columns of U by Sigma equals U @ np.diag(Sigma) without building
# the dense diagonal matrix.
A_approx_svd = (U * Sigma) @ VT