# 1. Imports

In [1]:
import numpy as np
import pandas as pd
import math
import torch
import torch.nn.functional as F

# 2. Data

In [2]:
train = pd.read_csv('train.csv')
train

Unnamed: 0,userId,movieId,title,genres,year,rating
0,1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,4.0
1,1,3,Grumpier Old Men (1995),Comedy|Romance,1995,4.0
2,1,6,Heat (1995),Action|Crime|Thriller,1995,4.0
3,1,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1995,5.0
4,1,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,1995,5.0
...,...,...,...,...,...,...
80659,610,164179,Arrival (2016),Sci-Fi,2016,5.0
80660,610,166534,Split (2017),Drama|Horror|Thriller,2017,4.0
80661,610,168248,John Wick: Chapter Two (2017),Action|Crime|Thriller,2017,5.0
80662,610,168250,Get Out (2017),Horror,2017,5.0


In [3]:
test = pd.read_csv('test.csv')
test

Unnamed: 0,rId,userId,movieId,title,genres,year
0,1,1,151,Rob Roy (1995),Action|Drama|Romance|War,1995
1,2,1,423,Blown Away (1994),Action|Thriller,1994
2,3,1,596,Pinocchio (1940),Animation|Children|Fantasy|Music,1940
3,4,1,673,Space Jam (1996),Adventure|Animation|Children|Comedy|Fantasy|Sc...,1996
4,5,1,1029,Dumbo (1941),Animation|Children|Drama|Music,1941
...,...,...,...,...,...,...
20161,20162,610,156371,Everybody Wants Some (2016),Comedy,2016
20162,20163,610,160836,Hazard (2005),Action|Drama|Thriller,2005
20163,20164,610,163937,Blair Witch (2016),Horror|Thriller,2016
20164,20165,610,166528,Rogue One: A Star Wars Story (2016),Action|Adventure|Fantasy|Sci-Fi,2016


In [4]:
submission = pd.read_csv('submission.csv')
submission

Unnamed: 0,rId,rating
0,1,3.0
1,2,3.0
2,3,1.0
3,4,4.0
4,5,1.0
...,...,...
20161,20162,2.0
20162,20163,2.0
20163,20164,1.0
20164,20165,1.0


# 3. Latent Factor Models

### A. 피봇 테이블(Rating)

In [5]:
###---rating matrix---###
df = pd.pivot_table(train, 
                index = 'userId',
                columns = 'title',
                values = 'rating',
                )
df

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,4.5,,,,
609,,,,,,,,,,,...,,,,,,,,,,


items = torch.LongTensor(train['movieId'])
users = torch.LongTensor(train['userId'])
ratings = torch.FloatTensor(train['rating'])

### B. 행렬 Q, P 생성

In [7]:
rows = len(df.index) 
cols = len(df.columns)

In [8]:
rank = 10
torch.manual_seed(1004)
Q = torch.randn(rows, rank, requires_grad=True)
P = torch.randn(cols, rank, requires_grad=True)

In [9]:
print(Q)
print(P)

tensor([[-1.6492, -0.6707, -0.6212,  ...,  1.2595,  2.0986, -0.7325],
        [-0.4736,  1.5456,  0.4469,  ...,  0.9243,  0.9278,  1.1756],
        [-0.0299,  1.3536, -0.0226,  ...,  0.6097,  1.4900,  0.7166],
        ...,
        [ 1.4390, -1.0777, -0.4058,  ..., -0.1690,  0.1073, -1.1088],
        [-1.0472, -0.4021, -0.1893,  ..., -0.1600,  1.2254,  0.2835],
        [-0.5060, -0.2514, -0.6829,  ...,  2.1894,  0.7369,  1.1160]],
       requires_grad=True)
tensor([[-0.1345, -0.1736, -1.4496,  ..., -0.8859, -1.2192, -0.5634],
        [ 0.7533, -1.1351,  1.7105,  ...,  1.2544,  0.5263,  0.1503],
        [-0.4316,  0.8055, -1.2533,  ..., -0.6397,  1.2673, -0.4306],
        ...,
        [ 0.3220,  1.1608,  1.0605,  ..., -1.0948, -0.0840,  0.4576],
        [-0.2656, -0.4379,  1.3833,  ..., -0.0790, -1.3909, -0.5975],
        [-0.4016, -1.2454, -0.4065,  ..., -0.8786,  0.6180, -0.2364]],
       requires_grad=True)


### C. 값이 있는 행렬 저장

In [10]:
## 값이 있는 행렬
value = []
for i in range(rows):
    for j in range(cols):
        if df.iloc[i, j] >= 0:
            # i : 행, j : 열, df.iloc : rating 값
            value.append((i, j, df.iloc[i, j]))

In [11]:
value

[(0, 45, 4.0),
 (0, 184, 4.0),
 (0, 227, 5.0),
 (0, 303, 5.0),
 (0, 372, 5.0),
 (0, 407, 5.0),
 (0, 520, 4.0),
 (0, 623, 5.0),
 (0, 688, 5.0),
 (0, 690, 4.0),
 (0, 727, 5.0),
 (0, 765, 5.0),
 (0, 774, 4.0),
 (0, 778, 3.0),
 (0, 850, 5.0),
 (0, 858, 4.0),
 (0, 878, 4.0),
 (0, 910, 4.0),
 (0, 963, 5.0),
 (0, 994, 5.0),
 (0, 1016, 5.0),
 (0, 1124, 5.0),
 (0, 1177, 5.0),
 (0, 1225, 4.0),
 (0, 1409, 5.0),
 (0, 1551, 5.0),
 (0, 1643, 5.0),
 (0, 1679, 4.0),
 (0, 1680, 3.0),
 (0, 1690, 5.0),
 (0, 1762, 4.0),
 (0, 1876, 5.0),
 (0, 1960, 4.0),
 (0, 2037, 4.0),
 (0, 2153, 5.0),
 (0, 2191, 4.0),
 (0, 2213, 5.0),
 (0, 2263, 5.0),
 (0, 2320, 5.0),
 (0, 2326, 4.0),
 (0, 2392, 5.0),
 (0, 2403, 5.0),
 (0, 2420, 5.0),
 (0, 2438, 4.0),
 (0, 2443, 4.0),
 (0, 2459, 5.0),
 (0, 2506, 3.0),
 (0, 2557, 3.0),
 (0, 2600, 5.0),
 (0, 2648, 5.0),
 (0, 2698, 5.0),
 (0, 2752, 4.0),
 (0, 2763, 5.0),
 (0, 2841, 4.0),
 (0, 2906, 4.0),
 (0, 2932, 4.0),
 (0, 2996, 3.0),
 (0, 3002, 5.0),
 (0, 3022, 5.0),
 (0, 3024, 5.0),
 

### D. Gradient descent

In [12]:
optim = torch.optim.SGD([Q, P], lr = 0.01)

epochs = 30

for epoch in range(epochs):
    for i, x, rating in value:
        predict = torch.sum(Q[i] * P[x])
        rating = torch.tensor(rating, dtype = torch.float32)

        # loss
        loss = F.mse_loss(predict, rating)

        optim.zero_grad()
        loss.backward()
        optim.step()

    if epoch % 5 == 0:
        print("epoch: {}, loss: {:.6f}" .format(epoch, loss.item()))

epoch: 0, loss: 5.261974
epoch: 5, loss: 0.491939
epoch: 10, loss: 0.701293
epoch: 15, loss: 0.598788
epoch: 20, loss: 0.547039
epoch: 25, loss: 0.529228


### E. 결과

In [111]:
y_hat = torch.matmul(Q, P.T)
y_hat = torch.abs(y_hat) # 절대값 적용 안하고 1.0에서 5.0 사이로 적용하는 거랑 결과는 차이 없음

y_hat[y_hat > 5.0]=5.0
y_hat[y_hat < 1.0]=1.0

y_hat = torch.round(y_hat, decimals = 1)
y_hat

tensor([[4.0000, 5.0000, 2.9000,  ..., 3.0000, 4.1000, 1.5000],
        [3.5000, 4.0000, 1.6000,  ..., 4.7000, 1.9000, 1.2000],
        [3.0000, 1.0000, 5.0000,  ..., 1.0000, 1.0000, 1.0000],
        ...,
        [3.2000, 5.0000, 3.3000,  ..., 1.0000, 2.7000, 1.0000],
        [2.7000, 3.6000, 2.1000,  ..., 2.1000, 1.8000, 1.4000],
        [3.9000, 5.0000, 3.8000,  ..., 2.0000, 3.1000, 1.0000]],
       grad_fn=<RoundBackward1>)

### 0.5 단위

In [115]:
# x.3 ~ x.7 -> x.5로 scaling
y_hat = ((y_hat+0.2)//0.5)*0.5
y_hat

# y_hat2 = torch.round(y_hat)
# y_hat = torch.where(y_hat%0.5!=0, y_hat2, y_hat)
# y_hat

tensor([[4.0000, 5.0000, 3.0000,  ..., 3.0000, 4.0000, 1.5000],
        [3.5000, 4.0000, 1.5000,  ..., 4.5000, 2.0000, 1.0000],
        [3.0000, 1.0000, 5.0000,  ..., 1.0000, 1.0000, 1.0000],
        ...,
        [3.0000, 5.0000, 3.5000,  ..., 1.0000, 2.5000, 1.0000],
        [2.5000, 3.5000, 2.0000,  ..., 2.0000, 2.0000, 1.5000],
        [4.0000, 5.0000, 4.0000,  ..., 2.0000, 3.0000, 1.0000]],
       grad_fn=<MulBackward0>)

In [116]:
latent_matrix = pd.DataFrame(y_hat.detach().numpy(),
                            columns = df.columns,
                            index = df.index)
latent_matrix

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,5.0,3.0,2.5,2.5,2.0,3.5,2.5,5.0,5.0,...,1.0,5.0,3.5,1.5,4.0,4.0,3.0,3.0,4.0,1.5
2,3.5,4.0,1.5,1.5,1.5,1.0,3.0,2.0,5.0,4.5,...,2.5,4.5,1.0,1.0,1.5,4.0,1.5,4.5,2.0,1.0
3,3.0,1.0,5.0,1.0,1.0,3.0,1.0,5.0,1.0,5.0,...,4.5,2.5,2.5,5.0,2.5,5.0,1.0,1.0,1.0,1.0
4,2.0,2.5,2.0,2.0,1.0,1.0,3.5,1.5,5.0,4.0,...,3.5,4.0,1.0,2.5,3.0,2.5,1.0,4.0,3.0,2.0
5,1.0,3.0,1.5,1.0,1.0,1.0,3.5,3.5,5.0,3.0,...,1.0,1.5,4.0,5.0,1.0,2.5,1.0,2.5,1.5,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.5,5.0,3.5,3.5,2.5,1.0,3.0,3.0,4.0,3.5,...,1.5,4.0,2.5,3.0,2.5,4.0,2.5,2.0,3.0,1.0
607,2.5,5.0,2.5,3.5,3.0,1.0,2.5,2.5,5.0,3.5,...,1.0,5.0,3.0,1.0,2.5,4.0,1.5,3.0,3.5,1.0
608,3.0,5.0,3.5,3.0,2.5,1.0,2.0,3.5,3.0,4.0,...,2.0,3.5,3.0,2.5,3.0,4.0,3.0,1.0,2.5,1.0
609,2.5,3.5,2.0,2.5,2.0,1.5,3.5,2.0,4.0,1.5,...,1.0,2.0,2.0,3.5,1.5,2.0,2.0,2.0,2.0,1.5


### 1.0 단위

```Python
y_hat = torch.matmul(Q, P.T)
y_hat = torch.abs(y_hat)

y_hat[y_hat > 5.0]=5.0
y_hat[y_hat < 1.0]=1.0

y_hat = torch.round(y_hat, decimals = 1)
y_hat = torch.round(y_hat, decimals = 0)

y_hat

latent_matrix = pd.DataFrame(y_hat.detach().numpy(),
                            columns = df.columns,
                            index = df.index)
latent_matrix
```

## 사용자의 평균 평점으로 대체

In [117]:
def user_mean_rating(user):
    user_mean = np.round(latent_matrix.iloc[user-1, :].mean(), 1)
    if user_mean%0.5 != 0:
        user_mean = np.round(user_mean, 0)
    return user_mean

In [118]:
pred_rating = []

for row in range(test.shape[0]):
    x = test.iloc[row, :]['userId']
    y = test.iloc[row, :]['title']
    
    if y in latent_matrix.columns:
        pred_rating.append(latent_matrix[y][x])
        
    else:  
        ## train 데이터에 없는 영화
        mean_rating = user_mean_rating(x)
        pred_rating.append(mean_rating)

In [119]:
submission_test = pd.DataFrame(columns = ['rId' , 'rating'])
submission_test['rId'] = test['rId']
submission_test['rating'] = pred_rating
submission_test.to_csv('submmision_test.csv', index = False)

In [120]:
submission_test

Unnamed: 0,rId,rating
0,1,3.0
1,2,4.0
2,3,3.5
3,4,4.0
4,5,4.0
...,...,...
20161,20162,4.5
20162,20163,3.0
20163,20164,3.0
20164,20165,4.0
