In [None]:
# 패키지 import
from pathlib import Path
import pandas as pd
import numpy as np

# Load the small MovieLens ratings table and report its dimensions.
data = pd.read_csv('./ml-latest-small/ratings.csv')
print(data.shape)

# Split the rows at random into training and validation data: a row goes to
# train when its uniform draw is below 0.8.  The seed is fixed so the split
# is the same on every run.  (To train on every row instead, bind `train`
# to `data.copy()`.)
np.random.seed(3)
msk = np.random.rand(len(data)) < 0.8
train, val = data[msk].copy(), data[~msk].copy()

# The ratings file under ./ml-25m serves as the test data.
test = pd.read_csv('./ml-25m/ratings.csv')

# Preview every split: first rows, then (rows, columns).
for _split in (train, val, test):
    print(_split.head())
    print(_split.shape)

# 다음은 Pandas의 컬럼을 범주형의 id로 인코드해주는 함수이다
def proc_col(col, train_col=None):
    """Map the values of a pandas column onto contiguous integer ids.

    The vocabulary is taken from ``train_col`` when it is given, otherwise
    from ``col`` itself; ids are assigned in the order in which ``unique()``
    returns the values.  Entries of ``col`` that are not in the vocabulary
    are encoded as ``-1``.

    Returns a tuple ``(mapping, codes, vocab_size)``: the value-to-id dict,
    a NumPy array with the id of every entry of ``col``, and the number of
    distinct values in the vocabulary.
    """
    # The distinct values (users or movies) come from the reference column
    # whenever one is supplied.
    vocab_source = col if train_col is None else train_col
    distinct = vocab_source.unique()
    mapping = dict(zip(distinct, range(len(distinct))))
    codes = np.array([mapping.get(value, -1) for value in col])
    return mapping, codes, len(distinct)

# 다음은 실제로 데이터를 인코딩으로 만들어주는 함수이다
# 위에서 정의해준 proc_col을 사용한다
def encode_data(df, train=None):
    """Return a copy of ``df`` with ``userId`` and ``movieId`` replaced by contiguous ids.

    Without ``train`` the ids are derived from ``df`` itself.  With ``train``
    the ids are derived from the matching column of ``train``, and every row
    of ``df`` whose user or movie does not occur there is dropped.
    The input frame is not modified.
    """
    encoded = df.copy()
    for column in ("userId", "movieId"):
        # The id vocabulary is built from the reference frame when one is given.
        reference = train[column] if train is not None else None
        codes = proc_col(encoded[column], reference)[1]
        encoded[column] = codes
        # proc_col marks values outside the vocabulary with -1; discard them.
        seen = encoded[column] >= 0
        encoded = encoded[seen]
    return encoded

# Encode the three splits.  Validation and test reuse the id vocabulary of
# the raw training frame, so rows with users/movies unseen in training are
# dropped by encode_data.
print("Train Encoding")
df_train = encode_data(train)
print("Valid Encoding")
df_val = encode_data(val, train=train)
print("Test Encoding")
df_test = encode_data(test, train=train)
print(df_test)

# Number of distinct encoded users and movies in the training data.
num_users = df_train["userId"].unique().size
num_items = df_train["movieId"].unique().size

import torch
import torch.nn as nn
import torch.nn.functional as F

def validation_loss(model, unsqueeze=False):
    """Print the model's predictions and mean-squared error on ``df_val``.

    Args:
        model: module called as ``model(users, items)`` with two index
            tensors built from the ``userId`` and ``movieId`` columns of
            ``df_val``.
        unsqueeze: when true, the target ratings get an extra trailing
            dimension (``unsqueeze(1)``) before the loss is computed, for
            models whose output carries that extra dimension.

    The model is switched to evaluation mode and left in that mode.
    Nothing is returned; the predictions and the loss are printed.
    """
    model.eval()
    users = torch.LongTensor(df_val.userId.values)
    items = torch.LongTensor(df_val.movieId.values)
    ratings = torch.FloatTensor(df_val.rating.values)
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    # Evaluation needs no gradients; skipping autograd avoids building a
    # graph over the whole validation set.
    with torch.no_grad():
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
    print("y_hat :", y_hat, ",", y_hat.shape)
    print("validation loss {:.3f}".format(loss.item()))

def train_mf(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    """Train ``model`` on ``df_train`` with full-batch Adam, then report validation loss.

    Args:
        model: module called as ``model(users, items)`` with two index
            tensors; its parameters are optimised in place.
        epochs: number of full passes (one optimiser step each).
        lr: Adam learning rate.
        wd: value passed to Adam as ``weight_decay``.
        unsqueeze: when true, the target ratings get an extra trailing
            dimension (``unsqueeze(1)``); also forwarded to
            ``validation_loss``.

    The training loss is printed after every fifth epoch.  Nothing is
    returned.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()
    print("Train,", "Learning Rate :", lr)

    # The whole training set is used as a single batch and does not change
    # between epochs, so the tensors are built once instead of once per epoch.
    users = torch.LongTensor(df_train.userId.values)
    items = torch.LongTensor(df_train.movieId.values)
    ratings = torch.FloatTensor(df_train.rating.values)
    if unsqueeze:
        ratings = ratings.unsqueeze(1)

    for i in range(epochs):
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if i % 5 == 4:
            print("epoch", i+1, ":", loss.item())
    print("Valid")
    validation_loss(model, unsqueeze)

def test(model, userId=1, size=5):
    """Print the ``size`` highest-predicted movies for one user of ``df_test``.

    ``df_test`` is produced by ``encode_data`` and therefore holds encoded
    user and movie indices, so ``userId`` is compared with the encoded user
    index rather than with the id stored in the ratings CSV.

    Args:
        model: module called as ``model(users, items)`` with two index
            tensors; it is switched to evaluation mode.
        userId: encoded user index whose rows of ``df_test`` are scored.
        size: maximum number of recommendations to print.

    Reads ``./ml-latest-small/movies.csv`` for the titles, prints the
    intermediate tables and the final ranking, and returns nothing.
    ``df_test`` itself is left untouched.
    """
    print("Test")
    model.eval()

    # Keep only the requested user's rows and renumber them from 0 so they
    # line up row by row with the predictions computed for exactly these rows.
    target_df = df_test[df_test['userId'] == userId].reset_index(drop=True)
    users = torch.LongTensor(target_df.userId.values)
    items = torch.LongTensor(target_df.movieId.values)
    with torch.no_grad():  # inference only, no autograd graph needed
        y_hat = model(users, items)

    # Convert the prediction tensor into a one-column DataFrame.
    predictions = pd.DataFrame(y_hat.detach().numpy(), columns=["predictions"])

    # Attach each prediction to the row it was computed for.
    new_df = pd.concat([target_df, predictions], axis=1)
    print()
    print("new_df")
    print(new_df)

    # Attach movie titles.  movies.csv is keyed by the original movieId,
    # while new_df holds encoded indices.  df_test was encoded against
    # `train`, where index i stands for train['movieId'].unique()[i], so
    # indexing that array restores the original ids before merging.
    raw_movie_ids = train['movieId'].unique()
    new_df['movieId'] = raw_movie_ids[new_df['movieId'].to_numpy(dtype=int)]
    movies = pd.read_csv('./ml-latest-small/movies.csv')
    new_df = pd.merge(new_df, movies, on='movieId')
    print()
    print("movie name")
    print(new_df)

    # Highest predicted rating first.
    new_df = new_df.sort_values(by="predictions", ascending=False)
    print()
    print("sort")
    print(new_df)

    # Keep only the top `size` rows.
    new_df = new_df[:size]
    print()
    print("size")
    print(new_df)

    # Print the ranked titles.
    print()
    print("Movie Recommendation for User", userId)
    for rank, title in enumerate(new_df["title"], start=1):
        print(rank, ":", title)

class NNCollabFiltering(nn.Module):
    """Collaborative-filtering network: user/item embeddings feeding a two-layer MLP.

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        emb_size: width of each embedding vector.
        n_hidden: width of the hidden linear layer.
        dropout: drop probability applied to the concatenated embeddings
            (defaults to 0.1).
    """

    def __init__(self, num_users, num_items, emb_size=100, n_hidden=10, dropout=0.1):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # The hidden layer consumes the user and item vectors side by side.
        self.lin1 = nn.Linear(emb_size * 2, n_hidden)
        self.lin2 = nn.Linear(n_hidden, 1)
        self.drop1 = nn.Dropout(dropout)

    def forward(self, u, v):
        """Return one score per (user, item) pair.

        ``u`` and ``v`` are index tensors of equal length; the result has
        one row per pair and a single column (the output of ``lin2``).
        """
        user_vecs = self.user_emb(u)
        item_vecs = self.item_emb(v)
        # ReLU is applied to the concatenated embeddings before dropout.
        x = F.relu(torch.cat([user_vecs, item_vecs], dim=1))
        x = self.drop1(x)
        x = F.relu(self.lin1(x))
        return self.lin2(x)

# Build the network and train it with the currently selected settings.
# Alternative settings kept for reference (same emb_size, wd and unsqueeze):
#   epochs=50, lr=0.01
#   epochs=50, lr=0.001
model = NNCollabFiltering(num_users=num_users, num_items=num_items, emb_size=100)
train_mf(model, epochs=30, lr=0.05, wd=1e-6, unsqueeze=True)

# Print ten recommendations for user index 5.
test(model, userId=5, size=10)