In [1]:
import numpy as np
import pandas as pd


In [2]:
import scipy.sparse as sps
import os
import torch
from torch import nn, cat, mean
#from sequentail import Sequentail, WideAndDeep



In [3]:
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_absolute_error

In [4]:
top = os.getcwd()

In [5]:
# путь до папки с датасетом
for one, two, three in os.walk(top):
    one_ = one.split("\\")
    if 'ml-100k' in one:
        path = one

In [6]:
path_data = path + f'\\u.data'
path_info = path + f'\\u.info'
path_item = path + f'\\u.item'
path_user = path + f'\\u.user'

### 1 DATA LOADING

In [7]:
#Load the Ratings data
data = pd.read_csv(path_data, sep="\t", header=None)
data.columns = ['user id', 'item id', 'rating', 'timestamp']
data.head()

Unnamed: 0,user id,item id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [8]:
info = pd.read_csv(path_info, sep="\t", header=None)
#data.columns = ['user id', 'movie id', 'rating', 'timestamp']
info.head()

Unnamed: 0,0
0,943 users
1,1682 items
2,100000 ratings


In [9]:
#Load the User data
users = pd.read_csv(path_user, sep="|", encoding='latin-1', header=None)
users.columns = ['user id', 'age', 'gender', 'occupation', 'zip code']
users.head()

Unnamed: 0,user id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [10]:
#Load movie data
items = pd.read_csv(path_item, sep="|", encoding='latin-1', header=None)

-----------------------------------------------------------------------------

In [11]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [12]:
print(device)

cpu


-------------------------------------------------------

In [13]:
features = ['user id', 'item id']
target = 'rating'

----------------------------------------------------------------

### 2 FEATURE EMBEDDING

In [14]:
def get_embedding(data, name, size = 4):
    """calculate embadding tensor for data:pd.DataFrame and chosen name: 'str'
    size of embeddings vector = 4 for default
    """

    num_embeddings = data[name].unique().max()+1
    embed_size = size
    emb = nn.Embedding(num_embeddings, embed_size, padding_idx=0)
    tensor_intermediate = torch.from_numpy(data[name].values.reshape(-1,1))
    tenor_result = torch.empty((1,size))
    for tensor in tensor_intermediate:    
        tenor_result = torch.cat((tenor_result, emb(tensor)), dim =0)
    
    # удаляю маску с которой инициировал датасет
    tenor_result = tenor_result[1:]

    return tenor_result 

In [15]:
# определяем embedding для **item_id**
item_tensor = get_embedding(data, 'item id', size = 4)

In [16]:
# определяем embedding для **user_id**
user_tensor = get_embedding(data, 'user id', size = 4)

---------------------------------------------

### 3 MODEL

In [27]:
class RecSys_skillbox(nn.Module):
    def __init__(self, item_shape, user_shape):

        super().__init__()
        self.item_shape = item_shape
        self.user_shape = user_shape
        self.linear_relu_stack = nn.Sequential(
            nn.Linear((item_shape+user_shape), 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 1),
            #nn.Softmax()
            nn.ReLU()
        )

    def forward(self, x):             
        logits = self.linear_relu_stack(x)
        return logits

In [28]:
model = RecSys_skillbox(item_tensor.size()[1], user_tensor.size()[1]).to(device)
print(model)

RecSys_skillbox(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=8, out_features=1024, bias=True)
    (1): ReLU()
    (2): Linear(in_features=1024, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=1, bias=True)
    (5): ReLU()
  )
)


--------------------------------------------------

подготовка batch

In [29]:
target_train = torch.tensor(data['rating'].values, dtype = torch.float, requires_grad=True)

In [30]:

dataset = TensorDataset(torch.tensor(torch.cat((item_tensor, user_tensor), dim =1)), torch.tensor(target_train))
loader = DataLoader(dataset, batch_size=16, shuffle=False)


  dataset = TensorDataset(torch.tensor(torch.cat((item_tensor, user_tensor), dim =1)), torch.tensor(target_train))


In [31]:
EPOCHS = 10
#loss_fn = nn.CrossEntropyLoss()
loss_fn = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for t in range(EPOCHS):
    print(f'--- Epoch {t} is started --- ')
    model.train()
    for (train, target) in loader:

        pred_train = model(train)       
        loss_train = loss_fn(pred_train, target)        

        # Backpropagation 
        optimizer.zero_grad()           
        loss_train.backward(retain_graph=True)
        optimizer.step()      
    
    print(f" --- Epoch {t} is finished --- ")
    print(f"Train loss: {loss_train:>7f}")


--- Epoch 0 is started --- 
 --- Epoch 0 is finished --- 
Train loss: 1.137327
--- Epoch 1 is started --- 
 --- Epoch 1 is finished --- 
Train loss: 1.098299
--- Epoch 2 is started --- 
 --- Epoch 2 is finished --- 
Train loss: 1.096165
--- Epoch 3 is started --- 
 --- Epoch 3 is finished --- 
Train loss: 1.082699
--- Epoch 4 is started --- 
 --- Epoch 4 is finished --- 
Train loss: 1.086277
--- Epoch 5 is started --- 
 --- Epoch 5 is finished --- 
Train loss: 1.088688
--- Epoch 6 is started --- 
 --- Epoch 6 is finished --- 
Train loss: 1.087157
--- Epoch 7 is started --- 
 --- Epoch 7 is finished --- 
Train loss: 1.081084
--- Epoch 8 is started --- 
 --- Epoch 8 is finished --- 
Train loss: 1.089432
--- Epoch 9 is started --- 
 --- Epoch 9 is finished --- 
Train loss: 1.091389


------------------------------------------------

# RESULT

In [32]:
prediction = []
for (train, _) in loader:
    pred_train = model(train)       
    prediction.append(pred_train.detach().numpy())

In [33]:
prediction = np.concatenate(prediction)

In [45]:
df_prediction = pd.DataFrame(index = list(range(prediction.shape[0])))

In [48]:
df_prediction['predict'] = prediction

In [53]:
print("Метрика обучения МАЕ = {:.2f}".format(mean_absolute_error(data['rating'], df_prediction['predict'])))

Метрика обучения МАЕ = 0.90
