# Data exploration

This notebook serves as a preliminary exploration into the dataset so that a good network architecture can be found.

In [1]:
import numpy as np
import pandas as pd
import glob

In [2]:
##Read in the data
book_rating = pd.DataFrame()
for file in glob.glob("../data/raw/book*.csv"):
    df = pd.read_csv(file)
    # discard empty
    if book_rating.empty:
        book_rating = df
    else:
        df = pd.concat([book_rating, df], ignore_index=True)


        
user_rating_temp = pd.DataFrame()
for file in glob.glob("../data/raw/user_rating*.csv"):
    df = pd.read_csv(file)
    if user_rating_temp.empty:
        user_rating_temp = df
    else:
        df = pd.concat([user_rating_temp, df], ignore_index=True)

In [4]:
#Do not run this more than once
book_map = user_rating_temp[['Name']]
book_map.drop_duplicates(subset=['Name'],keep='first',inplace=True)
book_map['Book_Id']=book_map.index.values
user_rating_temp = pd.merge(user_rating_temp,book_map, on='Name', how='left')
user_rating = user_rating_temp[user_rating_temp['Name']!='Rating'] ##Dropping users who have not rated any books
user_rating.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_map.drop_duplicates(subset=['Name'],keep='first',inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_map['Book_Id']=book_map.index.values


Unnamed: 0,ID,Name,Rating,Book_Id
0,1,Agile Web Development with Rails: A Pragmatic ...,it was amazing,0
1,1,The Restaurant at the End of the Universe (Hit...,it was amazing,1
2,1,Siddhartha,it was amazing,2
3,1,The Clock of the Long Now: Time and Responsibi...,really liked it,3
4,1,"Ready Player One (Ready Player One, #1)",really liked it,4


## Converting ratings to numerical values using .map()

In [5]:
user_rating['Rating'].unique()

array(['it was amazing', 'really liked it', 'liked it', 'did not like it',
       'it was ok'], dtype=object)

In [6]:
rating_mapping = {'it was amazing': 5, 'really liked it': 4, 'liked it': 3, 'it was ok': 2, 'did not like it': 1}
user_rating['Rating'] = user_rating['Rating'].map(rating_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_rating['Rating'] = user_rating['Rating'].map(rating_mapping)


In [7]:
user_rating.head()

Unnamed: 0,ID,Name,Rating,Book_Id
0,1,Agile Web Development with Rails: A Pragmatic ...,5,0
1,1,The Restaurant at the End of the Universe (Hit...,5,1
2,1,Siddhartha,5,2
3,1,The Clock of the Long Now: Time and Responsibi...,4,3
4,1,"Ready Player One (Ready Player One, #1)",4,4


## Model

In [5]:
n_books = user_rating['Book_Id'].unique().sum()

In [6]:
n_users = user_rating['ID'].unique().sum()

In [7]:
import torch
import torch.nn as nn

class RecommenderNet(nn.Module):
    def __init__(self, n_users, n_books, n_factors):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, n_factors)
        self.book_emb = nn.Embedding(n_books, n_factors)
        self.drop = nn.Dropout(0.05)
        self.fc = nn.Linear(n_factors*2, 1)
        
    def forward(self, x):
        users = self.user_emb(x[:,0])
        books = self.book_emb(x[:,1])
        x = torch.cat([users, books], dim=1)
        x = self.drop(x)
        x = self.fc(x)
        return x

In [42]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.nn import MSELoss

class RatingDataset(Dataset):
    def __init__(self, user_rating):
        self.user_rating = user_rating

    def __len__(self):
        return len(self.user_rating)

    def __getitem__(self, idx):
        user = torch.tensor(self.user_rating.iloc[idx]['ID'])
        book = torch.tensor(self.user_rating.iloc[idx]['Book_Id'])
        rating = torch.tensor(self.user_rating.iloc[idx]['Rating'])
        return user, book, rating
        
        
# previous learning rate was 0.01
def train(model, user_rating, epochs=5, lr=0.1, batch_size=32):
    # Create a DataLoader from the DataFrame
    dataset = RatingDataset(user_rating)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Use mean squared error loss
    criterion = MSELoss()

    # Use Adam optimizer
    optimizer = Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        for user, book, rating in dataloader:
            
            # Convert data to tensors
            user = torch.tensor(user.values).long()
            book = torch.tensor(book.values).long()
            rating = torch.tensor(rating.values).float()

            # Forward pass
            outputs = model(user, book)
            loss = criterion(outputs, rating)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')


In [17]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Previous value n_factors=50
model = RecommenderNet(n_users, n_books, n_factors=2).to(device)

In [39]:
RatingDataset(user_rating)

<__main__.RatingDataset at 0x2ec31cea1d0>

In [41]:
train(model, user_rating)

  return torch.tensor(self.user_rating.iloc[idx])


TypeError: new(): invalid data type 'str'

In [50]:
torch.cuda.is_available()

False