In [1]:
# Import PyTorch
import torch
# Get pandas for data manipulation
import pandas as pd
# Import nltk for text processing
import nltk
# Import os for file manipulation
import os
# Import train_test_split from sklearn
from sklearn.model_selection import train_test_split
# Import LabelEncoder from sklearn for encoding labels
from sklearn.preprocessing import LabelEncoder

In [2]:
# Set device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Load the data from Datasets/goodreads_data_onehot_genres.csv
data = pd.read_csv('Datasets/goodreads_data_onehot_genres.csv')
# Show the first few rows of the data
print(data.head())
print(data.shape)

                                                Book  \
0                              To Kill a Mockingbird   
1  Harry Potter and the Philosopher’s Stone (Harr...   
2                                Pride and Prejudice   
3                          The Diary of a Young Girl   
4                                  The Little Prince   

                                         Description  Fantasy  Adult  \
0  The unforgettable novel of a childhood in a sl...        0      1   
1  Harry Potter thinks he is an ordinary boy - un...        1      1   
2  Since its immediate success in 1813, Pride and...        0      0   
3  Discovered in the attic in which she spent the...        0      0   
4  A pilot stranded in the desert awakes one morn...        1      1   

   Historical  Roman  Romance  Young Adult  Historical Fiction  Science  ...  \
0           1      0        0            1                   1        0  ...   
1           0      0        0            1                   0        

In [4]:
# Create a dataframe with the plot column tokenized and lowercased
tokenizeddf = data.copy()
tokenizeddf['Description'] = tokenizeddf['Description'].apply(lambda x: nltk.word_tokenize(x.lower()))

In [5]:
# Get the length of the longest plot
maxlen = tokenizeddf['Description'].apply(len).max()

# Get the set of all words in the plot column
wordset = set()
for plot in tokenizeddf['Description']:
    wordset.update(plot)
# Get the number of unique words
numwords = len(wordset)
print("Number of unique words: ", numwords)

# Create a dictionary that maps words to integers
word2int = {word: i for i, word in enumerate(wordset)}

# Function to convert a list of words to a list of integers
def words2ints(words):
    # Run through each word in the list
    ints = []
    for word in words:
        # If the word is in the dictionary, add the integer to the list
        if word in word2int:
            ints.append(word2int[word])
    return ints

Number of unique words:  80123


In [6]:
# Convert the Description column to a list of integers
tokenizeddf['Description'] = tokenizeddf['Description'].apply(words2ints)

# Pad the sequences to the maximum length
def pad_description(description):
    return description + [0] * (maxlen - len(description))

tokenizeddf['Description'] = tokenizeddf['Description'].apply(pad_description)

In [7]:
# Get the list of genres
genres = data.columns[2:]
print(genres)
numgenres = len(genres)

# Create a label encoder
le = LabelEncoder()

Index(['Fantasy', 'Adult', 'Historical', 'Roman', 'Romance', 'Young Adult',
       'Historical Fiction', 'Science', 'Mystery', 'Contemporary', 'Thriller',
       'Science Fiction', 'History', 'Adventure', 'Philosophy', 'Biography',
       'Crime', 'Self Help', 'Psychology', 'Mystery Thriller', 'Memoir',
       'Childrens', 'Humor', 'Suspense', 'Horror'],
      dtype='object')


In [8]:
# Dataset class to feed the data to the model
class RNNDataset(torch.utils.data.Dataset):
    def __init__(self, data, labels):
        self.data = data
        # One-hot encode the labels
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]
    
# Dataloader to load the data in batches
class RNNDataloader(torch.utils.data.DataLoader):
    def __init__(self, dataset, batch_size=32, shuffle=True):
        super().__init__(dataset, batch_size=batch_size, shuffle=shuffle)

# RNN model
class RNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.rnn = torch.nn.RNN(hidden_size, hidden_size)
        self.fc = torch.nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.rnn(x)
        x = self.fc(x)
        return torch.sigmoid(x)

In [9]:
# Create the model
model = RNN(numwords, 2048, numgenres).to(device)

# Loss function
criterion = torch.nn.CrossEntropyLoss()
# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Convert the data to tensors
tokenizeddf['Description'] = tokenizeddf['Description'].apply(lambda x: torch.tensor(x))

# Split the data into training and testing sets
traindata, testdata = train_test_split(tokenizeddf, test_size=0.2, random_state=42)

# Create the training and testing datasets
traindataset = RNNDataset(traindata['Description'].values, traindata[genres].values)
testdataset = RNNDataset(testdata['Description'].values, testdata[genres].values)
                          
# Create the training and testing dataloaders
traindataloader = RNNDataloader(traindataset, batch_size=32)
testdataloader = RNNDataloader(testdataset, batch_size=32)

# Create a dataloader for the full dataset
fulldataset = RNNDataset(tokenizeddf['Description'].values, tokenizeddf[genres].values)
fulldataloader = RNNDataloader(fulldataset, batch_size=32)

# Training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    for i, (inputs, labels) in enumerate(traindataloader):
        # Send the data to the device
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Zero the gradients
        optimizer.zero_grad()
        # Get the outputs
        outputs = model(inputs)
        # Calculate the loss
        loss = criterion(outputs, labels.long())
        # Backpropagate
        loss.backward()
        # Optimize
        optimizer.step()
        if i % 100 == 0:
            print(f"Epoch {epoch}, Iteration {i}, Loss: {loss.item()}")

model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in traindataloader:
        # Send the data to the device
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Get the outputs
        outputs = model(inputs)
        # Get the predicted labels
        _, predicted = torch.max(outputs, 1)
        # Get the total number of labels and the number of correct labels
        total += labels.size(0) * labels.size(1)
        correct += (predicted == labels).sum().item()
print(f"Accuracy: {correct / total}")

Epoch 0, Iteration 0, Loss: 7.4001264572143555
Epoch 0, Iteration 100, Loss: 6.4719672203063965
Epoch 0, Iteration 200, Loss: 6.462879657745361
Epoch 1, Iteration 0, Loss: 6.399365425109863
Epoch 1, Iteration 100, Loss: 6.406425952911377
Epoch 1, Iteration 200, Loss: 6.407114028930664
Epoch 2, Iteration 0, Loss: 6.545083999633789
Epoch 2, Iteration 100, Loss: 6.5429253578186035
Epoch 2, Iteration 200, Loss: 6.412768363952637
Epoch 3, Iteration 0, Loss: 6.412660598754883
Epoch 3, Iteration 100, Loss: 6.398614406585693
Epoch 3, Iteration 200, Loss: 6.420236587524414
Epoch 4, Iteration 0, Loss: 6.402560234069824
Epoch 4, Iteration 100, Loss: 6.465549945831299
Epoch 4, Iteration 200, Loss: 6.472153186798096
Accuracy: 0.8374115456238361


In [10]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in testdataloader:
        # Send the data to the device
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Get the outputs
        outputs = model(inputs)
        # Get the predicted labels
        _, predicted = torch.max(outputs, 1)
        # Get the total number of labels and the number of correct labels
        total += labels.size(0) * labels.size(1)
        correct += (predicted == labels).sum().item()
print(f"Accuracy: {correct / total}")

Accuracy: 0.8003227808814402


In [11]:
# Evaluate the accuracy for each genre
# Copy the book and description columns into a new dataframe
predictions = data[['Book', 'Description']].copy()

# Add blank columns for the genres
for genre in genres:
    predictions[genre] = None

# Get the predictions for each book
with torch.no_grad():
    for i, (inputs, labels) in enumerate(fulldataloader):
        # Send the data to the device
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Get the outputs
        outputs = model(inputs)
        # Get the predicted labels
        _, predicted = torch.max(outputs, 1)
        # Add the predictions to the dataframe
        for j, genre in enumerate(genres):
            predictions[genre].iloc[i * 32:(i + 1) * 32] = predicted[:, j].cpu().numpy()
# For any entry where the prediction is greater than 0.5, set it to 1, otherwise set it to 0
predictions[genres] = predictions[genres].applymap(lambda x: 1 if x > 0.5 else 0)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  predictions[genre].iloc[i * 32:(i + 1) * 32] = predicted[:, j].cpu().numpy()
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never wor

In [12]:
# Get the accuracy for each genre
accuracies = {}
for genre in genres:
    correct = (predictions[genre] == data[genre]).sum()
    total = len(data)
    accuracies[genre] = correct / total

print(accuracies)
print(accuracies.values())

{'Fantasy': 0.719056486654252, 'Adult': 0.7051520794537555, 'Historical': 0.7751707014276846, 'Roman': 0.7777777777777778, 'Romance': 0.7796399751707014, 'Young Adult': 0.7869646182495345, 'Historical Fiction': 0.7927995034140286, 'Science': 0.8058348851644941, 'Mystery': 0.8059590316573557, 'Contemporary': 0.8119180633147114, 'Thriller': 0.8356300434512725, 'Science Fiction': 0.8523898199875853, 'History': 0.8670391061452514, 'Adventure': 0.8793296089385475, 'Philosophy': 0.8797020484171322, 'Biography': 0.8718808193668529, 'Crime': 0.8835505896958411, 'Self Help': 0.8754810676598386, 'Psychology': 0.8811918063314711, 'Mystery Thriller': 0.8860335195530726, 'Memoir': 0.8901303538175046, 'Childrens': 0.891495965238982, 'Humor': 0.9006828057107387, 'Suspense': 0.9016759776536313, 'Horror': 0.9062693978895097}
dict_values([0.719056486654252, 0.7051520794537555, 0.7751707014276846, 0.7777777777777778, 0.7796399751707014, 0.7869646182495345, 0.7927995034140286, 0.8058348851644941, 0.805959

In [13]:
# Get accuracy, precision, recall, and F1 score for each genre
metrics = {}
for genre in genres:
    true_positives = ((predictions[genre] == 1) & (data[genre] == 1)).sum()
    false_positives = ((predictions[genre] == 1) & (data[genre] == 0)).sum()
    false_negatives = ((predictions[genre] == 0) & (data[genre] == 1)).sum()
    true_negatives = ((predictions[genre] == 0) & (data[genre] == 0)).sum()
    accuracy = (true_positives + true_negatives) / len(data)
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    f1 = 2 * precision * recall / (precision + recall)
    metrics[genre] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1': f1}

print(metrics)

{'Fantasy': {'Accuracy': 0.719056486654252, 'Precision': 0.23208191126279865, 'Recall': 0.032288698955365625, 'F1': 0.05669028761984161}, 'Adult': {'Accuracy': 0.7051520794537555, 'Precision': 0.26253687315634217, 'Recall': 0.04019873532068654, 'F1': 0.06972189580885234}, 'Historical': {'Accuracy': 0.7751707014276846, 'Precision': 0.18248175182481752, 'Recall': 0.030543677458766034, 'F1': 0.05232862375719518}, 'Roman': {'Accuracy': 0.7777777777777778, 'Precision': 0.17374517374517376, 'Recall': 0.027760641579272053, 'F1': 0.047872340425531915}, 'Romance': {'Accuracy': 0.7796399751707014, 'Precision': 0.1930379746835443, 'Recall': 0.038583175205566096, 'F1': 0.06431207169214549}, 'Young Adult': {'Accuracy': 0.7869646182495345, 'Precision': 0.1619718309859155, 'Recall': 0.030183727034120734, 'F1': 0.05088495575221239}, 'Historical Fiction': {'Accuracy': 0.7927995034140286, 'Precision': 0.18027210884353742, 'Recall': 0.03578663065496286, 'F1': 0.059718309859154925}, 'Science': {'Accuracy'

The below would be used to compare different books based on similarities in the hidden layer outputs; however, we ran out of time to implement this.

In [None]:
# Function to get the hidden layer outputs for a given book description
def get_hidden_outputs(description):
    # Tokenize the description
    description = nltk.word_tokenize(description.lower())
    # Convert the description to a list of integers
    description = words2ints(description)
    # Pad the description
    description = pad_description(description)
    # Convert the description to a tensor
    description = torch.tensor(description).unsqueeze(0)
    # Send the description to the device
    description = description.to(device)
    # Send the description through the model
    model.eval()
    with torch.no_grad():
        hidden_outputs = model.embedding(description)
        hidden_outputs, _ = model.rnn(hidden_outputs)
    return hidden_outputs