In [1]:
# Import PyTorch
import torch
# Get pandas for data manipulation
import pandas as pd
# Import nltk for text processing
import nltk
# Import os for file manipulation
import os
# Import train_test_split from sklearn
from sklearn.model_selection import train_test_split
# Import LabelEncoder from sklearn for encoding labels
from sklearn.preprocessing import LabelEncoder

In [2]:
# Set device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Load the data from Datasets/goodreads_data_onehot_genres.csv
data = pd.read_csv('Datasets/goodreads_data_onehot_genres.csv')
# Show the first few rows of the data
print(data.head())
print(data.shape)

                                                Book  \
0                              To Kill a Mockingbird   
1  Harry Potter and the Philosopher’s Stone (Harr...   
2                                Pride and Prejudice   
3                          The Diary of a Young Girl   
4                                  The Little Prince   

                                         Description  Fantasy  Adult  \
0  The unforgettable novel of a childhood in a sl...        0      1   
1  Harry Potter thinks he is an ordinary boy - un...        1      1   
2  Since its immediate success in 1813, Pride and...        0      0   
3  Discovered in the attic in which she spent the...        0      0   
4  A pilot stranded in the desert awakes one morn...        1      1   

   Historical  Roman  Romance  Young Adult  Historical Fiction  Science  ...  \
0           1      0        0            1                   1        0  ...   
1           0      0        0            1                   0        

In [4]:
# Create a dataframe with the plot column tokenized and lowercased
tokenizeddf = data.copy()
tokenizeddf['Description'] = tokenizeddf['Description'].apply(lambda x: nltk.word_tokenize(x.lower()))

In [5]:
# Get the length of the longest plot
maxlen = tokenizeddf['Description'].apply(len).max()

# Get the set of all words in the plot column
wordset = set()
for plot in tokenizeddf['Description']:
    wordset.update(plot)
# Get the number of unique words
numwords = len(wordset)
print("Number of unique words: ", numwords)

# Create a dictionary that maps words to integers
word2int = {word: i for i, word in enumerate(wordset)}

# Function to convert a list of words to a list of integers
def words2ints(words):
    # Run through each word in the list
    ints = []
    for word in words:
        # If the word is in the dictionary, add the integer to the list
        if word in word2int:
            ints.append(word2int[word])
    return ints

Number of unique words:  80123


In [6]:
# Convert the Description column to a list of integers
tokenizeddf['Description'] = tokenizeddf['Description'].apply(words2ints)

# Pad the sequences to the maximum length
def pad_description(description):
    return description + [0] * (maxlen - len(description))

tokenizeddf['Description'] = tokenizeddf['Description'].apply(pad_description)

In [10]:
# Get the list of genres
genres = data.columns[2:]
print(genres)
numgenres = len(genres)

# Create a label encoder
le = LabelEncoder()

Index(['Fantasy', 'Adult', 'Historical', 'Roman', 'Romance', 'Young Adult',
       'Historical Fiction', 'Science', 'Mystery', 'Contemporary', 'Thriller',
       'Science Fiction', 'History', 'Adventure', 'Philosophy', 'Biography',
       'Crime', 'Self Help', 'Psychology', 'Mystery Thriller', 'Memoir',
       'Childrens', 'Humor', 'Suspense', 'Horror'],
      dtype='object')


In [97]:
# Dataset class to feed the data to the model
class RNNDataset(torch.utils.data.Dataset):
    def __init__(self, data, labels):
        self.data = data
        # One-hot encode the labels
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]
    
# Dataloader to load the data in batches
class RNNDataloader(torch.utils.data.DataLoader):
    def __init__(self, dataset, batch_size=32, shuffle=True):
        super().__init__(dataset, batch_size=batch_size, shuffle=shuffle)

# RNN model
class RNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.rnn = torch.nn.RNN(hidden_size, hidden_size)
        self.fc = torch.nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.rnn(x)
        x = self.fc(x)
        return torch.sigmoid(x)

In [103]:
# Create the model
model = RNN(numwords, 2048, numgenres).to(device)

# Loss function
criterion = torch.nn.CrossEntropyLoss()
# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Convert the data to tensors
tokenizeddf['Description'] = tokenizeddf['Description'].apply(lambda x: torch.tensor(x))

# Split the data into training and testing sets
traindata, testdata = train_test_split(tokenizeddf, test_size=0.2, random_state=42)

# Create the training and testing datasets
traindataset = RNNDataset(traindata['Description'].values, traindata[genres].values)
testdataset = RNNDataset(testdata['Description'].values, testdata[genres].values)
                          
# Create the training and testing dataloaders
traindataloader = RNNDataloader(traindataset, batch_size=32)
testdataloader = RNNDataloader(testdataset, batch_size=32)

# Create a dataloader for the full dataset
fulldataset = RNNDataset(tokenizeddf['Description'].values, tokenizeddf[genres].values)
fulldataloader = RNNDataloader(fulldataset, batch_size=32)

# Training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    for i, (inputs, labels) in enumerate(traindataloader):
        # Send the data to the device
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Zero the gradients
        optimizer.zero_grad()
        # Get the outputs
        outputs = model(inputs)
        # Calculate the loss
        loss = criterion(outputs, labels.long())
        # Backpropagate
        loss.backward()
        # Optimize
        optimizer.step()
        if i % 100 == 0:
            print(f"Epoch {epoch}, Iteration {i}, Loss: {loss.item()}")

model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in traindataloader:
        # Send the data to the device
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Get the outputs
        outputs = model(inputs)
        # Get the predicted labels
        _, predicted = torch.max(outputs, 1)
        # Get the total number of labels and the number of correct labels
        total += labels.size(0) * labels.size(1)
        correct += (predicted == labels).sum().item()
print(f"Accuracy: {correct / total}")

  tokenizeddf['Description'] = tokenizeddf['Description'].apply(lambda x: torch.tensor(x))


Epoch 0, Iteration 0, Loss: 7.372584342956543
Epoch 0, Iteration 100, Loss: 6.7762956619262695
Epoch 0, Iteration 200, Loss: 6.434518814086914
Epoch 1, Iteration 0, Loss: 6.582366943359375
Epoch 1, Iteration 100, Loss: 6.5176520347595215
Epoch 1, Iteration 200, Loss: 6.408897876739502
Epoch 2, Iteration 0, Loss: 6.458353042602539
Epoch 2, Iteration 100, Loss: 6.436894416809082
Epoch 2, Iteration 200, Loss: 6.5322136878967285
Epoch 3, Iteration 0, Loss: 6.419294357299805
Epoch 3, Iteration 100, Loss: 6.415324687957764
Epoch 3, Iteration 200, Loss: 6.409303665161133
Epoch 4, Iteration 0, Loss: 6.499971866607666
Epoch 4, Iteration 100, Loss: 6.4240546226501465
Epoch 4, Iteration 200, Loss: 6.445446014404297
Accuracy: 0.836139044072005


In [104]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in testdataloader:
        # Send the data to the device
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Get the outputs
        outputs = model(inputs)
        # Get the predicted labels
        _, predicted = torch.max(outputs, 1)
        # Get the total number of labels and the number of correct labels
        total += labels.size(0) * labels.size(1)
        correct += (predicted == labels).sum().item()
print(f"Accuracy: {correct / total}")

Accuracy: 0.7919801365611422


In [106]:
# Evaluate the accuracy for each genre
# Copy the book and description columns into a new dataframe
predictions = data[['Book', 'Description']].copy()

# Add blank columns for the genres
for genre in genres:
    predictions[genre] = None

# Get the predictions for each book
with torch.no_grad():
    for i, (inputs, labels) in enumerate(fulldataloader):
        # Send the data to the device
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Get the outputs
        outputs = model(inputs)
        # Get the predicted labels
        _, predicted = torch.max(outputs, 1)
        # Add the predictions to the dataframe
        for j, genre in enumerate(genres):
            predictions[genre].iloc[i * 32:(i + 1) * 32] = predicted[:, j].cpu().numpy()
# For any entry where the prediction is greater than 0.5, set it to 1, otherwise set it to 0
predictions[genres] = predictions[genres].applymap(lambda x: 1 if x > 0.5 else 0)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  predictions[genre].iloc[i * 32:(i + 1) * 32] = predicted[:, j].cpu().numpy()
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never wor

In [109]:
# Get the accuracy for each genre
accuracies = {}
for genre in genres:
    correct = (predictions[genre] == data[genre]).sum()
    total = len(data)
    accuracies[genre] = correct / total

print(accuracies)
print(accuracies.values())

{'Fantasy': 0.7179391682184978, 'Adult': 0.7070142768466791, 'Historical': 0.7698324022346369, 'Roman': 0.7749224084419615, 'Romance': 0.7792675356921167, 'Young Adult': 0.7812538795779019, 'Historical Fiction': 0.7891992551210428, 'Science': 0.8009931719428927, 'Mystery': 0.7975170701427685, 'Contemporary': 0.8094351334574799, 'Thriller': 0.8245810055865922, 'Science Fiction': 0.8446927374301676, 'History': 0.8578522656734947, 'Adventure': 0.8666666666666667, 'Philosophy': 0.8725015518311607, 'Biography': 0.8674115456238362, 'Crime': 0.8769708255741775, 'Self Help': 0.8809435133457479, 'Psychology': 0.8816883923029174, 'Mystery Thriller': 0.8742396027312228, 'Memoir': 0.8759776536312849, 'Childrens': 0.88504034761018, 'Humor': 0.8917442582247052, 'Suspense': 0.8943513345747982, 'Horror': 0.8990689013035382}
dict_values([0.7179391682184978, 0.7070142768466791, 0.7698324022346369, 0.7749224084419615, 0.7792675356921167, 0.7812538795779019, 0.7891992551210428, 0.8009931719428927, 0.79751

In [110]:
# Get accuracy, precision, recall, and F1 score for each genre
metrics = {}
for genre in genres:
    true_positives = ((predictions[genre] == 1) & (data[genre] == 1)).sum()
    false_positives = ((predictions[genre] == 1) & (data[genre] == 0)).sum()
    false_negatives = ((predictions[genre] == 0) & (data[genre] == 1)).sum()
    true_negatives = ((predictions[genre] == 0) & (data[genre] == 0)).sum()
    accuracy = (true_positives + true_negatives) / len(data)
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    f1 = 2 * precision * recall / (precision + recall)
    metrics[genre] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1': f1}

print(metrics)

{'Fantasy': {'Accuracy': 0.7179391682184978, 'Precision': 0.26420454545454547, 'Recall': 0.04415954415954416, 'F1': 0.07567127746135069}, 'Adult': {'Accuracy': 0.7070142768466791, 'Precision': 0.27607361963190186, 'Recall': 0.04065040650406504, 'F1': 0.07086614173228346}, 'Historical': {'Accuracy': 0.7698324022346369, 'Precision': 0.20273972602739726, 'Recall': 0.04520464263897373, 'F1': 0.07392607392607393}, 'Roman': {'Accuracy': 0.7749224084419615, 'Precision': 0.22413793103448276, 'Recall': 0.048118445404071564, 'F1': 0.07922803453529711}, 'Romance': {'Accuracy': 0.7792675356921167, 'Precision': 0.2144927536231884, 'Recall': 0.04680581910183428, 'F1': 0.07684319833852545}, 'Young Adult': {'Accuracy': 0.7812538795779019, 'Precision': 0.16, 'Recall': 0.03674540682414698, 'F1': 0.059765208110992535}, 'Historical Fiction': {'Accuracy': 0.7891992551210428, 'Precision': 0.18181818181818182, 'Recall': 0.04186360567184335, 'F1': 0.06805708013172337}, 'Science': {'Accuracy': 0.80099317194289

: 

In [None]:
# Function to get the hidden layer outputs for a given book description
def get_hidden_outputs(description):
    # Tokenize the description
    description = nltk.word_tokenize(description.lower())
    # Convert the description to a list of integers
    description = words2ints(description)
    # Pad the description
    description = pad_description(description)
    # Convert the description to a tensor
    description = torch.tensor(description).unsqueeze(0)
    # Send the description to the device
    description = description.to(device)
    # Send the description through the model
    model.eval()
    with torch.no_grad():
        hidden_outputs = model.embedding(description)
        hidden_outputs, _ = model.rnn(hidden_outputs)
    return hidden_outputs

Below this point is old code from the prior version of the sheet, being removed as it's made irrelevant

In [185]:
# Dataset class for genres
class GenreDataset(torch.utils.data.Dataset):
    def __init__(self, data, genre):
        self.plot = data["Description"].values
        self.genre = data[genre].values

    def __len__(self):
        return len(self.plot)
    
    def __getitem__(self, i):
        plot = self.plot[i]
        genre = self.genre[i]
        return torch.tensor(plot, dtype=torch.long), torch.tensor(genre, dtype=torch.float)
    
# Dataloader class for genres
class GenreDataLoader(torch.utils.data.DataLoader):
    def __init__(self, dataset, batch_size=32, shuffle=True):
        super().__init__(dataset, batch_size=batch_size, shuffle=shuffle)

# RNN class for genres
# Create the RNN to classify the plots as action or not
class GenreRNN(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size):
        super(GenreRNN, self).__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.rnn = torch.nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, output_size)
        self.sigmoid = torch.nn.Sigmoid()
        self.hidden_dim = hidden_dim
    
    def forward(self, x):
        x = self.embedding(x)
        h0 = torch.zeros(1, x.size(0), self.hidden_dim).to(x.device)
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])        
        return out

In [197]:
# Function to train a model
def train_genre_rnn(genre, epochs, embed, hidden, output):
    genremodel = GenreRNN(numwords, embed, hidden, output)
    # Extract the relevant columns from the dataframe
    genredf = tokenizeddf[['Book', 'Description', genre]]
    # Encode the genre column
    genredf[genre] = le.fit_transform(genredf[genre])
    # Train test split the data
    # We won't use the test data in this function
    genretrain, genretest = train_test_split(genredf, test_size=0.2, random_state=42)
    # Create a GenreDataset object
    genretraindataset = GenreDataset(genretrain, genre)
    # Create a DataLoader
    genretrainloader = GenreDataLoader(genretraindataset, batch_size=32, shuffle=True)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(genremodel.parameters(), lr=0.01)

    # Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Send the model to the device
    genremodel.to(device)

    # Train the model
    num_epochs = epochs
    for epoch in range(num_epochs):
        epoch_loss = 0
        genremodel.train()
        for plots, genres in genretrainloader:
            # Send the data to the device
            plots = plots.to(device)
            genres = genres.to(device)
            outputs = genremodel(plots)
            loss = criterion(outputs, genres.long())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

    # Return the model
    return genremodel

In [198]:
# Save a model
def save_model(model, genre):
    if not os.path.exists("models/rnn"):
        os.makedirs("models/rnn")
    torch.save(model.state_dict(), f"models/rnn/{genre}modelgoodreads.pth")

In [188]:
# Get all the genres
genres = data.columns[2:]
print(genres)

Index(['Fantasy', 'Adult', 'Historical', 'Roman', 'Romance', 'Young Adult',
       'Historical Fiction', 'Science', 'Mystery', 'Contemporary', 'Thriller',
       'Science Fiction', 'History', 'Adventure', 'Philosophy', 'Biography',
       'Crime', 'Self Help', 'Psychology', 'Mystery Thriller', 'Memoir',
       'Childrens', 'Humor', 'Suspense', 'Horror'],
      dtype='object')


In [199]:
# Train a model for every genre
for genre in genres:
    print(f"Training model for {genre}")
    model = train_genre_rnn(genre, 10, 128, 128, 2)
    save_model(model, genre)

Training model for Fantasy


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.45378658175468445
Epoch 2/10, Loss: 0.6822710633277893
Epoch 3/10, Loss: 0.7687479853630066
Epoch 4/10, Loss: 0.6806064248085022
Epoch 5/10, Loss: 0.4537822902202606
Epoch 6/10, Loss: 0.637323796749115
Epoch 7/10, Loss: 0.49676191806793213
Epoch 8/10, Loss: 0.37705138325691223
Epoch 9/10, Loss: 0.5785762071609497
Epoch 10/10, Loss: 0.7937762141227722
Training model for Adult


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.5647515058517456
Epoch 2/10, Loss: 0.4574686288833618
Epoch 3/10, Loss: 0.5970885157585144
Epoch 4/10, Loss: 0.5634920001029968
Epoch 5/10, Loss: 0.6033783555030823
Epoch 6/10, Loss: 0.46666255593299866
Epoch 7/10, Loss: 0.4614276587963104
Epoch 8/10, Loss: 0.9936614036560059
Epoch 9/10, Loss: 0.8872262835502625
Epoch 10/10, Loss: 0.5059335231781006
Training model for Historical


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.29311272501945496
Epoch 2/10, Loss: 0.6253640055656433
Epoch 3/10, Loss: 0.6202725172042847
Epoch 4/10, Loss: 0.4998610019683838
Epoch 5/10, Loss: 0.2873847484588623
Epoch 6/10, Loss: 0.4518500864505768
Epoch 7/10, Loss: 0.7651438117027283
Epoch 8/10, Loss: 0.5673539638519287
Epoch 9/10, Loss: 0.9360573291778564
Epoch 10/10, Loss: 0.4909355640411377
Training model for Roman


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.5149520039558411
Epoch 2/10, Loss: 0.49177050590515137
Epoch 3/10, Loss: 0.4586270749568939
Epoch 4/10, Loss: 0.71673983335495
Epoch 5/10, Loss: 0.47216737270355225
Epoch 6/10, Loss: 0.6425043940544128
Epoch 7/10, Loss: 0.7707116007804871
Epoch 8/10, Loss: 0.6036862134933472
Epoch 9/10, Loss: 0.41404202580451965
Epoch 10/10, Loss: 0.5841596722602844
Training model for Romance


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.5151139497756958
Epoch 2/10, Loss: 0.8410816788673401
Epoch 3/10, Loss: 1.1159148216247559
Epoch 4/10, Loss: 0.6546376347541809


In [None]:
# Test the models
# Returns the accuracy, precision, recall, and F1 score of the model on the test data
def test_models(genres):
    # Create a dictionary to store the metrics
    results = {}
    # Run through each genre
    for genre in genres:
        # Load the model from the file (if it exists)
        if os.path.exists(f"models/rnn/{genre}modelgoodreads.pth"):
            model = GenreRNN(numwords, 128, 128, 2)
            model.load_state_dict(torch.load(f"models/rnn/{genre}modelgoodreads.pth"))
            model.eval()
            # Extract the relevant columns from the dataframe
            genredf = tokenizeddf[['Book', 'Description', genre]]
            # Encode the genre column
            genredf[genre] = le.fit_transform(genredf[genre])
            # Train test split the data
            genretrain, genretest = train_test_split(genredf, test_size=0.3, random_state=42)
            genretestdataset = GenreDataset(genretest, genre)
            genretestloader = GenreDataLoader(genretestdataset, batch_size=32, shuffle=True)
            # Set the device
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            # Send the model to the device
            model.to(device)
            # Initialize the metrics
            tp = 0
            tn = 0
            fp = 0
            fn = 0
            # Turn off gradients
            with torch.no_grad():
                for plots, genres in genretestloader:
                    plots = plots.to(device)
                    genres = genres.to(device)
                    # Get the outputs
                    outputs = model(plots)
                    _, preds = torch.max(outputs, 1)
                    # Increment the metrics
                    newtp = torch.sum((preds == 1) & (genres == 1)).item()
                    newtn = torch.sum((preds == 0) & (genres == 0)).item()
                    newfp = torch.sum((preds == 1) & (genres == 0)).item()
                    newfn = torch.sum((preds == 0) & (genres == 1)).item()
                    tp += newtp
                    tn += newtn
                    fp += newfp
                    fn += newfn
            # Calculate the metrics
            accuracy = (tp + tn) / (tp + tn + fp + fn)
            precision = tp / (tp + fp) if tp + fp != 0 else 0
            recall = tp / (tp + fn) if tp + fn != 0 else 0
            f1 = 2 * precision * recall / (precision + recall) if precision + recall != 0 else 0                    
            results[genre] = { "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1": f1 }
    return results

In [191]:
# Write the results to a file
results = test_models(genres)
with open("goodreads_rnn_results.txt", "w") as f:
    for genre, metrics in results.items():
        f.write(f"{genre}:\n")
        for metric, value in metrics.items():
            f.write(f"{metric}: {value}\n")
        f.write("\n")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])
A value is trying to be set on a copy of a slice from a

In [192]:
# Predict the probability that a plot is a certain genre
def score_genre(description,genre):
    # Get the tokenized version of the plot
    tokenplot = nltk.word_tokenize(description.lower())
    tokenplot = words2ints(tokenplot)
    tokenplot = pad_description(tokenplot)
    # Check if the model exists and load it
    if not os.path.exists(f"models/rnn/{genre}modelgoodreads.pth"):
        print("Model does not exist")
        return None
    model = GenreRNN(numwords, 128, 128, 2)
    model.load_state_dict(torch.load(f"models/rnn/{genre}modelgoodreads.pth"))
    model.eval()
    # Convert the plot to a tensor
    tokenplot = torch.tensor(tokenplot, dtype=torch.long).unsqueeze(0)
    # Get the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Send the plot and the model to the device
    tokenplot = tokenplot.to(device)
    model.to(device)
    # Get the output of the model
    output = model(tokenplot)
    # Turn the output into a probability that the plot is the genre
    prob = torch.nn.functional.softmax(output, dim=1)[:, 1]
    return prob.item()

# Test the function
print(score_genre(data['Description'][0], 'Fantasy'))

0.24978426098823547


In [193]:
# Function to get the genre score for a plot
def predict_genres(description, genre):
    # Tokenize and pad the plot
    description = nltk.word_tokenize(description.lower())
    description = words2ints(description)
    description = pad_description(description)
    description = torch.tensor(description, dtype=torch.long).unsqueeze(0)
    # Check if the model exists
    if not os.path.exists(f"models/rnn/{genre}modelgoodreads.pth"):
        print(f"Model for {genre} does not exist")
        return None
    #print(f"Predicting {genre}")
    # Load the model
    model = GenreRNN(numwords, 128, 128, 2)
    model.load_state_dict(torch.load(f"models/rnn/{genre}modelgoodreads.pth"))
    # Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Send the model to the device
    model.to(device)
    # Send the plot to the device
    description = description.to(device)
    # Turn off gradients
    with torch.no_grad():
        # Get the output
        output = model(description)
        _, preds = torch.max(output, 1)
        return preds.item()

In [194]:
# Create a new dataframe with the Book and Description columns
genre_scores = data[['Book', 'Description']]
# Add a new column for the genre scores
for genre in genres:
    genre_scores[genre] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_scores[genre] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_scores[genre] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_scores[genre] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the c

In [195]:
# Predict genres for the first book
row = genre_scores.iloc[0]
for genre in genres:
    # Predict the genre score for the plot
    score = score_genre(row['Description'], genre)
    # Print the score and genre
    print(f"{genre}: {score}")
    # Overwrite the genre score in the dataframe
    genre_scores.at[0, genre] = score

row = genre_scores.iloc[1]
for genre in genres:
    # Predict the genre score for the plot
    score = score_genre(row['Description'], genre)
    # Print the score and genre
    print(f"{genre}: {score}")
    # Overwrite the genre score in the dataframe
    genre_scores.at[1, genre] = score

print(data['Description'][0])
print(data['Description'][1])

print(score_genre(data['Description'][0], 'Fantasy'))
print(score_genre(data['Description'][1], 'Fantasy'))

Fantasy: 0.24978426098823547
Adult: 0.2589132487773895


  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score


Historical: 0.21866098046302795
Roman: 0.174833282828331
Romance: 0.1999729573726654
Young Adult: 0.21275590360164642


  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score


Historical Fiction: 0.17600159347057343
Science: 0.1967347264289856


  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score


Mystery: 0.1720680594444275
Contemporary: 0.1538202315568924


  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score


Thriller: 0.1474837064743042
Science Fiction: 0.11311233043670654


  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score


History: 0.10035471618175507
Adventure: 0.08986809104681015


  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score


Philosophy: 0.0732497125864029


  genre_scores.at[0, genre] = score


Biography: 0.09064341336488724
Crime: 0.0889410600066185


  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score


Self Help: 0.09301909804344177


  genre_scores.at[0, genre] = score


Psychology: 0.08704997599124908
Mystery Thriller: 0.08307088166475296
Memoir: 0.07320615649223328


  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score


Childrens: 0.06173611432313919


  genre_scores.at[0, genre] = score


Humor: 0.07380232959985733
Suspense: 0.07185681164264679
Horror: 0.05504041910171509


  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score


Fantasy: 0.24978426098823547
Adult: 0.25891321897506714
Historical: 0.21866096556186676
Roman: 0.1748332679271698
Romance: 0.1999729573726654
Young Adult: 0.21275590360164642
Historical Fiction: 0.17600159347057343
Science: 0.1967347264289856
Mystery: 0.1720680594444275
Contemporary: 0.1538202315568924
Thriller: 0.1474837064743042
Science Fiction: 0.11311233043670654
History: 0.10035469383001328
Adventure: 0.08986809104681015
Philosophy: 0.0732497125864029
Biography: 0.09064339101314545
Crime: 0.0889410600066185
Self Help: 0.09301909804344177
Psychology: 0.08704997599124908
Mystery Thriller: 0.08307089656591415
Memoir: 0.07320615649223328
Childrens: 0.061736129224300385
Humor: 0.07380231469869614
Suspense: 0.07185681164264679
Horror: 0.05504041910171509
The unforgettable novel of a childhood in a sleepy Southern town and the crisis of conscience that rocked it. "To Kill A Mockingbird" became both an instant bestseller and a critical success when it was first published in 1960. It went 

In [196]:
print(genre_scores.head())

                                                Book  \
0                              To Kill a Mockingbird   
1  Harry Potter and the Philosopher’s Stone (Harr...   
2                                Pride and Prejudice   
3                          The Diary of a Young Girl   
4                                  The Little Prince   

                                         Description   Fantasy     Adult  \
0  The unforgettable novel of a childhood in a sl...  0.249784  0.258913   
1  Harry Potter thinks he is an ordinary boy - un...  0.249784  0.258913   
2  Since its immediate success in 1813, Pride and...  0.000000  0.000000   
3  Discovered in the attic in which she spent the...  0.000000  0.000000   
4  A pilot stranded in the desert awakes one morn...  0.000000  0.000000   

   Historical     Roman   Romance  Young Adult  Historical Fiction   Science  \
0    0.218661  0.174833  0.199973     0.212756            0.176002  0.196735   
1    0.218661  0.174833  0.199973     0.212756

In [131]:
# Score each plot for each genre
for genre in genres:
    print(f"Scoring {genre}")
    genre_scores[genre] = genre_scores['Description'].apply(lambda x: score_genre(x, genre))

Scoring Fantasy
Scoring Adult
Scoring Historical
Scoring Roman
Scoring Romance
Scoring Young Adult
Scoring Historical Fiction
Scoring Science
Scoring Mystery
Scoring Contemporary
Scoring Thriller
Scoring Science Fiction
Scoring History
Scoring Adventure
Scoring Philosophy
Scoring Biography
Scoring Crime
Scoring Self Help
Scoring Psychology
Scoring Mystery Thriller
Scoring Memoir
Scoring Childrens
Scoring Humor
Scoring Suspense
Scoring Horror


In [132]:
# Save the dataframe to a CSV file
genre_scores.to_csv('Datasets/goodreads_rnn_genre_scores.csv', index=False)