In [178]:
# Import PyTorch
import torch
# Get pandas for data manipulation
import pandas as pd
# Import nltk for text processing
import nltk
# Import os for file manipulation
import os
# Import train_test_split from sklearn
from sklearn.model_selection import train_test_split
# Import LabelEncoder from sklearn for encoding labels
from sklearn.preprocessing import LabelEncoder

In [179]:
# Set device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [180]:
# Load the data from Datasets/goodreads_data_onehot_genres.csv
data = pd.read_csv('Datasets/goodreads_data_onehot_genres.csv')
# Show the first few rows of the data
print(data.head())
print(data.shape)

                                                Book  \
0                              To Kill a Mockingbird   
1  Harry Potter and the Philosopher’s Stone (Harr...   
2                                Pride and Prejudice   
3                          The Diary of a Young Girl   
4                                  The Little Prince   

                                         Description  Fantasy  Adult  \
0  The unforgettable novel of a childhood in a sl...        0      1   
1  Harry Potter thinks he is an ordinary boy - un...        1      1   
2  Since its immediate success in 1813, Pride and...        0      0   
3  Discovered in the attic in which she spent the...        0      0   
4  A pilot stranded in the desert awakes one morn...        1      1   

   Historical  Roman  Romance  Young Adult  Historical Fiction  Science  ...  \
0           1      0        0            1                   1        0  ...   
1           0      0        0            1                   0        

In [181]:
# Create a dataframe with the plot column tokenized and lowercased
tokenizeddf = data.copy()
tokenizeddf['Description'] = tokenizeddf['Description'].apply(lambda x: nltk.word_tokenize(x.lower()))

In [182]:
# Get the length of the longest plot
maxlen = tokenizeddf['Description'].apply(len).max()

# Get the set of all words in the plot column
wordset = set()
for plot in tokenizeddf['Description']:
    wordset.update(plot)
# Get the number of unique words
numwords = len(wordset)
print("Number of unique words: ", numwords)

# Create a dictionary that maps words to integers
word2int = {word: i for i, word in enumerate(wordset)}

# Function to convert a list of words to a list of integers
def words2ints(words):
    # Run through each word in the list
    ints = []
    for word in words:
        # If the word is in the dictionary, add the integer to the list
        if word in word2int:
            ints.append(word2int[word])
    return ints

Number of unique words:  80123


In [183]:
# Convert the Description column to a list of integers
tokenizeddf['Description'] = tokenizeddf['Description'].apply(words2ints)

# Pad the sequences to the maximum length
def pad_description(description):
    return description + [0] * (maxlen - len(description))

tokenizeddf['Description'] = tokenizeddf['Description'].apply(pad_description)

In [184]:
# Create a label encoder
le = LabelEncoder()

In [185]:
# Dataset class for genres
class GenreDataset(torch.utils.data.Dataset):
    def __init__(self, data, genre):
        self.plot = data["Description"].values
        self.genre = data[genre].values

    def __len__(self):
        return len(self.plot)
    
    def __getitem__(self, i):
        plot = self.plot[i]
        genre = self.genre[i]
        return torch.tensor(plot, dtype=torch.long), torch.tensor(genre, dtype=torch.float)
    
# Dataloader class for genres
class GenreDataLoader(torch.utils.data.DataLoader):
    def __init__(self, dataset, batch_size=32, shuffle=True):
        super().__init__(dataset, batch_size=batch_size, shuffle=shuffle)

# RNN class for genres
# Create the RNN to classify the plots as action or not
class GenreRNN(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size):
        super(GenreRNN, self).__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.rnn = torch.nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, output_size)
        self.sigmoid = torch.nn.Sigmoid()
        self.hidden_dim = hidden_dim
    
    def forward(self, x):
        x = self.embedding(x)
        h0 = torch.zeros(1, x.size(0), self.hidden_dim).to(x.device)
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])        
        return out

In [197]:
# Function to train a model
def train_genre_rnn(genre, epochs, embed, hidden, output):
    genremodel = GenreRNN(numwords, embed, hidden, output)
    # Extract the relevant columns from the dataframe
    genredf = tokenizeddf[['Book', 'Description', genre]]
    # Encode the genre column
    genredf[genre] = le.fit_transform(genredf[genre])
    # Train test split the data
    # We won't use the test data in this function
    genretrain, genretest = train_test_split(genredf, test_size=0.2, random_state=42)
    # Create a GenreDataset object
    genretraindataset = GenreDataset(genretrain, genre)
    # Create a DataLoader
    genretrainloader = GenreDataLoader(genretraindataset, batch_size=32, shuffle=True)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(genremodel.parameters(), lr=0.01)

    # Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Send the model to the device
    genremodel.to(device)

    # Train the model
    num_epochs = epochs
    for epoch in range(num_epochs):
        epoch_loss = 0
        genremodel.train()
        for plots, genres in genretrainloader:
            # Send the data to the device
            plots = plots.to(device)
            genres = genres.to(device)
            outputs = genremodel(plots)
            loss = criterion(outputs, genres.long())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

    # Return the model
    return genremodel

In [198]:
# Save a model
def save_model(model, genre):
    if not os.path.exists("models/rnn"):
        os.makedirs("models/rnn")
    torch.save(model.state_dict(), f"models/rnn/{genre}modelgoodreads.pth")

In [188]:
# Get all the genres
genres = data.columns[2:]
print(genres)

Index(['Fantasy', 'Adult', 'Historical', 'Roman', 'Romance', 'Young Adult',
       'Historical Fiction', 'Science', 'Mystery', 'Contemporary', 'Thriller',
       'Science Fiction', 'History', 'Adventure', 'Philosophy', 'Biography',
       'Crime', 'Self Help', 'Psychology', 'Mystery Thriller', 'Memoir',
       'Childrens', 'Humor', 'Suspense', 'Horror'],
      dtype='object')


In [199]:
# Train a model for every genre
for genre in genres:
    print(f"Training model for {genre}")
    model = train_genre_rnn(genre, 10, 128, 128, 2)
    save_model(model, genre)

Training model for Fantasy


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.45378658175468445
Epoch 2/10, Loss: 0.6822710633277893
Epoch 3/10, Loss: 0.7687479853630066
Epoch 4/10, Loss: 0.6806064248085022
Epoch 5/10, Loss: 0.4537822902202606
Epoch 6/10, Loss: 0.637323796749115
Epoch 7/10, Loss: 0.49676191806793213
Epoch 8/10, Loss: 0.37705138325691223
Epoch 9/10, Loss: 0.5785762071609497
Epoch 10/10, Loss: 0.7937762141227722
Training model for Adult


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.5647515058517456
Epoch 2/10, Loss: 0.4574686288833618
Epoch 3/10, Loss: 0.5970885157585144
Epoch 4/10, Loss: 0.5634920001029968
Epoch 5/10, Loss: 0.6033783555030823
Epoch 6/10, Loss: 0.46666255593299866
Epoch 7/10, Loss: 0.4614276587963104
Epoch 8/10, Loss: 0.9936614036560059
Epoch 9/10, Loss: 0.8872262835502625
Epoch 10/10, Loss: 0.5059335231781006
Training model for Historical


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.29311272501945496
Epoch 2/10, Loss: 0.6253640055656433
Epoch 3/10, Loss: 0.6202725172042847
Epoch 4/10, Loss: 0.4998610019683838
Epoch 5/10, Loss: 0.2873847484588623
Epoch 6/10, Loss: 0.4518500864505768
Epoch 7/10, Loss: 0.7651438117027283
Epoch 8/10, Loss: 0.5673539638519287
Epoch 9/10, Loss: 0.9360573291778564
Epoch 10/10, Loss: 0.4909355640411377
Training model for Roman


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.5149520039558411
Epoch 2/10, Loss: 0.49177050590515137
Epoch 3/10, Loss: 0.4586270749568939
Epoch 4/10, Loss: 0.71673983335495
Epoch 5/10, Loss: 0.47216737270355225
Epoch 6/10, Loss: 0.6425043940544128
Epoch 7/10, Loss: 0.7707116007804871
Epoch 8/10, Loss: 0.6036862134933472
Epoch 9/10, Loss: 0.41404202580451965
Epoch 10/10, Loss: 0.5841596722602844
Training model for Romance


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.5151139497756958
Epoch 2/10, Loss: 0.8410816788673401
Epoch 3/10, Loss: 1.1159148216247559
Epoch 4/10, Loss: 0.6546376347541809
Epoch 5/10, Loss: 0.16569066047668457
Epoch 6/10, Loss: 0.9244983792304993
Epoch 7/10, Loss: 0.4564204216003418
Epoch 8/10, Loss: 0.690135657787323
Epoch 9/10, Loss: 0.34406742453575134
Epoch 10/10, Loss: 0.31044402718544006
Training model for Young Adult


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.4865054190158844
Epoch 2/10, Loss: 1.2243188619613647
Epoch 3/10, Loss: 0.8710188269615173
Epoch 4/10, Loss: 0.45497456192970276
Epoch 5/10, Loss: 0.8093757629394531
Epoch 6/10, Loss: 0.39752450585365295
Epoch 7/10, Loss: 0.2951678931713104
Epoch 8/10, Loss: 0.2881593108177185
Epoch 9/10, Loss: 0.4539876878261566
Epoch 10/10, Loss: 0.3578566610813141
Training model for Historical Fiction


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.3319109380245209
Epoch 2/10, Loss: 0.45370957255363464
Epoch 3/10, Loss: 0.2910306453704834
Epoch 4/10, Loss: 0.6906499862670898
Epoch 5/10, Loss: 0.48542284965515137
Epoch 6/10, Loss: 0.30568259954452515
Epoch 7/10, Loss: 0.6504623889923096
Epoch 8/10, Loss: 0.28706321120262146
Epoch 9/10, Loss: 0.4705522060394287
Epoch 10/10, Loss: 0.49415531754493713
Training model for Science


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.6230500936508179
Epoch 2/10, Loss: 0.30243557691574097
Epoch 3/10, Loss: 0.6321523189544678
Epoch 4/10, Loss: 0.8322012424468994
Epoch 5/10, Loss: 0.2567015290260315
Epoch 6/10, Loss: 0.30686256289482117
Epoch 7/10, Loss: 0.4060761630535126
Epoch 8/10, Loss: 0.6118214726448059
Epoch 9/10, Loss: 0.27297279238700867
Epoch 10/10, Loss: 0.5573447346687317
Training model for Mystery


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.6353849172592163
Epoch 2/10, Loss: 0.4557631313800812
Epoch 3/10, Loss: 0.4510505497455597
Epoch 4/10, Loss: 0.7588467001914978
Epoch 5/10, Loss: 0.10329699516296387
Epoch 6/10, Loss: 0.5124954581260681
Epoch 7/10, Loss: 0.4626331627368927
Epoch 8/10, Loss: 0.6346685886383057
Epoch 9/10, Loss: 0.466503381729126
Epoch 10/10, Loss: 0.5868411064147949
Training model for Contemporary


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.7012696266174316
Epoch 2/10, Loss: 0.37231454253196716
Epoch 3/10, Loss: 0.3092804551124573
Epoch 4/10, Loss: 0.5628102421760559
Epoch 5/10, Loss: 0.4507284164428711
Epoch 6/10, Loss: 0.5957976579666138
Epoch 7/10, Loss: 0.5694186091423035
Epoch 8/10, Loss: 0.712331235408783
Epoch 9/10, Loss: 0.4772071838378906
Epoch 10/10, Loss: 0.28975188732147217
Training model for Thriller


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.45215126872062683
Epoch 2/10, Loss: 0.2872118055820465
Epoch 3/10, Loss: 0.32419779896736145
Epoch 4/10, Loss: 0.29216906428337097
Epoch 5/10, Loss: 0.636235237121582
Epoch 6/10, Loss: 0.29020634293556213
Epoch 7/10, Loss: 0.3002331852912903
Epoch 8/10, Loss: 0.6008893251419067
Epoch 9/10, Loss: 0.09735893458127975
Epoch 10/10, Loss: 0.11211501806974411
Training model for Science Fiction


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.6113893985748291
Epoch 2/10, Loss: 0.720554530620575
Epoch 3/10, Loss: 0.4515920877456665
Epoch 4/10, Loss: 0.14430464804172516
Epoch 5/10, Loss: 0.3136815130710602
Epoch 6/10, Loss: 0.6468546390533447
Epoch 7/10, Loss: 0.49475064873695374
Epoch 8/10, Loss: 0.09748243540525436
Epoch 9/10, Loss: 0.46927914023399353
Epoch 10/10, Loss: 0.3006616234779358
Training model for History


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.6457449197769165
Epoch 2/10, Loss: 0.029930224642157555
Epoch 3/10, Loss: 0.2908226549625397
Epoch 4/10, Loss: 0.08552569150924683
Epoch 5/10, Loss: 0.28829225897789
Epoch 6/10, Loss: 0.49617788195610046
Epoch 7/10, Loss: 0.30980584025382996
Epoch 8/10, Loss: 0.48320063948631287
Epoch 9/10, Loss: 0.6502717137336731
Epoch 10/10, Loss: 0.28869709372520447
Training model for Adventure


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.33752381801605225
Epoch 2/10, Loss: 0.45134496688842773
Epoch 3/10, Loss: 0.3512403964996338
Epoch 4/10, Loss: 0.29313015937805176
Epoch 5/10, Loss: 0.03971332684159279
Epoch 6/10, Loss: 0.45321670174598694
Epoch 7/10, Loss: 0.28710514307022095
Epoch 8/10, Loss: 0.37754830718040466
Epoch 9/10, Loss: 0.07466233521699905
Epoch 10/10, Loss: 0.45323488116264343
Training model for Philosophy


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.4525388479232788
Epoch 2/10, Loss: 0.6043381094932556
Epoch 3/10, Loss: 0.2877759039402008
Epoch 4/10, Loss: 0.29033657908439636
Epoch 5/10, Loss: 0.31320297718048096
Epoch 6/10, Loss: 0.4713670313358307
Epoch 7/10, Loss: 0.45058512687683105
Epoch 8/10, Loss: 0.33811965584754944
Epoch 9/10, Loss: 0.20176589488983154
Epoch 10/10, Loss: 0.43969643115997314
Training model for Biography


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.3776189982891083
Epoch 2/10, Loss: 0.20049922168254852
Epoch 3/10, Loss: 1.2615922689437866
Epoch 4/10, Loss: 0.2874080240726471
Epoch 5/10, Loss: 0.31599584221839905
Epoch 6/10, Loss: 0.45093536376953125
Epoch 7/10, Loss: 0.31837815046310425
Epoch 8/10, Loss: 0.0475519597530365
Epoch 9/10, Loss: 0.2868709862232208
Epoch 10/10, Loss: 0.05088996887207031
Training model for Crime


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.1394815742969513
Epoch 2/10, Loss: 0.12641267478466034
Epoch 3/10, Loss: 0.576673686504364
Epoch 4/10, Loss: 0.5426138043403625
Epoch 5/10, Loss: 0.11421238631010056
Epoch 6/10, Loss: 0.28731775283813477
Epoch 7/10, Loss: 0.4521934986114502
Epoch 8/10, Loss: 0.1322181522846222
Epoch 9/10, Loss: 0.8322693705558777
Epoch 10/10, Loss: 0.012282331474125385
Training model for Self Help


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.25756528973579407
Epoch 2/10, Loss: 0.28952816128730774
Epoch 3/10, Loss: 0.3249213397502899
Epoch 4/10, Loss: 0.45600631833076477
Epoch 5/10, Loss: 0.2952418625354767
Epoch 6/10, Loss: 0.342092365026474
Epoch 7/10, Loss: 0.2871703803539276
Epoch 8/10, Loss: 0.6610851287841797
Epoch 9/10, Loss: 0.2869451344013214
Epoch 10/10, Loss: 0.08186168223619461
Training model for Psychology


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.11157060414552689
Epoch 2/10, Loss: 0.31084880232810974
Epoch 3/10, Loss: 0.026697715744376183
Epoch 4/10, Loss: 0.1246049702167511
Epoch 5/10, Loss: 0.07692432403564453
Epoch 6/10, Loss: 0.12246023863554001
Epoch 7/10, Loss: 0.3002302944660187
Epoch 8/10, Loss: 0.04857495054602623
Epoch 9/10, Loss: 0.6479313969612122
Epoch 10/10, Loss: 0.29431694746017456
Training model for Mystery Thriller


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.28687340021133423
Epoch 2/10, Loss: 0.1652226746082306
Epoch 3/10, Loss: 0.021851375699043274
Epoch 4/10, Loss: 0.501840353012085
Epoch 5/10, Loss: 0.27834364771842957
Epoch 6/10, Loss: 0.6295259594917297
Epoch 7/10, Loss: 0.28729286789894104
Epoch 8/10, Loss: 0.5694481134414673
Epoch 9/10, Loss: 0.021269207820296288
Epoch 10/10, Loss: 0.04786849021911621
Training model for Memoir


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.07158476859331131
Epoch 2/10, Loss: 0.043863292783498764
Epoch 3/10, Loss: 0.467647522687912
Epoch 4/10, Loss: 0.09708786755800247
Epoch 5/10, Loss: 0.45350512862205505
Epoch 6/10, Loss: 0.3099457919597626
Epoch 7/10, Loss: 0.7255845665931702
Epoch 8/10, Loss: 0.6482374668121338
Epoch 9/10, Loss: 1.0109448432922363
Epoch 10/10, Loss: 0.4365375339984894
Training model for Childrens


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.3049910068511963
Epoch 2/10, Loss: 0.08975166082382202
Epoch 3/10, Loss: 0.2994544804096222
Epoch 4/10, Loss: 0.04471611604094505
Epoch 5/10, Loss: 0.6987318992614746
Epoch 6/10, Loss: 0.2894434928894043
Epoch 7/10, Loss: 0.30082932114601135
Epoch 8/10, Loss: 0.07083164900541306
Epoch 9/10, Loss: 0.29733866453170776
Epoch 10/10, Loss: 0.17405082285404205
Training model for Humor


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.3057411313056946
Epoch 2/10, Loss: 0.30704060196876526
Epoch 3/10, Loss: 0.5095469951629639
Epoch 4/10, Loss: 0.12641550600528717
Epoch 5/10, Loss: 0.10002268105745316
Epoch 6/10, Loss: 0.07427483052015305
Epoch 7/10, Loss: 0.29992905259132385
Epoch 8/10, Loss: 0.2993635833263397
Epoch 9/10, Loss: 1.0793366432189941
Epoch 10/10, Loss: 0.12552405893802643
Training model for Suspense


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.3958897888660431
Epoch 2/10, Loss: 0.015355576761066914
Epoch 3/10, Loss: 0.2890794277191162
Epoch 4/10, Loss: 0.2905731499195099
Epoch 5/10, Loss: 0.3008432984352112
Epoch 6/10, Loss: 0.36601749062538147
Epoch 7/10, Loss: 0.31811654567718506
Epoch 8/10, Loss: 0.05802286043763161
Epoch 9/10, Loss: 0.3655276596546173
Epoch 10/10, Loss: 0.33874285221099854
Training model for Horror


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.299934059381485
Epoch 2/10, Loss: 0.7591667175292969
Epoch 3/10, Loss: 0.06789998710155487
Epoch 4/10, Loss: 0.017646221444010735
Epoch 5/10, Loss: 0.09602639824151993
Epoch 6/10, Loss: 0.09561198204755783
Epoch 7/10, Loss: 0.28840336203575134
Epoch 8/10, Loss: 0.08470696955919266
Epoch 9/10, Loss: 0.3415755033493042
Epoch 10/10, Loss: 0.13521063327789307


In [200]:
# Test the models
# Returns the accuracy, precision, recall, and F1 score of the model on the test data
def test_models(genres):
    # Create a dictionary to store the metrics
    results = {}
    # Run through each genre
    for genre in genres:
        # Load the model from the file (if it exists)
        if os.path.exists(f"models/rnn/{genre}modelgoodreads.pth"):
            model = GenreRNN(numwords, 128, 128, 2)
            model.load_state_dict(torch.load(f"models/rnn/{genre}modelgoodreads.pth"))
            model.eval()
            # Extract the relevant columns from the dataframe
            genredf = tokenizeddf[['Book', 'Description', genre]]
            # Encode the genre column
            genredf[genre] = le.fit_transform(genredf[genre])
            # Train test split the data
            genretrain, genretest = train_test_split(genredf, test_size=0.3, random_state=42)
            genretestdataset = GenreDataset(genretest, genre)
            genretestloader = GenreDataLoader(genretestdataset, batch_size=32, shuffle=True)
            # Set the device
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            # Send the model to the device
            model.to(device)
            # Initialize the metrics
            tp = 0
            tn = 0
            fp = 0
            fn = 0
            # Turn off gradients
            with torch.no_grad():
                for plots, genres in genretestloader:
                    plots = plots.to(device)
                    genres = genres.to(device)
                    # Get the outputs
                    outputs = model(plots)
                    _, preds = torch.max(outputs, 1)
                    # Increment the metrics
                    newtp = torch.sum((preds == 1) & (genres == 1)).item()
                    newtn = torch.sum((preds == 0) & (genres == 0)).item()
                    newfp = torch.sum((preds == 1) & (genres == 0)).item()
                    newfn = torch.sum((preds == 0) & (genres == 1)).item()
                    tp += newtp
                    tn += newtn
                    fp += newfp
                    fn += newfn
            # Calculate the metrics
            accuracy = (tp + tn) / (tp + tn + fp + fn)
            precision = tp / (tp + fp) if tp + fp != 0 else 0
            recall = tp / (tp + fn) if tp + fn != 0 else 0
            f1 = 2 * precision * recall / (precision + recall) if precision + recall != 0 else 0                    
            results[genre] = { "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1": f1 }
    return results

In [191]:
# Write the results to a file
results = test_models(genres)
with open("goodreads_rnn_results.txt", "w") as f:
    for genre, metrics in results.items():
        f.write(f"{genre}:\n")
        for metric, value in metrics.items():
            f.write(f"{metric}: {value}\n")
        f.write("\n")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])
A value is trying to be set on a copy of a slice from a

In [192]:
# Predict the probability that a plot is a certain genre
def score_genre(description,genre):
    # Get the tokenized version of the plot
    tokenplot = nltk.word_tokenize(description.lower())
    tokenplot = words2ints(tokenplot)
    tokenplot = pad_description(tokenplot)
    # Check if the model exists and load it
    if not os.path.exists(f"models/rnn/{genre}modelgoodreads.pth"):
        print("Model does not exist")
        return None
    model = GenreRNN(numwords, 128, 128, 2)
    model.load_state_dict(torch.load(f"models/rnn/{genre}modelgoodreads.pth"))
    model.eval()
    # Convert the plot to a tensor
    tokenplot = torch.tensor(tokenplot, dtype=torch.long).unsqueeze(0)
    # Get the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Send the plot and the model to the device
    tokenplot = tokenplot.to(device)
    model.to(device)
    # Get the output of the model
    output = model(tokenplot)
    # Turn the output into a probability that the plot is the genre
    prob = torch.nn.functional.softmax(output, dim=1)[:, 1]
    return prob.item()

# Test the function
print(score_genre(data['Description'][0], 'Fantasy'))

0.24978426098823547


In [193]:
# Function to get the genre score for a plot
def predict_genres(description, genre):
    # Tokenize and pad the plot
    description = nltk.word_tokenize(description.lower())
    description = words2ints(description)
    description = pad_description(description)
    description = torch.tensor(description, dtype=torch.long).unsqueeze(0)
    # Check if the model exists
    if not os.path.exists(f"models/rnn/{genre}modelgoodreads.pth"):
        print(f"Model for {genre} does not exist")
        return None
    #print(f"Predicting {genre}")
    # Load the model
    model = GenreRNN(numwords, 128, 128, 2)
    model.load_state_dict(torch.load(f"models/rnn/{genre}modelgoodreads.pth"))
    # Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Send the model to the device
    model.to(device)
    # Send the plot to the device
    description = description.to(device)
    # Turn off gradients
    with torch.no_grad():
        # Get the output
        output = model(description)
        _, preds = torch.max(output, 1)
        return preds.item()

In [194]:
# Create a new dataframe with the Book and Description columns
genre_scores = data[['Book', 'Description']]
# Add a new column for the genre scores
for genre in genres:
    genre_scores[genre] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_scores[genre] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_scores[genre] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_scores[genre] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the c

In [195]:
# Predict genres for the first book
row = genre_scores.iloc[0]
for genre in genres:
    # Predict the genre score for the plot
    score = score_genre(row['Description'], genre)
    # Print the score and genre
    print(f"{genre}: {score}")
    # Overwrite the genre score in the dataframe
    genre_scores.at[0, genre] = score

row = genre_scores.iloc[1]
for genre in genres:
    # Predict the genre score for the plot
    score = score_genre(row['Description'], genre)
    # Print the score and genre
    print(f"{genre}: {score}")
    # Overwrite the genre score in the dataframe
    genre_scores.at[1, genre] = score

print(data['Description'][0])
print(data['Description'][1])

print(score_genre(data['Description'][0], 'Fantasy'))
print(score_genre(data['Description'][1], 'Fantasy'))

Fantasy: 0.24978426098823547
Adult: 0.2589132487773895


  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score


Historical: 0.21866098046302795
Roman: 0.174833282828331
Romance: 0.1999729573726654
Young Adult: 0.21275590360164642


  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score


Historical Fiction: 0.17600159347057343
Science: 0.1967347264289856


  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score


Mystery: 0.1720680594444275
Contemporary: 0.1538202315568924


  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score


Thriller: 0.1474837064743042
Science Fiction: 0.11311233043670654


  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score


History: 0.10035471618175507
Adventure: 0.08986809104681015


  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score


Philosophy: 0.0732497125864029


  genre_scores.at[0, genre] = score


Biography: 0.09064341336488724
Crime: 0.0889410600066185


  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score


Self Help: 0.09301909804344177


  genre_scores.at[0, genre] = score


Psychology: 0.08704997599124908
Mystery Thriller: 0.08307088166475296
Memoir: 0.07320615649223328


  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score


Childrens: 0.06173611432313919


  genre_scores.at[0, genre] = score


Humor: 0.07380232959985733
Suspense: 0.07185681164264679
Horror: 0.05504041910171509


  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score
  genre_scores.at[0, genre] = score


Fantasy: 0.24978426098823547
Adult: 0.25891321897506714
Historical: 0.21866096556186676
Roman: 0.1748332679271698
Romance: 0.1999729573726654
Young Adult: 0.21275590360164642
Historical Fiction: 0.17600159347057343
Science: 0.1967347264289856
Mystery: 0.1720680594444275
Contemporary: 0.1538202315568924
Thriller: 0.1474837064743042
Science Fiction: 0.11311233043670654
History: 0.10035469383001328
Adventure: 0.08986809104681015
Philosophy: 0.0732497125864029
Biography: 0.09064339101314545
Crime: 0.0889410600066185
Self Help: 0.09301909804344177
Psychology: 0.08704997599124908
Mystery Thriller: 0.08307089656591415
Memoir: 0.07320615649223328
Childrens: 0.061736129224300385
Humor: 0.07380231469869614
Suspense: 0.07185681164264679
Horror: 0.05504041910171509
The unforgettable novel of a childhood in a sleepy Southern town and the crisis of conscience that rocked it. "To Kill A Mockingbird" became both an instant bestseller and a critical success when it was first published in 1960. It went 

In [196]:
print(genre_scores.head())

                                                Book  \
0                              To Kill a Mockingbird   
1  Harry Potter and the Philosopher’s Stone (Harr...   
2                                Pride and Prejudice   
3                          The Diary of a Young Girl   
4                                  The Little Prince   

                                         Description   Fantasy     Adult  \
0  The unforgettable novel of a childhood in a sl...  0.249784  0.258913   
1  Harry Potter thinks he is an ordinary boy - un...  0.249784  0.258913   
2  Since its immediate success in 1813, Pride and...  0.000000  0.000000   
3  Discovered in the attic in which she spent the...  0.000000  0.000000   
4  A pilot stranded in the desert awakes one morn...  0.000000  0.000000   

   Historical     Roman   Romance  Young Adult  Historical Fiction   Science  \
0    0.218661  0.174833  0.199973     0.212756            0.176002  0.196735   
1    0.218661  0.174833  0.199973     0.212756

In [131]:
# Score each plot for each genre
for genre in genres:
    print(f"Scoring {genre}")
    genre_scores[genre] = genre_scores['Description'].apply(lambda x: score_genre(x, genre))

Scoring Fantasy
Scoring Adult
Scoring Historical
Scoring Roman
Scoring Romance
Scoring Young Adult
Scoring Historical Fiction
Scoring Science
Scoring Mystery
Scoring Contemporary
Scoring Thriller
Scoring Science Fiction
Scoring History
Scoring Adventure
Scoring Philosophy
Scoring Biography
Scoring Crime
Scoring Self Help
Scoring Psychology
Scoring Mystery Thriller
Scoring Memoir
Scoring Childrens
Scoring Humor
Scoring Suspense
Scoring Horror


In [132]:
# Save the dataframe to a CSV file
genre_scores.to_csv('Datasets/goodreads_rnn_genre_scores.csv', index=False)