In [57]:
# Import PyTorch
import torch
# Get pandas for data manipulation
import pandas as pd
# Import nltk for text processing
import nltk
# Import os for file manipulation
import os
# Import train_test_split from sklearn
from sklearn.model_selection import train_test_split
# Import LabelEncoder from sklearn for encoding labels
from sklearn.preprocessing import LabelEncoder

In [58]:
# Set device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [59]:
# Load the data from Datasets/goodreads_data_onehot_genres.csv
data = pd.read_csv('Datasets/goodreads_data_onehot_genres.csv')
# Show the first few rows of the data
print(data.head())
print(data.shape)

                                                Book  \
0                              To Kill a Mockingbird   
1  Harry Potter and the Philosopher’s Stone (Harr...   
2                                Pride and Prejudice   
3                          The Diary of a Young Girl   
4                                        Animal Farm   

                                         Description  Fiction  Nonfiction  \
0  The unforgettable novel of a childhood in a sl...        1           0   
1  Harry Potter thinks he is an ordinary boy - un...        1           0   
2  Since its immediate success in 1813, Pride and...        1           0   
3  Discovered in the attic in which she spent the...        0           1   
4  Librarian's note: There is an Alternate Cover ...        1           0   

   Fantasy  Adult  Classics  Historical  Roman  Literature  ...  Audiobook  \
0        0      1         1           1      0           1  ...          0   
1        1      1         1           0     

In [60]:
# Create a dataframe with the plot column tokenized and lowercased
tokenizeddf = data.copy()
tokenizeddf['Description'] = tokenizeddf['Description'].apply(lambda x: nltk.word_tokenize(x.lower()))

In [61]:
# Get the length of the longest plot
maxlen = tokenizeddf['Description'].apply(len).max()

# Get the set of all words in the plot column
wordset = set()
for plot in tokenizeddf['Description']:
    wordset.update(plot)
# Get the number of unique words
numwords = len(wordset)
print("Number of unique words: ", numwords)

# Create a dictionary that maps words to integers
word2int = {word: i for i, word in enumerate(wordset)}

# Function to convert a list of words to a list of integers
def words2ints(words):
    # Run through each word in the list
    ints = []
    for word in words:
        # If the word is in the dictionary, add the integer to the list
        if word in word2int:
            ints.append(word2int[word])
    return ints

Number of unique words:  83267


In [62]:
# Convert the Description column to a list of integers
tokenizeddf['Description'] = tokenizeddf['Description'].apply(words2ints)

# Pad the sequences to the maximum length
def pad_description(description):
    return description + [0] * (maxlen - len(description))

tokenizeddf['Description'] = tokenizeddf['Description'].apply(pad_description)

In [63]:
# Create a label encoder
le = LabelEncoder()

In [64]:
# Dataset class for genres
class GenreDataset(torch.utils.data.Dataset):
    def __init__(self, data, genre):
        self.plot = data["Description"].values
        self.genre = data[genre].values

    def __len__(self):
        return len(self.plot)
    
    def __getitem__(self, i):
        plot = self.plot[i]
        genre = self.genre[i]
        return torch.tensor(plot, dtype=torch.long), torch.tensor(genre, dtype=torch.float)
    
# Dataloader class for genres
class GenreDataLoader(torch.utils.data.DataLoader):
    def __init__(self, dataset, batch_size=32, shuffle=True):
        super().__init__(dataset, batch_size=batch_size, shuffle=shuffle)

# RNN class for genres
# Create the RNN to classify the plots as action or not
class GenreRNN(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size):
        super(GenreRNN, self).__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.rnn = torch.nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, output_size)
        self.sigmoid = torch.nn.Sigmoid()
        self.hidden_dim = hidden_dim
    
    def forward(self, x):
        x = self.embedding(x)
        h0 = torch.zeros(1, x.size(0), self.hidden_dim).to(x.device)
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])        
        return out

In [65]:
# Function to train a model
def train_genre_rnn(genre, epochs, embed, hidden, output):
    genremodel = GenreRNN(numwords, embed, hidden, output)
    # Extract the relevant columns from the dataframe
    genredf = tokenizeddf[['Book', 'Description', genre]]
    # Encode the genre column
    genredf[genre] = le.fit_transform(genredf[genre])
    # Train test split the data
    # We won't use the test data in this function
    genretrain, genretest = train_test_split(genredf, test_size=0.2, random_state=42)
    # Create a GenreDataset object
    genretraindataset = GenreDataset(genretrain, genre)
    # Create a DataLoader
    genretrainloader = GenreDataLoader(genretraindataset, batch_size=32, shuffle=True)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(genremodel.parameters(), lr=0.001)

    # Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Send the model to the device
    genremodel.to(device)

    # Train the model
    num_epochs = epochs
    for epoch in range(num_epochs):
        epoch_loss = 0
        genremodel.train()
        for plots, genres in genretrainloader:
            # Send the data to the device
            plots = plots.to(device)
            genres = genres.to(device)
            outputs = genremodel(plots)
            loss = criterion(outputs, genres.long())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

    # Return the model
    return genremodel

In [66]:
# Save a model
def save_model(model, genre):
    if not os.path.exists("models/rnn"):
        os.makedirs("models/rnn")
    torch.save(model.state_dict(), f"models/rnn/{genre}modelgoodreads.pth")

In [67]:
# Get all the genres
genres = data.columns[2:]
print(genres)

Index(['Fiction', 'Nonfiction', 'Fantasy', 'Adult', 'Classics', 'Historical',
       'Roman', 'Literature', 'Romance', 'Young Adult', 'Historical Fiction',
       'Science', 'Mystery', 'Contemporary', 'Novels', 'Audiobook', 'Thriller',
       'Science Fiction', 'History', 'Adventure', 'Philosophy', 'Biography',
       'Crime', 'Self Help', 'Psychology'],
      dtype='object')


In [68]:
# Train a model for every genre
for genre in genres:
    print(f"Training model for {genre}")
    model = train_genre_rnn(genre, 10, 128, 128, 2)
    save_model(model, genre)

Training model for Fiction


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.5811673998832703
Epoch 2/10, Loss: 0.6149108409881592
Epoch 3/10, Loss: 0.7427389025688171
Epoch 4/10, Loss: 0.7472535967826843
Epoch 5/10, Loss: 0.5904498100280762
Epoch 6/10, Loss: 0.6463415026664734
Epoch 7/10, Loss: 0.711590051651001
Epoch 8/10, Loss: 0.556352436542511
Epoch 9/10, Loss: 0.6196823120117188
Epoch 10/10, Loss: 0.5471309423446655
Training model for Nonfiction


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.7153573036193848
Epoch 2/10, Loss: 0.523811936378479
Epoch 3/10, Loss: 0.7292189598083496
Epoch 4/10, Loss: 0.6164149045944214
Epoch 5/10, Loss: 0.6169933080673218
Epoch 6/10, Loss: 0.5301514267921448
Epoch 7/10, Loss: 0.525027871131897
Epoch 8/10, Loss: 0.473521888256073
Epoch 9/10, Loss: 0.6822282671928406
Epoch 10/10, Loss: 0.5750346183776855
Training model for Fantasy


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.5319236516952515
Epoch 2/10, Loss: 0.5881352424621582
Epoch 3/10, Loss: 0.5247480273246765
Epoch 4/10, Loss: 0.5376967787742615
Epoch 5/10, Loss: 0.343781441450119
Epoch 6/10, Loss: 0.5786932706832886
Epoch 7/10, Loss: 0.5433744788169861
Epoch 8/10, Loss: 0.6641574501991272
Epoch 9/10, Loss: 0.6203317046165466
Epoch 10/10, Loss: 0.45549219846725464
Training model for Adult


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.46431484818458557
Epoch 2/10, Loss: 0.6236187219619751
Epoch 3/10, Loss: 0.6964054703712463
Epoch 4/10, Loss: 0.6973980069160461
Epoch 5/10, Loss: 0.6151685118675232
Epoch 6/10, Loss: 0.42815664410591125
Epoch 7/10, Loss: 0.6151887774467468
Epoch 8/10, Loss: 0.800076425075531
Epoch 9/10, Loss: 0.5862473845481873
Epoch 10/10, Loss: 0.5512269735336304
Training model for Classics


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.7129296064376831
Epoch 2/10, Loss: 0.5739978551864624
Epoch 3/10, Loss: 0.4724239408969879
Epoch 4/10, Loss: 0.7863049507141113
Epoch 5/10, Loss: 0.6604352593421936
Epoch 6/10, Loss: 0.7267174124717712
Epoch 7/10, Loss: 0.5830719470977783
Epoch 8/10, Loss: 0.4899440109729767
Epoch 9/10, Loss: 0.6431561708450317
Epoch 10/10, Loss: 0.5260992646217346
Training model for Historical


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.4638904631137848
Epoch 2/10, Loss: 0.34241464734077454
Epoch 3/10, Loss: 0.5289293527603149
Epoch 4/10, Loss: 0.5749608278274536
Epoch 5/10, Loss: 0.650684654712677
Epoch 6/10, Loss: 0.46391379833221436
Epoch 7/10, Loss: 0.28295618295669556
Epoch 8/10, Loss: 0.5413981676101685
Epoch 9/10, Loss: 0.4637724757194519
Epoch 10/10, Loss: 0.28448715806007385
Training model for Roman


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.40064236521720886
Epoch 2/10, Loss: 0.5372135043144226
Epoch 3/10, Loss: 0.5285096168518066
Epoch 4/10, Loss: 0.5922372341156006
Epoch 5/10, Loss: 0.4234701097011566
Epoch 6/10, Loss: 0.42577213048934937
Epoch 7/10, Loss: 0.4622529447078705
Epoch 8/10, Loss: 0.5275209546089172
Epoch 9/10, Loss: 0.62665855884552
Epoch 10/10, Loss: 0.6846196055412292
Training model for Literature


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.5990660190582275
Epoch 2/10, Loss: 0.4038030505180359
Epoch 3/10, Loss: 0.46740835905075073
Epoch 4/10, Loss: 0.7288740277290344
Epoch 5/10, Loss: 0.6962839365005493
Epoch 6/10, Loss: 0.4631202816963196
Epoch 7/10, Loss: 0.46487483382225037
Epoch 8/10, Loss: 0.6561890244483948
Epoch 9/10, Loss: 0.7732166647911072
Epoch 10/10, Loss: 0.6746751666069031
Training model for Romance


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.46986284852027893
Epoch 2/10, Loss: 0.4745616614818573
Epoch 3/10, Loss: 0.4621402621269226
Epoch 4/10, Loss: 0.5822260975837708
Epoch 5/10, Loss: 0.4243162274360657
Epoch 6/10, Loss: 0.34603410959243774
Epoch 7/10, Loss: 0.531764805316925
Epoch 8/10, Loss: 0.46253588795661926
Epoch 9/10, Loss: 0.5256127119064331
Epoch 10/10, Loss: 0.7046588659286499
Training model for Young Adult


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.3941115140914917
Epoch 2/10, Loss: 0.46212077140808105
Epoch 3/10, Loss: 0.6118588447570801
Epoch 4/10, Loss: 0.39754223823547363
Epoch 5/10, Loss: 0.4632491171360016
Epoch 6/10, Loss: 0.39722493290901184
Epoch 7/10, Loss: 0.46203699707984924
Epoch 8/10, Loss: 0.525845468044281
Epoch 9/10, Loss: 0.5237642526626587
Epoch 10/10, Loss: 0.40204355120658875
Training model for Historical Fiction


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.4667236804962158
Epoch 2/10, Loss: 0.4622923731803894
Epoch 3/10, Loss: 0.39318034052848816
Epoch 4/10, Loss: 0.46204739809036255
Epoch 5/10, Loss: 0.4010013937950134
Epoch 6/10, Loss: 0.46436551213264465
Epoch 7/10, Loss: 0.46231162548065186
Epoch 8/10, Loss: 0.30741509795188904
Epoch 9/10, Loss: 0.577991783618927
Epoch 10/10, Loss: 0.464781790971756
Training model for Science


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.539563775062561
Epoch 2/10, Loss: 0.39638710021972656
Epoch 3/10, Loss: 0.5380809903144836
Epoch 4/10, Loss: 0.39610764384269714
Epoch 5/10, Loss: 0.39953771233558655
Epoch 6/10, Loss: 0.3909679055213928
Epoch 7/10, Loss: 0.6328111290931702
Epoch 8/10, Loss: 0.5475412607192993
Epoch 9/10, Loss: 0.463397353887558
Epoch 10/10, Loss: 0.39892566204071045
Training model for Mystery


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.46507540345191956
Epoch 2/10, Loss: 0.2183750569820404
Epoch 3/10, Loss: 0.4642665982246399
Epoch 4/10, Loss: 0.38783496618270874
Epoch 5/10, Loss: 0.6423743963241577
Epoch 6/10, Loss: 0.5255969762802124
Epoch 7/10, Loss: 0.5955123901367188
Epoch 8/10, Loss: 0.46241721510887146
Epoch 9/10, Loss: 0.39650672674179077
Epoch 10/10, Loss: 0.4626135230064392
Training model for Contemporary


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.589992344379425
Epoch 2/10, Loss: 0.305400550365448
Epoch 3/10, Loss: 0.3901953101158142
Epoch 4/10, Loss: 0.7219871282577515
Epoch 5/10, Loss: 0.2428559809923172
Epoch 6/10, Loss: 0.46643131971359253
Epoch 7/10, Loss: 0.19528748095035553
Epoch 8/10, Loss: 0.389587938785553
Epoch 9/10, Loss: 0.6002389788627625
Epoch 10/10, Loss: 0.23789891600608826
Training model for Novels


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.31014859676361084
Epoch 2/10, Loss: 0.21470578014850616
Epoch 3/10, Loss: 0.38789311051368713
Epoch 4/10, Loss: 0.39506953954696655
Epoch 5/10, Loss: 0.3129497468471527
Epoch 6/10, Loss: 0.390417218208313
Epoch 7/10, Loss: 0.32773828506469727
Epoch 8/10, Loss: 0.3886682987213135
Epoch 9/10, Loss: 0.21139337122440338
Epoch 10/10, Loss: 0.26874732971191406
Training model for Audiobook


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.5501933693885803
Epoch 2/10, Loss: 0.30650585889816284
Epoch 3/10, Loss: 0.5504992604255676
Epoch 4/10, Loss: 0.5655574798583984
Epoch 5/10, Loss: 0.4633801579475403
Epoch 6/10, Loss: 0.3880937695503235
Epoch 7/10, Loss: 0.4703161120414734
Epoch 8/10, Loss: 0.24392497539520264
Epoch 9/10, Loss: 0.2494899034500122
Epoch 10/10, Loss: 0.38739320635795593
Training model for Thriller


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.38735684752464294
Epoch 2/10, Loss: 0.3116750419139862
Epoch 3/10, Loss: 0.5645803213119507
Epoch 4/10, Loss: 0.3873264789581299
Epoch 5/10, Loss: 0.4821839928627014
Epoch 6/10, Loss: 0.47854161262512207
Epoch 7/10, Loss: 0.8731848001480103
Epoch 8/10, Loss: 0.5416310429573059
Epoch 9/10, Loss: 0.3060191571712494
Epoch 10/10, Loss: 0.30677881836891174
Training model for Science Fiction


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.4815935790538788
Epoch 2/10, Loss: 0.5697762966156006
Epoch 3/10, Loss: 0.3981761038303375
Epoch 4/10, Loss: 0.6527815461158752
Epoch 5/10, Loss: 0.46462589502334595
Epoch 6/10, Loss: 0.12585733830928802
Epoch 7/10, Loss: 0.12658001482486725
Epoch 8/10, Loss: 0.1327376514673233
Epoch 9/10, Loss: 0.2963642477989197
Epoch 10/10, Loss: 0.26536062359809875
Training model for History


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.39620164036750793
Epoch 2/10, Loss: 0.3873351514339447
Epoch 3/10, Loss: 0.3050714135169983
Epoch 4/10, Loss: 0.47657838463783264
Epoch 5/10, Loss: 0.2954410910606384
Epoch 6/10, Loss: 0.30174216628074646
Epoch 7/10, Loss: 0.1951168328523636
Epoch 8/10, Loss: 0.10456773638725281
Epoch 9/10, Loss: 0.20882584154605865
Epoch 10/10, Loss: 0.3985051214694977
Training model for Adventure


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.2956039309501648
Epoch 2/10, Loss: 0.18149933218955994
Epoch 3/10, Loss: 0.3884679079055786
Epoch 4/10, Loss: 0.42873457074165344
Epoch 5/10, Loss: 0.19800910353660583
Epoch 6/10, Loss: 0.2994796633720398
Epoch 7/10, Loss: 0.12368037551641464
Epoch 8/10, Loss: 0.3936085104942322
Epoch 9/10, Loss: 0.2961726188659668
Epoch 10/10, Loss: 0.19629134237766266
Training model for Philosophy


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.394802063703537
Epoch 2/10, Loss: 0.1931035816669464
Epoch 3/10, Loss: 0.5021136403083801
Epoch 4/10, Loss: 0.39046943187713623
Epoch 5/10, Loss: 0.39804205298423767
Epoch 6/10, Loss: 0.0852210596203804
Epoch 7/10, Loss: 0.3899667263031006
Epoch 8/10, Loss: 0.39686551690101624
Epoch 9/10, Loss: 0.2069048285484314
Epoch 10/10, Loss: 0.39943793416023254
Training model for Biography


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.3911270797252655
Epoch 2/10, Loss: 0.4023967683315277
Epoch 3/10, Loss: 0.07214104384183884
Epoch 4/10, Loss: 0.18236276507377625
Epoch 5/10, Loss: 0.3880949020385742
Epoch 6/10, Loss: 0.40126627683639526
Epoch 7/10, Loss: 0.17910289764404297
Epoch 8/10, Loss: 0.2954467535018921
Epoch 9/10, Loss: 0.29687488079071045
Epoch 10/10, Loss: 0.21963787078857422
Training model for Crime


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.19572974741458893
Epoch 2/10, Loss: 0.07818961888551712
Epoch 3/10, Loss: 0.389210045337677
Epoch 4/10, Loss: 0.29707080125808716
Epoch 5/10, Loss: 0.11028219014406204
Epoch 6/10, Loss: 0.6707258224487305
Epoch 7/10, Loss: 0.29551538825035095
Epoch 8/10, Loss: 0.29714640974998474
Epoch 9/10, Loss: 0.15451140701770782
Epoch 10/10, Loss: 0.22032871842384338
Training model for Self Help


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.09086693823337555
Epoch 2/10, Loss: 0.2956627905368805
Epoch 3/10, Loss: 0.20036062598228455
Epoch 4/10, Loss: 0.4056042432785034
Epoch 5/10, Loss: 0.21339786052703857
Epoch 6/10, Loss: 0.4038906395435333
Epoch 7/10, Loss: 0.4101579189300537
Epoch 8/10, Loss: 0.18395909667015076
Epoch 9/10, Loss: 0.3919121325016022
Epoch 10/10, Loss: 0.29845568537712097
Training model for Psychology


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.19425608217716217
Epoch 2/10, Loss: 0.4821975827217102
Epoch 3/10, Loss: 0.5617338418960571
Epoch 4/10, Loss: 0.5152472257614136
Epoch 5/10, Loss: 0.08311616629362106
Epoch 6/10, Loss: 0.18667761981487274
Epoch 7/10, Loss: 0.18424713611602783
Epoch 8/10, Loss: 0.2955438494682312
Epoch 9/10, Loss: 0.1929691582918167
Epoch 10/10, Loss: 0.2956337630748749


In [74]:
# Test the models
# Returns the accuracy, precision, recall, and F1 score of the model on the test data
def test_models(genres):
    # Create a dictionary to store the metrics
    results = {}
    # Run through each genre
    for genre in genres:
        # Load the model from the file (if it exists)
        if os.path.exists(f"models/rnn/{genre}modelgoodreads.pth"):
            model = GenreRNN(numwords, 128, 128, 2)
            model.load_state_dict(torch.load(f"models/rnn/{genre}modelgoodreads.pth"))
            model.eval()
            # Extract the relevant columns from the dataframe
            genredf = tokenizeddf[['Book', 'Description', genre]]
            # Encode the genre column
            genredf[genre] = le.fit_transform(genredf[genre])
            # Train test split the data
            genretrain, genretest = train_test_split(genredf, test_size=0.3, random_state=42)
            genretestdataset = GenreDataset(genretest, genre)
            genretestloader = GenreDataLoader(genretestdataset, batch_size=32, shuffle=True)
            # Set the device
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            # Send the model to the device
            model.to(device)
            # Initialize the metrics
            tp = 0
            tn = 0
            fp = 0
            fn = 0
            # Turn off gradients
            with torch.no_grad():
                for plots, genres in genretestloader:
                    plots = plots.to(device)
                    genres = genres.to(device)
                    # Get the outputs
                    outputs = model(plots)
                    _, preds = torch.max(outputs, 1)
                    # Increment the metrics
                    newtp = torch.sum((preds == 1) & (genres == 1)).item()
                    newtn = torch.sum((preds == 0) & (genres == 0)).item()
                    newfp = torch.sum((preds == 1) & (genres == 0)).item()
                    newfn = torch.sum((preds == 0) & (genres == 1)).item()
                    tp += newtp
                    tn += newtn
                    fp += newfp
                    fn += newfn
            # Calculate the metrics
            accuracy = (tp + tn) / (tp + tn + fp + fn)
            precision = tp / (tp + fp) if tp + fp != 0 else 0
            recall = tp / (tp + fn) if tp + fn != 0 else 0
            f1 = 2 * precision * recall / (precision + recall) if precision + recall != 0 else 0                    
            results[genre] = { "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1": f1 }
    return results

In [75]:
# Write the results to a file
results = test_models(genres)
with open("goodreads_rnn_results.txt", "w") as f:
    for genre, metrics in results.items():
        f.write(f"{genre}:\n")
        for metric, value in metrics.items():
            f.write(f"{metric}: {value}\n")
        f.write("\n")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])
A value is trying to be set on a copy of a slice from a

In [None]:
# Function to get the genre scores for a plot
# Returns a dictionary with the genre as the key and the score (0-1) as the value
def predict_genres(description):
    # Tokenize and pad the plot
    description = nltk.word_tokenize(description.lower())
    description = words2ints(description)
    description = pad_description(description)
    description = torch.tensor(description, dtype=torch.long).unsqueeze(0)
    # Create a dictionary to store the scores
    scores = {}
    # For each genre, load the model and get the score
    for genre in genres:
        # Check if the model exists
        if not os.path.exists(f"models/rnn/{genre}modelgoodreads.pth"):
            print(f"Model for {genre} does not exist")
            continue
        print(f"Predicting {genre}")
        model = GenreRNN(numwords, 128, 128, 2)
        model.load_state_dict(torch.load(f"models/rnn/{genre}modelgoodreads.pth"))
        model.eval()
        with torch.no_grad():
            # Get the output of the model
            output = model(description)
            # Convert the output to a probability
            prob = torch.nn.functional.softmax(output, dim=1)
            # Get the probability of the plot being in the genre
            score = prob[0][1].item()
            # Add the score to the dictionary
            scores[genre] = score
    return scores