In [76]:
# Import PyTorch
import torch
# Get pandas for data manipulation
import pandas as pd
# Import nltk for text processing
import nltk
# Import os for file manipulation
import os
# Import train_test_split from sklearn
from sklearn.model_selection import train_test_split
# Import LabelEncoder from sklearn for encoding labels
from sklearn.preprocessing import LabelEncoder

In [77]:
# Set device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [78]:
# Load the data from Datasets/goodreads_data_onehot_genres.csv
data = pd.read_csv('Datasets/goodreads_data_onehot_genres.csv')
# Show the first few rows of the data
print(data.head())
print(data.shape)

                                                Book  \
0                              To Kill a Mockingbird   
1  Harry Potter and the Philosopher’s Stone (Harr...   
2                                Pride and Prejudice   
3                          The Diary of a Young Girl   
4                                  The Little Prince   

                                         Description  Fantasy  Adult  \
0  The unforgettable novel of a childhood in a sl...        0      1   
1  Harry Potter thinks he is an ordinary boy - un...        1      1   
2  Since its immediate success in 1813, Pride and...        0      0   
3  Discovered in the attic in which she spent the...        0      0   
4  A pilot stranded in the desert awakes one morn...        1      1   

   Historical  Roman  Romance  Young Adult  Historical Fiction  Science  ...  \
0           1      0        0            1                   1        0  ...   
1           0      0        0            1                   0        

In [79]:
# Create a dataframe with the plot column tokenized and lowercased
tokenizeddf = data.copy()
tokenizeddf['Description'] = tokenizeddf['Description'].apply(lambda x: nltk.word_tokenize(x.lower()))

In [80]:
# Get the length of the longest plot
maxlen = tokenizeddf['Description'].apply(len).max()

# Get the set of all words in the plot column
wordset = set()
for plot in tokenizeddf['Description']:
    wordset.update(plot)
# Get the number of unique words
numwords = len(wordset)
print("Number of unique words: ", numwords)

# Create a dictionary that maps words to integers
word2int = {word: i for i, word in enumerate(wordset)}

# Function to convert a list of words to a list of integers
def words2ints(words):
    # Run through each word in the list
    ints = []
    for word in words:
        # If the word is in the dictionary, add the integer to the list
        if word in word2int:
            ints.append(word2int[word])
    return ints

Number of unique words:  80123


In [81]:
# Convert the Description column to a list of integers
tokenizeddf['Description'] = tokenizeddf['Description'].apply(words2ints)

# Pad the sequences to the maximum length
def pad_description(description):
    return description + [0] * (maxlen - len(description))

tokenizeddf['Description'] = tokenizeddf['Description'].apply(pad_description)

In [82]:
# Create a label encoder
le = LabelEncoder()

In [83]:
# Dataset class for genres
class GenreDataset(torch.utils.data.Dataset):
    def __init__(self, data, genre):
        self.plot = data["Description"].values
        self.genre = data[genre].values

    def __len__(self):
        return len(self.plot)
    
    def __getitem__(self, i):
        plot = self.plot[i]
        genre = self.genre[i]
        return torch.tensor(plot, dtype=torch.long), torch.tensor(genre, dtype=torch.float)
    
# Dataloader class for genres
class GenreDataLoader(torch.utils.data.DataLoader):
    def __init__(self, dataset, batch_size=32, shuffle=True):
        super().__init__(dataset, batch_size=batch_size, shuffle=shuffle)

# RNN class for genres
# Create the RNN to classify the plots as action or not
class GenreRNN(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size):
        super(GenreRNN, self).__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.rnn = torch.nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, output_size)
        self.sigmoid = torch.nn.Sigmoid()
        self.hidden_dim = hidden_dim
    
    def forward(self, x):
        x = self.embedding(x)
        h0 = torch.zeros(1, x.size(0), self.hidden_dim).to(x.device)
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])        
        return out

In [84]:
# Function to train a model
def train_genre_rnn(genre, epochs, embed, hidden, output):
    genremodel = GenreRNN(numwords, embed, hidden, output)
    # Extract the relevant columns from the dataframe
    genredf = tokenizeddf[['Book', 'Description', genre]]
    # Encode the genre column
    genredf[genre] = le.fit_transform(genredf[genre])
    # Train test split the data
    # We won't use the test data in this function
    genretrain, genretest = train_test_split(genredf, test_size=0.2, random_state=42)
    # Create a GenreDataset object
    genretraindataset = GenreDataset(genretrain, genre)
    # Create a DataLoader
    genretrainloader = GenreDataLoader(genretraindataset, batch_size=32, shuffle=True)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(genremodel.parameters(), lr=0.001)

    # Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Send the model to the device
    genremodel.to(device)

    # Train the model
    num_epochs = epochs
    for epoch in range(num_epochs):
        epoch_loss = 0
        genremodel.train()
        for plots, genres in genretrainloader:
            # Send the data to the device
            plots = plots.to(device)
            genres = genres.to(device)
            outputs = genremodel(plots)
            loss = criterion(outputs, genres.long())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

    # Return the model
    return genremodel

In [85]:
# Save a model
def save_model(model, genre):
    if not os.path.exists("models/rnn"):
        os.makedirs("models/rnn")
    torch.save(model.state_dict(), f"models/rnn/{genre}modelgoodreads.pth")

In [86]:
# Get all the genres
genres = data.columns[2:]
print(genres)

Index(['Fantasy', 'Adult', 'Historical', 'Roman', 'Romance', 'Young Adult',
       'Historical Fiction', 'Science', 'Mystery', 'Contemporary', 'Thriller',
       'Science Fiction', 'History', 'Adventure', 'Philosophy', 'Biography',
       'Crime', 'Self Help', 'Psychology', 'Mystery Thriller', 'Memoir',
       'Childrens', 'Humor', 'Suspense', 'Horror'],
      dtype='object')


In [87]:
# Train a model for every genre
for genre in genres:
    print(f"Training model for {genre}")
    model = train_genre_rnn(genre, 10, 128, 128, 2)
    save_model(model, genre)

Training model for Fantasy


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.7437699437141418
Epoch 2/10, Loss: 0.5633482336997986
Epoch 3/10, Loss: 0.5743919014930725
Epoch 4/10, Loss: 0.5628390312194824
Epoch 5/10, Loss: 0.46784791350364685
Epoch 6/10, Loss: 0.49632784724235535
Epoch 7/10, Loss: 0.4817148745059967
Epoch 8/10, Loss: 0.4060753583908081
Epoch 9/10, Loss: 0.4597040116786957
Epoch 10/10, Loss: 0.46697500348091125
Training model for Adult


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.5656182169914246
Epoch 2/10, Loss: 0.5623360276222229
Epoch 3/10, Loss: 0.6528646945953369
Epoch 4/10, Loss: 0.7647673487663269
Epoch 5/10, Loss: 0.6566473841667175
Epoch 6/10, Loss: 0.5830168128013611
Epoch 7/10, Loss: 0.5628566741943359
Epoch 8/10, Loss: 0.6628420948982239
Epoch 9/10, Loss: 0.7185294032096863
Epoch 10/10, Loss: 0.5623360276222229
Training model for Historical


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.9488518238067627
Epoch 2/10, Loss: 0.5649139285087585
Epoch 3/10, Loss: 1.0035985708236694
Epoch 4/10, Loss: 0.5656024217605591
Epoch 5/10, Loss: 0.45083925127983093
Epoch 6/10, Loss: 0.6753036379814148
Epoch 7/10, Loss: 0.697521448135376
Epoch 8/10, Loss: 0.5660068392753601
Epoch 9/10, Loss: 0.5664731860160828
Epoch 10/10, Loss: 0.261932373046875
Training model for Roman


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.3486463725566864
Epoch 2/10, Loss: 0.37125441431999207
Epoch 3/10, Loss: 0.36592137813568115
Epoch 4/10, Loss: 0.64484041929245
Epoch 5/10, Loss: 0.7889080047607422
Epoch 6/10, Loss: 0.8501541614532471
Epoch 7/10, Loss: 0.5866159200668335
Epoch 8/10, Loss: 0.4505630433559418
Epoch 9/10, Loss: 0.19036400318145752
Epoch 10/10, Loss: 0.3375105559825897
Training model for Romance


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.7888787388801575
Epoch 2/10, Loss: 0.21952082216739655
Epoch 3/10, Loss: 0.7429168224334717
Epoch 4/10, Loss: 0.19862933456897736
Epoch 5/10, Loss: 0.5624386668205261
Epoch 6/10, Loss: 0.5704805850982666
Epoch 7/10, Loss: 0.38798654079437256
Epoch 8/10, Loss: 0.6906601786613464
Epoch 9/10, Loss: 0.6863139271736145
Epoch 10/10, Loss: 0.7539458870887756
Training model for Young Adult


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.4614607095718384
Epoch 2/10, Loss: 0.25540900230407715
Epoch 3/10, Loss: 0.5965709686279297
Epoch 4/10, Loss: 0.29943251609802246
Epoch 5/10, Loss: 0.45069316029548645
Epoch 6/10, Loss: 0.461218923330307
Epoch 7/10, Loss: 0.7829858660697937
Epoch 8/10, Loss: 0.6925715804100037
Epoch 9/10, Loss: 0.19148968160152435
Epoch 10/10, Loss: 0.5747390985488892
Training model for Historical Fiction


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.4536530077457428
Epoch 2/10, Loss: 0.45168164372444153
Epoch 3/10, Loss: 0.5863354206085205
Epoch 4/10, Loss: 0.6985098719596863
Epoch 5/10, Loss: 0.5665558576583862
Epoch 6/10, Loss: 0.5740827918052673
Epoch 7/10, Loss: 0.8036473393440247
Epoch 8/10, Loss: 0.45674002170562744
Epoch 9/10, Loss: 0.5811564922332764
Epoch 10/10, Loss: 0.45056164264678955
Training model for Science


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.5812374353408813
Epoch 2/10, Loss: 0.7590122222900391
Epoch 3/10, Loss: 0.6740452647209167
Epoch 4/10, Loss: 0.34440168738365173
Epoch 5/10, Loss: 0.5916896462440491
Epoch 6/10, Loss: 0.3285035490989685
Epoch 7/10, Loss: 0.19198335707187653
Epoch 8/10, Loss: 0.45068037509918213
Epoch 9/10, Loss: 0.3433469831943512
Epoch 10/10, Loss: 0.3288854658603668
Training model for Mystery


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.450605183839798
Epoch 2/10, Loss: 0.3113481104373932
Epoch 3/10, Loss: 0.4645291268825531
Epoch 4/10, Loss: 0.19122864305973053
Epoch 5/10, Loss: 0.4505629241466522
Epoch 6/10, Loss: 0.3177720606327057
Epoch 7/10, Loss: 0.29458561539649963
Epoch 8/10, Loss: 0.3347831666469574
Epoch 9/10, Loss: 0.46427640318870544
Epoch 10/10, Loss: 0.45604297518730164
Training model for Contemporary


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.4537210762500763
Epoch 2/10, Loss: 0.1826978176832199
Epoch 3/10, Loss: 0.5752328038215637
Epoch 4/10, Loss: 0.45103922486305237
Epoch 5/10, Loss: 0.3050188422203064
Epoch 6/10, Loss: 0.4506066143512726
Epoch 7/10, Loss: 0.4539966285228729
Epoch 8/10, Loss: 0.4517108201980591
Epoch 9/10, Loss: 0.7689967155456543
Epoch 10/10, Loss: 0.20853041112422943
Training model for Thriller


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.5973394513130188
Epoch 2/10, Loss: 0.12749122083187103
Epoch 3/10, Loss: 0.29432204365730286
Epoch 4/10, Loss: 0.6128364205360413
Epoch 5/10, Loss: 0.33862611651420593
Epoch 6/10, Loss: 0.3247836232185364
Epoch 7/10, Loss: 0.5767145156860352
Epoch 8/10, Loss: 0.45387542247772217
Epoch 9/10, Loss: 0.29084286093711853
Epoch 10/10, Loss: 0.6218640208244324
Training model for Science Fiction


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.8220483660697937
Epoch 2/10, Loss: 0.456021785736084
Epoch 3/10, Loss: 0.3008001744747162
Epoch 4/10, Loss: 0.5871404409408569
Epoch 5/10, Loss: 0.1647920310497284
Epoch 6/10, Loss: 0.4562464654445648
Epoch 7/10, Loss: 0.15740272402763367
Epoch 8/10, Loss: 0.13827970623970032
Epoch 9/10, Loss: 0.6004595756530762
Epoch 10/10, Loss: 0.6598131656646729
Training model for History


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.644417405128479
Epoch 2/10, Loss: 0.4588932991027832
Epoch 3/10, Loss: 0.2887638807296753
Epoch 4/10, Loss: 0.4521556794643402
Epoch 5/10, Loss: 0.4539482593536377
Epoch 6/10, Loss: 0.3029043972492218
Epoch 7/10, Loss: 0.46066340804100037
Epoch 8/10, Loss: 0.3078257441520691
Epoch 9/10, Loss: 0.2909901440143585
Epoch 10/10, Loss: 0.4790870249271393
Training model for Adventure


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.28686413168907166
Epoch 2/10, Loss: 0.08664066344499588
Epoch 3/10, Loss: 0.4704922139644623
Epoch 4/10, Loss: 0.28866684436798096
Epoch 5/10, Loss: 0.10636720061302185
Epoch 6/10, Loss: 0.09546517580747604
Epoch 7/10, Loss: 0.4762395918369293
Epoch 8/10, Loss: 0.2918124198913574
Epoch 9/10, Loss: 0.2870141863822937
Epoch 10/10, Loss: 0.2871178090572357
Training model for Philosophy


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.29009345173835754
Epoch 2/10, Loss: 0.6563534140586853
Epoch 3/10, Loss: 0.11385060101747513
Epoch 4/10, Loss: 0.5378159880638123
Epoch 5/10, Loss: 0.2871500253677368
Epoch 6/10, Loss: 0.2872980535030365
Epoch 7/10, Loss: 0.09752201288938522
Epoch 8/10, Loss: 0.49351105093955994
Epoch 9/10, Loss: 0.11021769791841507
Epoch 10/10, Loss: 0.28699055314064026
Training model for Biography


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.2879161536693573
Epoch 2/10, Loss: 0.2885831296443939
Epoch 3/10, Loss: 0.4704859256744385
Epoch 4/10, Loss: 0.45767977833747864
Epoch 5/10, Loss: 0.2873322665691376
Epoch 6/10, Loss: 0.2961301803588867
Epoch 7/10, Loss: 0.28777578473091125
Epoch 8/10, Loss: 0.9022341370582581
Epoch 9/10, Loss: 0.28805822134017944
Epoch 10/10, Loss: 0.28983232378959656
Training model for Crime


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.0960390642285347
Epoch 2/10, Loss: 0.28949132561683655
Epoch 3/10, Loss: 0.2980148494243622
Epoch 4/10, Loss: 0.28683707118034363
Epoch 5/10, Loss: 0.2870723307132721
Epoch 6/10, Loss: 0.2917417585849762
Epoch 7/10, Loss: 0.28821834921836853
Epoch 8/10, Loss: 0.2895754277706146
Epoch 9/10, Loss: 0.6750814914703369
Epoch 10/10, Loss: 0.13666166365146637
Training model for Self Help


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.4677560329437256
Epoch 2/10, Loss: 0.4690755605697632
Epoch 3/10, Loss: 0.7328245639801025
Epoch 4/10, Loss: 0.12122010439634323
Epoch 5/10, Loss: 0.07805191725492477
Epoch 6/10, Loss: 0.28729936480522156
Epoch 7/10, Loss: 0.49308300018310547
Epoch 8/10, Loss: 0.2876300513744354
Epoch 9/10, Loss: 0.47347140312194824
Epoch 10/10, Loss: 0.289237380027771
Training model for Psychology


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.4657033383846283
Epoch 2/10, Loss: 0.29959535598754883
Epoch 3/10, Loss: 0.47163328528404236
Epoch 4/10, Loss: 0.4673483669757843
Epoch 5/10, Loss: 0.11859937757253647
Epoch 6/10, Loss: 0.2891131341457367
Epoch 7/10, Loss: 0.2882661521434784
Epoch 8/10, Loss: 0.8213112950325012
Epoch 9/10, Loss: 0.2869211733341217
Epoch 10/10, Loss: 0.6175020337104797
Training model for Mystery Thriller


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.28737255930900574
Epoch 2/10, Loss: 0.7114090323448181
Epoch 3/10, Loss: 0.1593025028705597
Epoch 4/10, Loss: 0.07589877396821976
Epoch 5/10, Loss: 0.649954617023468
Epoch 6/10, Loss: 0.4768289625644684
Epoch 7/10, Loss: 0.28920769691467285
Epoch 8/10, Loss: 0.07698415964841843
Epoch 9/10, Loss: 0.28879863023757935
Epoch 10/10, Loss: 0.4626530706882477
Training model for Memoir


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.09455563873052597
Epoch 2/10, Loss: 0.8036200404167175
Epoch 3/10, Loss: 0.28825119137763977
Epoch 4/10, Loss: 0.29880234599113464
Epoch 5/10, Loss: 0.095863938331604
Epoch 6/10, Loss: 0.10075827687978745
Epoch 7/10, Loss: 0.28945305943489075
Epoch 8/10, Loss: 0.46917951107025146
Epoch 9/10, Loss: 0.08375835418701172
Epoch 10/10, Loss: 0.29128018021583557
Training model for Childrens


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.286919504404068
Epoch 2/10, Loss: 0.07436525076627731
Epoch 3/10, Loss: 0.09007657319307327
Epoch 4/10, Loss: 0.28697559237480164
Epoch 5/10, Loss: 0.08929025381803513
Epoch 6/10, Loss: 0.2874559462070465
Epoch 7/10, Loss: 0.2873389720916748
Epoch 8/10, Loss: 0.07370800524950027
Epoch 9/10, Loss: 0.09817212074995041
Epoch 10/10, Loss: 0.07367926090955734
Training model for Humor


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.2869810163974762
Epoch 2/10, Loss: 0.29498156905174255
Epoch 3/10, Loss: 0.28704628348350525
Epoch 4/10, Loss: 0.07158732414245605
Epoch 5/10, Loss: 0.2892436683177948
Epoch 6/10, Loss: 0.04852282628417015
Epoch 7/10, Loss: 0.5276529788970947
Epoch 8/10, Loss: 0.06073927879333496
Epoch 9/10, Loss: 0.06910865753889084
Epoch 10/10, Loss: 0.06283079087734222
Training model for Suspense


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.5846518874168396
Epoch 2/10, Loss: 0.06714039295911789
Epoch 3/10, Loss: 0.28839126229286194
Epoch 4/10, Loss: 0.06447945535182953
Epoch 5/10, Loss: 0.5008795261383057
Epoch 6/10, Loss: 0.08207924664020538
Epoch 7/10, Loss: 0.06598525494337082
Epoch 8/10, Loss: 0.0490243099629879
Epoch 9/10, Loss: 0.05649871006608009
Epoch 10/10, Loss: 0.543645441532135
Training model for Horror


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])


Epoch 1/10, Loss: 0.4795989990234375
Epoch 2/10, Loss: 0.06478843837976456
Epoch 3/10, Loss: 0.08125442266464233
Epoch 4/10, Loss: 0.5639021992683411
Epoch 5/10, Loss: 0.2948415279388428
Epoch 6/10, Loss: 0.06991255283355713
Epoch 7/10, Loss: 0.542995035648346
Epoch 8/10, Loss: 0.04564019665122032
Epoch 9/10, Loss: 0.06539171189069748
Epoch 10/10, Loss: 0.5132126808166504


In [88]:
# Test the models
# Returns the accuracy, precision, recall, and F1 score of the model on the test data
def test_models(genres):
    # Create a dictionary to store the metrics
    results = {}
    # Run through each genre
    for genre in genres:
        # Load the model from the file (if it exists)
        if os.path.exists(f"models/rnn/{genre}modelgoodreads.pth"):
            model = GenreRNN(numwords, 128, 128, 2)
            model.load_state_dict(torch.load(f"models/rnn/{genre}modelgoodreads.pth"))
            model.eval()
            # Extract the relevant columns from the dataframe
            genredf = tokenizeddf[['Book', 'Description', genre]]
            # Encode the genre column
            genredf[genre] = le.fit_transform(genredf[genre])
            # Train test split the data
            genretrain, genretest = train_test_split(genredf, test_size=0.3, random_state=42)
            genretestdataset = GenreDataset(genretest, genre)
            genretestloader = GenreDataLoader(genretestdataset, batch_size=32, shuffle=True)
            # Set the device
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            # Send the model to the device
            model.to(device)
            # Initialize the metrics
            tp = 0
            tn = 0
            fp = 0
            fn = 0
            # Turn off gradients
            with torch.no_grad():
                for plots, genres in genretestloader:
                    plots = plots.to(device)
                    genres = genres.to(device)
                    # Get the outputs
                    outputs = model(plots)
                    _, preds = torch.max(outputs, 1)
                    # Increment the metrics
                    newtp = torch.sum((preds == 1) & (genres == 1)).item()
                    newtn = torch.sum((preds == 0) & (genres == 0)).item()
                    newfp = torch.sum((preds == 1) & (genres == 0)).item()
                    newfn = torch.sum((preds == 0) & (genres == 1)).item()
                    tp += newtp
                    tn += newtn
                    fp += newfp
                    fn += newfn
            # Calculate the metrics
            accuracy = (tp + tn) / (tp + tn + fp + fn)
            precision = tp / (tp + fp) if tp + fp != 0 else 0
            recall = tp / (tp + fn) if tp + fn != 0 else 0
            f1 = 2 * precision * recall / (precision + recall) if precision + recall != 0 else 0                    
            results[genre] = { "Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1": f1 }
    return results

In [89]:
# Write the results to a file
results = test_models(genres)
with open("goodreads_rnn_results.txt", "w") as f:
    for genre, metrics in results.items():
        f.write(f"{genre}:\n")
        for metric, value in metrics.items():
            f.write(f"{metric}: {value}\n")
        f.write("\n")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genredf[genre] = le.fit_transform(genredf[genre])
A value is trying to be set on a copy of a slice from a

In [127]:
# Predict the probability that a plot is a certain genre
def score_genre(description,genre):
    # Get the tokenized version of the plot
    tokenplot = nltk.word_tokenize(description.lower())
    tokenplot = words2ints(tokenplot)
    tokenplot = pad_description(tokenplot)
    # Check if the model exists and load it
    if not os.path.exists(f"models/rnn/{genre}modelgoodreads.pth"):
        print("Model does not exist")
        return None
    model = GenreRNN(numwords, 128, 128, 2)
    model.load_state_dict(torch.load(f"models/rnn/{genre}modelgoodreads.pth"))
    model.eval()
    # Convert the plot to a tensor
    tokenplot = torch.tensor(tokenplot, dtype=torch.long).unsqueeze(0)
    # Get the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Send the plot and the model to the device
    tokenplot = tokenplot.to(device)
    model.to(device)
    # Get the output of the model
    output = model(tokenplot)
    # Turn the output into a probability that the plot is the genre
    prob = torch.nn.functional.softmax(output, dim=1)[:, 1]
    return prob.item()

# Test the function
print(score_genre(data['Description'][0], 'Fantasy'))

0.25060588121414185


In [124]:
# Function to get the genre score for a plot
def predict_genres(description, genre):
    # Tokenize and pad the plot
    description = nltk.word_tokenize(description.lower())
    description = words2ints(description)
    description = pad_description(description)
    description = torch.tensor(description, dtype=torch.long).unsqueeze(0)
    # Check if the model exists
    if not os.path.exists(f"models/rnn/{genre}modelgoodreads.pth"):
        print(f"Model for {genre} does not exist")
        return None
    #print(f"Predicting {genre}")
    # Load the model
    model = GenreRNN(numwords, 128, 128, 2)
    model.load_state_dict(torch.load(f"models/rnn/{genre}modelgoodreads.pth"))
    # Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Send the model to the device
    model.to(device)
    # Send the plot to the device
    description = description.to(device)
    # Turn off gradients
    with torch.no_grad():
        # Get the output
        output = model(description)
        _, preds = torch.max(output, 1)
        return preds.item()

In [125]:
# Create a new dataframe with the Book and Description columns
genre_scores = data[['Book', 'Description']]
# Add a new column for the genre scores
for genre in genres:
    genre_scores[genre] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_scores[genre] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_scores[genre] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_scores[genre] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the c

In [131]:
# Score each plot for each genre
for genre in genres:
    print(f"Scoring {genre}")
    genre_scores[genre] = genre_scores['Description'].apply(lambda x: score_genre(x, genre))

Scoring Fantasy
Scoring Adult
Scoring Historical
Scoring Roman
Scoring Romance
Scoring Young Adult
Scoring Historical Fiction
Scoring Science
Scoring Mystery
Scoring Contemporary
Scoring Thriller
Scoring Science Fiction
Scoring History
Scoring Adventure
Scoring Philosophy
Scoring Biography
Scoring Crime
Scoring Self Help
Scoring Psychology
Scoring Mystery Thriller
Scoring Memoir
Scoring Childrens
Scoring Humor
Scoring Suspense
Scoring Horror


In [132]:
# Save the dataframe to a CSV file
genre_scores.to_csv('Datasets/goodreads_rnn_genre_scores.csv', index=False)