# Using a neural network to test whether an author's name predicts a critical/commercial hit or flop


In [8]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import functions
from sklearn.preprocessing import LabelEncoder
%pip install torch
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, Dataset
import torch.optim as optim

Note: you may need to restart the kernel to use updated packages.


Checking the number of unique authors in the dataset. Credit to ChatGPT for showing how to do so:


In [4]:
# Load the book metadata, then report how many distinct author names it holds.
df_metadata = functions.get_data()
author_names = df_metadata['author_name']
num_authors = author_names.nunique()
print(num_authors)

12877


There are over 12,000 authors. Statistically, most authors have only one book, so I will check whether that is the case here. Credit to ChatGPT, which showed me how to code up a solution to this:

In [3]:
# Number of rows (books) per author name, summarised with descriptive statistics.
books_per_author = df_metadata['author_name'].value_counts()
books_per_author.describe()

count    12877.000000
mean         1.553157
std          1.800488
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max         76.000000
Name: count, dtype: float64

This shows that the majority of authors (at least 75%) have only one book to their name, making it likely that they are debut authors. At the other extreme, the maximum number of books a single author has on the Amazon storefront is 76. Because most authors appear only once in the data, the author name alone is not enough to determine a critical or commercial success. As such, my neural network will use the author name together with the context of this information to determine whether the general audience would support a given book both financially and critically.

## The Foundations of the Neural Network:

Credit to ChatGPT for showing me how to lay the groundwork for my neural network:

In [10]:

# 1️⃣ Define a custom dataset
class BooksDataset(Dataset):
    """Dataset yielding ``(author_id, numeric_features, label)`` triples.

    Parameters
    ----------
    authors : array-like of int
        Integer author IDs, one per sample; stored as ``torch.long`` so they
        can be used as embedding indices.
    features : array-like of float
        Numeric features, indexed by sample along the first axis; stored as
        ``torch.float32``.
    labels : array-like
        Target values, one per sample; stored as ``torch.float32``.

    Inputs may be lists, NumPy arrays or pandas Series/DataFrames.

    Raises
    ------
    ValueError
        If the three inputs do not contain the same number of samples.
    """

    def __init__(self, authors, features, labels):
        # Go through np.asarray() first: torch.tensor() walks a pandas object
        # via the sequence protocol, i.e. label-based ``obj[0]``, which raises
        # KeyError once the index is no longer 0..n-1 (the usual state after a
        # shuffled train/test split). np.asarray() drops the index and keeps
        # only the values.
        self.authors = torch.tensor(np.asarray(authors), dtype=torch.long)       # author IDs for embeddings
        self.features = torch.tensor(np.asarray(features), dtype=torch.float32)  # numeric features
        self.labels = torch.tensor(np.asarray(labels), dtype=torch.float32)      # target

        # Misaligned inputs would otherwise silently pair samples with the
        # wrong labels, because __len__ only looks at ``labels``.
        if not (len(self.authors) == len(self.features) == len(self.labels)):
            raise ValueError(
                "authors, features and labels must have the same length, got "
                f"{len(self.authors)}, {len(self.features)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(author_id, numeric_features, label)`` entry at ``idx``."""
        return self.authors[idx], self.features[idx], self.labels[idx]

# 2️⃣ Neural network combining embeddings + numeric features
class AuthorNet(nn.Module):
    """Binary classifier that mixes an author embedding with numeric features.

    The author ID is looked up in an embedding table, the numeric features
    pass through a small ReLU layer, and both representations are
    concatenated and fed to a head that ends in a sigmoid, so the output is a
    value in (0, 1) with shape ``(batch, 1)``.

    Parameters
    ----------
    num_authors : int
        Number of rows in the embedding table.
    embedding_dim : int
        Width of each author embedding vector.
    num_numeric_features : int
        Number of numeric input features per sample.
    """

    def __init__(self, num_authors, embedding_dim, num_numeric_features):
        super().__init__()
        hidden_width = 16

        # Lookup table: one learnable vector per author ID.
        self.embedding = nn.Embedding(num_authors, embedding_dim)

        # Branch that projects the numeric features to the hidden width.
        numeric_layers = [
            nn.Linear(num_numeric_features, hidden_width),
            nn.ReLU(),
        ]
        self.fc_numeric = nn.Sequential(*numeric_layers)

        # Head applied to the concatenated [embedding | numeric] vector.
        head_layers = [
            nn.Linear(embedding_dim + hidden_width, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, 1),
            nn.Sigmoid(),  # binary output
        ]
        self.fc_combined = nn.Sequential(*head_layers)

    def forward(self, author_ids, numeric_features):
        """Return the sigmoid output for a batch of author IDs and features."""
        author_vectors = self.embedding(author_ids)
        numeric_hidden = self.fc_numeric(numeric_features)
        merged = torch.cat((author_vectors, numeric_hidden), dim=1)
        return self.fc_combined(merged)

# 3️⃣ Get your training/test split
# Hyperparameters gathered in one place so they are easy to tune.
SEED = 42
BATCH_SIZE = 64
NUM_EPOCHS = 10
LEARNING_RATE = 0.001

# Seed torch so weight initialisation and DataLoader shuffling are repeatable.
torch.manual_seed(SEED)

# NOTE(review): the saved output of this cell is
# KeyError: "['rating number'] not in index", while the printed column list
# contains 'rating_number' -- presumably a helper in `functions` selects the
# column with a space instead of an underscore; confirm in functions.py.
X_author_train, X_author_test, X_num_train, X_num_test, y_train, y_test = functions.test_train_split()

# The embedding table is indexed by author ID, so it needs (largest ID + 1)
# rows across BOTH splits. Counting the distinct IDs in the training split
# under-sizes the table and any larger ID raises IndexError on lookup.
all_author_ids = np.concatenate(
    [np.asarray(X_author_train).ravel(), np.asarray(X_author_test).ravel()]
)
num_authors = int(all_author_ids.max()) + 1
num_numeric_features = X_num_train.shape[1]
embedding_dim = 16  # you can adjust

# 4️⃣ Create datasets and dataloaders
train_dataset = BooksDataset(X_author_train, X_num_train, y_train)
test_dataset = BooksDataset(X_author_test, X_num_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# 5️⃣ Instantiate model, loss, optimizer
model = AuthorNet(num_authors=num_authors, embedding_dim=embedding_dim, num_numeric_features=num_numeric_features)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# 6️⃣ Training loop
for epoch in range(NUM_EPOCHS):
    model.train()
    running_loss = 0.0
    for author_ids, numeric_features, labels in train_loader:
        optimizer.zero_grad()
        # squeeze(1) drops only the trailing output dimension. A bare
        # squeeze() would also drop the batch dimension when the last batch
        # holds a single sample, and BCELoss would then reject the shapes.
        outputs = model(author_ids, numeric_features).squeeze(1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}")

# 7️⃣ Evaluation
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for author_ids, numeric_features, labels in test_loader:
        outputs = model(author_ids, numeric_features).squeeze(1)
        # Threshold the sigmoid output at 0.5 to get a hard 0/1 prediction.
        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Test accuracy: {correct/total:.3f}")


KeyError: "['rating number'] not in index"

Index(['author_name', 'publisher', 'publisher_date', 'format', 'page_count',
       'language', 'category_level_2_sub', 'category_level_3_detail',
       'average_rating', 'rating_number', 'price_numeric', 'maybe_date',
       'author_id'],
      dtype='object')