# Using a Neural Network approach to see whether an author's name constitutes a critical/commercial hit or flop:


In [7]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import functions
from sklearn.preprocessing import LabelEncoder, StandardScaler
%pip install torch
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, Dataset
import torch.optim as optim
from sklearn.metrics import f1_score 

Note: you may need to restart the kernel to use updated packages.


Checking the amount of unique authors in the dataset. Credit to ChatGPT for showing how to do so:


In [2]:
df_metadata = functions.get_data()
num_authors = df_metadata['author_name'].nunique()
print(num_authors)

12877


Over 12,000 authors. Statiscally, most authors have 1 book, so I will check if that is the case. Credit to ChatGPT which showed me how to code up a solution to this:

In [3]:
df_metadata['author_name'].value_counts().describe()

count    12877.000000
mean         1.553157
std          1.800488
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max         76.000000
Name: count, dtype: float64

This shows that the majority of authors have only 1 book to their names making it likely that they are debut authors. Lastly, it can be shown that the maximum number of books an author has got on the Amazon storefront is 76. This shows that author name alone is not enough to determine a critical or commercial success. As such, my neural network will use the context of this information to determine whether the general audience would support a given book both financially and critically. 

## The Foundations of the Neural Network:

Credit to ChatGPT for showing me how to lay the groundwork for my neural network:

In [8]:
class BooksDataset(Dataset):
    def __init__(self, authors, features, labels):
        self.authors = torch.tensor(authors, dtype=torch.long)       # author IDs for embeddings
        self.features = torch.tensor(features, dtype=torch.float32)  # numeric features
        self.labels = torch.tensor(labels, dtype=torch.float32)      # target

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.authors[idx], self.features[idx], self.labels[idx]
    
class AuthorNet(nn.Module):
    def __init__(self, num_authors, embedding_dim, num_numeric_features):
        super(AuthorNet, self).__init__()
        # Author embedding
        self.embedding = nn.Embedding(num_authors, embedding_dim)
        # Fully connected layers for numeric features
        self.fc_numeric = nn.Sequential(
            nn.Linear(num_numeric_features, 16),
            nn.ReLU()
        )
        # Combine embeddings + numeric features
        self.fc_combined = nn.Sequential(
            nn.Linear(embedding_dim + 16, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
            nn.Sigmoid()  # binary output
        )

    def forward(self, author_ids, numeric_features):
        x_author = self.embedding(author_ids)
        x_numeric = self.fc_numeric(numeric_features)
        x = torch.cat([x_author, x_numeric], dim=1)
        x = self.fc_combined(x)
        return x

X_author_train, X_author_test, X_num_train, X_num_test, y_train, y_test = functions.test_train_split()

# Map author IDs to consecutive integers
author_to_idx = {author: i for i, author in enumerate(sorted(set(X_author_train)))}
X_author_train = np.array([author_to_idx[a] for a in X_author_train])
X_author_test = np.array([author_to_idx.get(a, 0) for a in X_author_test])  # unknown authors -> 0
num_authors = len(author_to_idx)

# Scale numeric features
scaler = StandardScaler()
X_num_train = scaler.fit_transform(X_num_train)
X_num_test = scaler.transform(X_num_test)

num_numeric_features = X_num_train.shape[1]
embedding_dim = 16 

train_dataset = BooksDataset(X_author_train, X_num_train, y_train)
test_dataset = BooksDataset(X_author_test, X_num_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AuthorNet(num_authors=num_authors, embedding_dim=embedding_dim, num_numeric_features=num_numeric_features)
model = model.to(device)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for author_ids, numeric_features, labels in train_loader:
        author_ids = author_ids.to(device)
        numeric_features = numeric_features.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(author_ids, numeric_features).view(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}")

model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for author_ids, numeric_features, labels in test_loader:
        author_ids = author_ids.to(device)
        numeric_features = numeric_features.to(device)
        labels = labels.to(device)

        outputs = model(author_ids, numeric_features).view(-1)
        predicted = (outputs > 0.5).float()

        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

accuracy = np.mean(np.array(y_true) == np.array(y_pred))
f1 = f1_score(y_true, y_pred)

print(f"Test Accuracy: {accuracy:.3f}")
print(f"Test F1 Score: {f1:.3f}")

Epoch 1, Loss: 0.5555
Epoch 2, Loss: 0.2473
Epoch 3, Loss: 0.0996
Epoch 4, Loss: 0.0610
Epoch 5, Loss: 0.0452
Epoch 6, Loss: 0.0368
Epoch 7, Loss: 0.0303
Epoch 8, Loss: 0.0265
Epoch 9, Loss: 0.0229
Epoch 10, Loss: 0.0203
Test Accuracy: 0.987
Test F1 Score: 0.979
