# Practical PyTorch: Classify Authors with a sentence-Level RNN

In [2]:
!pip install tensorflow

Collecting tensorflow
  Using cached tensorflow-2.13.1-cp38-cp38-macosx_12_0_arm64.whl.metadata (2.6 kB)
INFO: pip is looking at multiple versions of tensorflow to determine which version is compatible with other requirements. This could take a while.
  Using cached tensorflow-2.13.0-cp38-cp38-macosx_12_0_arm64.whl.metadata (2.6 kB)
Collecting tensorflow-macos==2.13.0 (from tensorflow)
  Using cached tensorflow_macos-2.13.0-cp38-cp38-macosx_12_0_arm64.whl.metadata (3.2 kB)
Collecting absl-py>=1.0.0 (from tensorflow-macos==2.13.0->tensorflow)
  Using cached absl_py-2.0.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow-macos==2.13.0->tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting flatbuffers>=23.1.21 (from tensorflow-macos==2.13.0->tensorflow)
  Using cached flatbuffers-23.5.26-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow-macos==2.13.0->tensorflow)
  Using cached gast-0.4.0-

### Imports

In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

### Read Input (.csv with Author, Sentence)

In [10]:
data = pd.read_csv('grand_sentences.csv', encoding='latin-1')
data['Sentence'] = data['Sentence'].replace(np.nan, '')
sentences = data['Sentence'].values
authors = data['Author'].values

### Tokenize

In [13]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
vocab_size = len(tokenizer.word_index) + 1


sequences = tokenizer.texts_to_sequences(sentences)
max_sequence_len = max([len(seq) for seq in sequences])

### Pad sequences

In [14]:
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_len, padding='post')

### Encode Author Names

In [21]:
unique_authors = list(set(authors))
author_to_label = {author: idx for idx, author in enumerate(unique_authors)}
labels = np.array([author_to_label[author] for author in authors])  # Convert to numpy array
len(labels)

16887

### Split data into training and testing sets

In [22]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

### Build an RNN model

In [23]:
embedding_dim = 100
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_len))
model.add(LSTM(units=128))
model.add(Dense(units=len(unique_authors), activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

### Fit the model

In [24]:
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x17c7a8a90>

### Evaluate the model

In [25]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 19.15%
