In [1]:
import csv
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
import json
from IPython.display import clear_output

In [2]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.metrics import f1_score, accuracy_score

In [3]:
titles = []
authors = []
genres = []
summaries = []

with open('./data/book_summaries/booksummaries.txt', 'r') as f:
    reader = csv.reader(f, delimiter='\t')
    for row in reader:
        titles.append(row[2])
        authors.append(row[3])
        genres.append(row[5])
        summaries.append(row[6])

dataset = pd.DataFrame({
    'title': titles, 
    'author': authors, 
    'summary': summaries,
    'genre': genres}
)

dataset.drop(dataset[dataset['genre']==''].index, inplace=True)

In [4]:
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def process_text(text):
    eng_stopwords = set(stopwords.words('english'))
    text = re.sub('\'', '', text)
    text = re.sub('[^a-zA-Z]', ' ',text)
    text = ' '.join([stemmer.stem(lemmatizer.lemmatize(word)) for word in text.split()])
    text = ' '.join([word for word in text.split() if word not in eng_stopwords])
    text = ' '.join(text.split())
    text = text.lower()
    return text

def process_genre(raw_genre):
    return list(json.loads(raw_genre).values())

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yourlogarithm/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/yourlogarithm/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
dataset['summary'] = dataset['summary'].apply(process_text)
dataset['genre'] = dataset['genre'].apply(process_genre)

In [6]:
dataset['genre'][0]

['Roman à clef',
 'Satire',
 "Children's literature",
 'Speculative fiction',
 'Fiction']

In [7]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(dataset['genre'])

y = multilabel_binarizer.transform(dataset['genre'])
print(y.shape)

(12841, 227)


In [8]:
x_train, x_test, y_train, y_test = train_test_split(dataset['summary'], y, test_size=0.2)

In [9]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10_000)
x_train_vectorized = tfidf_vectorizer.fit_transform(x_train)
x_test_vectorized = tfidf_vectorizer.transform(x_test)

In [12]:
print(x_train_vectorized.shape)
print(y_train.shape)

(10272, 10000)
(10272, 227)


In [13]:
classifier = OneVsRestClassifier(LogisticRegression())

In [14]:
classifier.fit(x_train_vectorized, y_train)
clear_output()

In [22]:
print(f'Train score: {classifier.score(x_train_vectorized, y_train): .2%}')
print(f'Test score: {classifier.score(x_test_vectorized, y_test): .2%}')

Train score:  17.42%
Test score:  7.40%


In [23]:
predictions = classifier.predict(x_test_vectorized)
f1_score(y_test, predictions, average='micro'), accuracy_score(y_test, predictions)

(0.3704714640198511, 0.07395873880887505)

In [24]:
pred_proba = classifier.predict_proba(x_test_vectorized)
t = 0.3
pred_proba = (pred_proba >= t).astype(int)

f1_score(y_test, pred_proba, average="micro"), accuracy_score(y_test, pred_proba)

(0.5357556327366639, 0.11755546905410666)

In [25]:
def predict(m):
    m = process_text(m)
    m_vec = tfidf_vectorizer.transform([m])
    m_pred = classifier.predict(m_vec)
    return multilabel_binarizer.inverse_transform(m_pred)

In [26]:
for i in range(100):
    k = x_test.sample(1).index[0]
    print("Book: ", dataset['title'][k])
    print("Predicted genre: ", predict(x_test[k]))
    print("Actual genre: ", dataset['genre'][k], "\n")

Book:  A Summons to Memphis
Predicted genre:  [()]
Actual genre:  ['Novel'] 

Book:  The Humbling
Predicted genre:  [('Fiction', 'Novel')]
Actual genre:  ['Novel'] 

Book:  The Last Letter Home
Predicted genre:  [('Fiction',)]
Actual genre:  ['Fiction', 'Historical novel'] 

Book:  United!
Predicted genre:  [()]
Actual genre:  ['Novel'] 

Book:  The Candle in the Wind
Predicted genre:  [('Fantasy', 'Speculative fiction')]
Actual genre:  ['Speculative fiction', 'Fantasy'] 

Book:  The Bladerunner
Predicted genre:  [()]
Actual genre:  ['Science Fiction', 'Speculative fiction', 'Dystopia'] 

Book:  The Kid Who Only Hit Homers
Predicted genre:  [()]
Actual genre:  ["Children's literature"] 

Book:  Stormed Fortress
Predicted genre:  [('Fantasy', 'Speculative fiction')]
Actual genre:  ['Fantasy', 'High fantasy'] 

Book:  Dark Gold
Predicted genre:  [()]
Actual genre:  ['Speculative fiction', 'Fantasy', 'Fiction', 'Romance novel'] 

Book:  After 12,000 Years
Predicted genre:  [('Science Fict