In [1]:
import numpy as np 
import pandas as pd
from tqdm import tqdm

In [2]:
# Load the book table from book_cleaned.csv, then display how many rows
# carry each value of the raw `categories` column.
books = pd.read_csv('book_cleaned.csv')
books['categories'].value_counts().reset_index()

Unnamed: 0,categories,count
0,Fiction,2111
1,Juvenile Fiction,390
2,Biography & Autobiography,311
3,History,207
4,Literary Criticism,124
...,...,...
474,Conspiracies,1
475,Brothers and sisters,1
476,Rock musicians,1
477,Community life,1


In [3]:
# Show only the raw categories that occur on more than 50 books.
category_counts = books['categories'].value_counts().reset_index()
category_counts[category_counts['count'] > 50]

Unnamed: 0,categories,count
0,Fiction,2111
1,Juvenile Fiction,390
2,Biography & Autobiography,311
3,History,207
4,Literary Criticism,124
5,Philosophy,117
6,Religion,117
7,Comics & Graphic Novels,116
8,Drama,86
9,Juvenile Nonfiction,57


In [4]:
# Group the raw category labels by the simplified bucket they collapse into,
# then invert that grouping into the raw -> simplified lookup used by `map`.
_simple_to_raw = {
    "Fiction": ['Fiction', 'Comics & Graphic Novels', 'Drama', 'Poetry'],
    "Children's Fiction": ['Juvenile Fiction'],
    "Nonfiction": ['Biography & Autobiography', 'History', 'Literary Criticism',
                   'Philosophy', 'Religion', 'Science'],
    "Children's Nonfiction": ['Juvenile Nonfiction'],
}
categories_mapping = {
    raw_label: simple_label
    for simple_label, raw_labels in _simple_to_raw.items()
    for raw_label in raw_labels
}

# Raw categories that are not keys of the mapping come out as NaN here.
books['simple_categories'] = books['categories'].map(categories_mapping)

In [5]:
from transformers import pipeline
# Candidate labels handed to the zero-shot classifier.
fiction_categories = ['Fiction', 'Nonfiction']
# Zero-shot classification pipeline backed by the facebook/bart-large-mnli checkpoint.
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")

  from .autonotebook import tqdm as notebook_tqdm
Device set to use mps:0


In [6]:
# Smoke-test the classifier on a single description already mapped to "Fiction"
# (the fifth such row).
is_fiction = books['simple_categories'] == "Fiction"
sequence = books.loc[is_fiction, "description"].iloc[4]
classifier(sequence, fiction_categories)

{'sequence': 'For sixty years, Jewish refugees and their descendants have prospered in the Federal District of Sitka, a "temporary" safe haven created in the wake of revelations of the Holocaust and the shocking 1948 collapse of the fledgling state of Israel. Proud, grateful, and longing to be American, the Jews of the Sitka District have created their own little world in the Alaskan panhandle, a vibrant, gritty, soulful, and complex frontier city that moves to the music of Yiddish. For sixty years they have been left alone, neglected and half-forgotten in a backwater of history. Now the District is set to revert to Alaskan control, and their dream is coming to an end: once again the tides of history threaten to sweep them up and carry them off into the unknown. But homicide detective Meyer Landsman of the District Police has enough problems without worrying about the upcoming Reversion. His life is a shambles, his marriage a wreck, his career a disaster. He and his half-Tlingit partne

In [7]:
def generate_prediction(sequence, categories):
    """Return the candidate label that the zero-shot classifier scores highest.

    Args:
        sequence: Text to classify (in this notebook, a book description).
        categories: Candidate labels passed to the classifier.

    Returns:
        The entry of the classifier output's ``labels`` list that sits at the
        position of the largest value in its ``scores`` list.
    """
    # Run the model exactly once and reuse the result. The previous version
    # invoked the classifier three times per call and always scored against
    # the global `fiction_categories`, silently ignoring `categories`.
    classifier_output = classifier(sequence, categories)
    max_idx = int(np.argmax(classifier_output['scores']))

    return classifier_output['labels'][max_idx]

In [8]:
actual_cats = []
pred_cats = []

# Select the Fiction descriptions once, up front, instead of re-filtering and
# re-indexing the whole frame on every iteration. head(200) takes the same
# first 200 rows the index loop visited and does not fail if fewer exist.
fiction_sample = books.loc[books['simple_categories'] == "Fiction", "description"].head(200)

for sequence in tqdm(fiction_sample):
    actual_cats.append("Fiction")
    pred_cats.append(generate_prediction(sequence, fiction_categories))
    

100%|██████████| 200/200 [01:03<00:00,  3.16it/s]


In [9]:
# Select the Nonfiction descriptions once, up front, instead of re-filtering
# and re-indexing the whole frame on every iteration. head(200) takes the same
# first 200 rows the index loop visited and does not fail if fewer exist.
nonfiction_sample = books.loc[books['simple_categories'] == "Nonfiction", "description"].head(200)

# Results are appended to the actual_cats / pred_cats lists created earlier.
for sequence in tqdm(nonfiction_sample):
    actual_cats.append("Nonfiction")
    pred_cats.append(generate_prediction(sequence, fiction_categories))

100%|██████████| 200/200 [01:00<00:00,  3.32it/s]


In [10]:
# Put the known labels and the classifier's predictions side by side,
# one row per evaluated description, and preview the first rows.
preds_df = pd.DataFrame({
    "actual_cats": actual_cats,
    "pred_cats": pred_cats
}
)
preds_df.head()

Unnamed: 0,actual_cats,pred_cats
0,Fiction,Fiction
1,Fiction,Fiction
2,Fiction,Fiction
3,Fiction,Nonfiction
4,Fiction,Fiction


In [11]:
# 1 where the predicted label equals the known label, 0 otherwise.
labels_match = preds_df["actual_cats"] == preds_df["pred_cats"]
preds_df["correct_pred"] = labels_match.astype(int)

In [12]:
# Accuracy: the mean of the 0/1 correctness flags, i.e. the share of matches.
acc = preds_df["correct_pred"].mean()
print(acc)

0.7875


In [13]:
# Books whose raw category had no entry in the mapping (simple_categories is NaN);
# keep just the identifier and the text needed for classification.
has_no_simple_category = books["simple_categories"].isna()
missing_cat = (
    books.loc[has_no_simple_category, ["isbn13", "description"]]
    .reset_index(drop=True)
)
missing_cat

Unnamed: 0,isbn13,description
0,9780002261982,A new 'Christie for Christmas' -- a full-lengt...
1,9780006280897,Lewis' work on the nature of love divides love...
2,9780006280934,"""In The Problem of Pain, C.S. Lewis, one of th..."
3,9780006380832,Until Vasco da Gama discovered the sea-route t...
4,9780006470229,A new-cover reissue of the fourth book in the ...
...,...,...
1449,9788125026600,Not only does Nietzsche for Beginners delve in...
1450,9788171565641,"Forster's lively, informed originality and wit..."
1451,9788172235222,On A Train Journey Home To North India After L...
1452,9788173031014,This book tells the tale of a man who goes on ...


In [14]:
isbn = []
preds = []

# Classify every uncategorised description, recording its ISBN alongside the
# predicted label so the two lists stay aligned row for row.
isbn_column = missing_cat['isbn13']
description_column = missing_cat['description']

for row in tqdm(range(len(missing_cat))):
    isbn.append(isbn_column[row])
    preds.append(generate_prediction(description_column[row], fiction_categories))
    

100%|██████████| 1454/1454 [06:01<00:00,  4.02it/s]


In [54]:
# Collect the ISBNs and their predicted labels into a frame keyed by isbn13,
# and preview the first rows.
missing_preds_df = pd.DataFrame({
    "isbn13": isbn,
    "predicted_categories": preds
})
missing_preds_df.head()

Unnamed: 0,isbn13,predicted_categories
0,9780002261982,Fiction
1,9780006280897,Nonfiction
2,9780006280934,Nonfiction
3,9780006380832,Nonfiction
4,9780006470229,Fiction


In [65]:
# Attach the predicted label to each book by ISBN; books that already had a
# simplified category get NaN in `predicted_categories`.
books = pd.merge(books, missing_preds_df, on="isbn13", how="left")

# Keep the mapped category where it exists, otherwise fall back to the prediction.
books["simple_categories"] = books["simple_categories"].fillna(books["predicted_categories"])

# Drop only the helper column this cell added. The previous version also named
# `predicted_categories_x` / `predicted_categories_y`, which exist only when the
# merge has been run more than once, so it raised KeyError on a fresh kernel.
books = books.drop(columns=["predicted_categories"])

In [68]:
# Display the (rows, columns) of `books` after the predicted categories were filled in.
books.shape

(5197, 14)

In [69]:
# Count how many books carry one of these genre names as their raw category
# (compared case-insensitively).
genre_labels = [
    "romance",
    "science fiction",
    "fantasy",
    "scifi",
    "horror",
    "mystery",
    "thriller",
    "comedy",
    "crime",
    "historical",
]
is_genre_labelled = books['categories'].str.lower().isin(genre_labels)
books[is_genre_labelled].shape

(16, 14)

In [70]:
# Write the frame, including the simple_categories column, to CSV without the index column.
books.to_csv('book_with_categories.csv', index=False)