In [38]:
import numpy as np 
import pandas as pd
from tqdm import tqdm

In [5]:
books = pd.read_csv('book_cleaned.csv')
books['categories'].value_counts().reset_index()

Unnamed: 0,categories,count
0,Fiction,2111
1,Juvenile Fiction,390
2,Biography & Autobiography,311
3,History,207
4,Literary Criticism,124
...,...,...
474,Conspiracies,1
475,Brothers and sisters,1
476,Rock musicians,1
477,Community life,1


In [6]:
books['categories'].value_counts().reset_index().query('count > 50')

Unnamed: 0,categories,count
0,Fiction,2111
1,Juvenile Fiction,390
2,Biography & Autobiography,311
3,History,207
4,Literary Criticism,124
5,Philosophy,117
6,Religion,117
7,Comics & Graphic Novels,116
8,Drama,86
9,Juvenile Nonfiction,57


In [10]:
categories_mapping = {'Fiction' : "Fiction",
 'Juvenile Fiction': "Children's Fiction",
 'Biography & Autobiography': "Nonfiction",
 'History': "Nonfiction",
 'Literary Criticism': "Nonfiction",
 'Philosophy': "Nonfiction",
 'Religion': "Nonfiction",
 'Comics & Graphic Novels': "Fiction",
 'Drama': "Fiction",
 'Juvenile Nonfiction': "Children's Nonfiction",
 'Science': "Nonfiction",
 'Poetry': "Fiction"}

books['simple_categories'] = books['categories'].map(categories_mapping).fillna('Other')

In [13]:
books[~(books["simple_categories"].isna())]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,description_length_bin,title_and_subtitle,tagged_description,simple_categories
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,101-200,Gilead,9780002005883 A NOVEL THAT READERS and critics...,Fiction
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,200+,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...,Other
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,51-100,Rage of angels,"9780006178736 A memorable, mesmerizing heroine...",Fiction
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,21-50,The Four Loves,9780006280897 Lewis' work on the nature of lov...,Other
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,51-100,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le...",Other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5192,9788172235222,8172235224,Mistaken Identity,Nayantara Sahgal,Indic fiction (English),http://books.google.com/books/content?id=q-tKP...,On A Train Journey Home To North India After L...,2003.0,2.93,324.0,0.0,200+,Mistaken Identity,9788172235222 On A Train Journey Home To North...,Other
5193,9788173031014,8173031010,Journey to the East,Hermann Hesse,Adventure stories,http://books.google.com/books/content?id=rq6JP...,This book tells the tale of a man who goes on ...,2002.0,3.70,175.0,24.0,51-100,Journey to the East,9788173031014 This book tells the tale of a ma...,Other
5194,9788179921623,817992162X,The Monk Who Sold His Ferrari: A Fable About F...,Robin Sharma,Health & Fitness,http://books.google.com/books/content?id=c_7mf...,"Wisdom to Create a Life of Passion, Purpose, a...",2003.0,3.82,198.0,1568.0,101-200,The Monk Who Sold His Ferrari: A Fable About F...,9788179921623 Wisdom to Create a Life of Passi...,Other
5195,9788185300535,8185300534,I Am that,Sri Nisargadatta Maharaj;Sudhakar S. Dikshit,Philosophy,http://books.google.com/books/content?id=Fv_JP...,This collection of the timeless teachings of o...,1999.0,4.51,531.0,104.0,101-200,I Am that: Talks with Sri Nisargadatta Maharaj,9788185300535 This collection of the timeless ...,Nonfiction


In [7]:
from transformers import pipeline
fiction_categories = ['Fiction', 'Nonfiction']
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")

  from .autonotebook import tqdm as notebook_tqdm
Device set to use mps:0


In [29]:
sequence = books.loc[books['simple_categories'] == "Fiction", "description"].reset_index(drop=True)[4]
classifier(sequence, fiction_categories)

{'sequence': 'For sixty years, Jewish refugees and their descendants have prospered in the Federal District of Sitka, a "temporary" safe haven created in the wake of revelations of the Holocaust and the shocking 1948 collapse of the fledgling state of Israel. Proud, grateful, and longing to be American, the Jews of the Sitka District have created their own little world in the Alaskan panhandle, a vibrant, gritty, soulful, and complex frontier city that moves to the music of Yiddish. For sixty years they have been left alone, neglected and half-forgotten in a backwater of history. Now the District is set to revert to Alaskan control, and their dream is coming to an end: once again the tides of history threaten to sweep them up and carry them off into the unknown. But homicide detective Meyer Landsman of the District Police has enough problems without worrying about the upcoming Reversion. His life is a shambles, his marriage a wreck, his career a disaster. He and his half-Tlingit partne

In [37]:
def generate_prediction(sequence, categories):
    classifier_output = classifier(sequence, categories)
    max_idx = np.argmax(classifier(sequence, fiction_categories)['scores'])
    max_label = classifier(sequence, fiction_categories)['labels'][max_idx]
    
    return max_label

In [43]:
actual_cats = []
pred_cats = []

for i in tqdm(range(0, 200)):
    sequence = books.loc[books['simple_categories'] == "Fiction", "description"].reset_index(drop=True)[i]
    actual_cats += ["Fiction"]
    pred_cats += [generate_prediction(sequence, fiction_categories)]
    

100%|██████████| 200/200 [00:53<00:00,  3.75it/s]


In [44]:
for i in tqdm(range(0, 200)):
    sequence = books.loc[books['simple_categories'] == "Nonfiction", "description"].reset_index(drop=True)[i]
    actual_cats += ["Nonfiction"]
    pred_cats += [generate_prediction(sequence, fiction_categories)]

100%|██████████| 200/200 [00:57<00:00,  3.46it/s]


In [46]:
preds_df = pd.DataFrame({
    "actual_cats": actual_cats,
    "pred_cats": pred_cats
}
)
preds_df.head()

Unnamed: 0,actual_cats,pred_cats
0,Fiction,Fiction
1,Fiction,Fiction
2,Fiction,Fiction
3,Fiction,Nonfiction
4,Fiction,Fiction


In [47]:
preds_df["correct_pred"] = (
    np.where(preds_df["actual_cats"] == preds_df["pred_cats"], 1, 0)
)

In [51]:
acc = (preds_df["correct_pred"].sum()) / len(preds_df)
print(acc)

0.7875
