In [1]:
import requests
import pandas as pd
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification
from sklearn.model_selection import train_test_split

# Replace 'YOUR_API_KEY' with your actual API key
api_key = 'AIzaSyA3J7SOTWbxxhxnUCgrX3PbGSwP3KadgmA'

# Define the search query
query = "science fiction"

# Define the URL
url = f"https://www.googleapis.com/books/v1/volumes?q={query}&key={api_key}"

# Send a GET request to the API
response = requests.get(url)

# Convert the response to JSON
data = response.json()

# Extract the necessary information from the data
books = []
for item in data['items']:
    try:
        title = item['volumeInfo']['title']
        description = item['volumeInfo']['description']
        categories = item['volumeInfo']['categories']
        books.append([title, description, categories])
    except KeyError:
        continue

# Convert the list of books to a DataFrame
df = pd.DataFrame(books, columns=['Title', 'Description', 'Categories'])

In [8]:
df.head(15)

Unnamed: 0,Title,Description,Categories,Category
0,The Big Book of Science Fiction,quite possibly greatest sciencefiction collect...,[Fiction],Fiction
1,The Year's Best Science Fiction: Twenty-Fifth ...,collection best story published 2007,[Fiction],Fiction
2,The Year's Best Science Fiction: Twenty-First ...,story collection imaginatively take reader far...,[Fiction],Fiction
3,The Year's Best Science Fiction: Twenty-Second...,science fiction two dozen outstanding tale wri...,[Fiction],Fiction
4,The Year's Best Science Fiction: Twenty-Eighth...,collect short story exploring theme time space...,[Fiction],Fiction
5,Twenty-First Century Science Fiction,twentyfirst century science fiction enormous a...,[Fiction],Fiction
6,The Year's Best Science Fiction: Thirty-Fifth ...,multiple locus awardwinning annual collection ...,[Fiction],Fiction
7,The Year's Best Science Fiction: Twentieth Ann...,widely regarded one essential book every scien...,[Fiction],Fiction
8,The Seven Beauties of Science Fiction,major critical work one preeminent voice scien...,[Literary Criticism],Literary Criticism
9,Science Fiction and Philosophy,featuring numerous update enhancement science ...,[Philosophy],Philosophy


In [2]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

# Download the necessary NLTK data
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Define the stop words
stop_words = set(stopwords.words('english'))

# Define the lemmatizer
lemmatizer = WordNetLemmatizer()

# Define a function to preprocess the text
def preprocess_text(text):
    # Convert the text to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize the text
    words = word_tokenize(text)
    # Remove stop words and lemmatize the words
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    # Join the words back into a string
    text = ' '.join(words)
    return text

# Preprocess the descriptions
df['Description'] = df['Description'].apply(preprocess_text)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vineethsai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vineethsai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vineethsai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer to the descriptions
X = vectorizer.fit_transform(df['Description'])

# Convert the matrix to a DataFrame
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

In [4]:
# Convert the categories to a format that can be used by the classifier
df['Category'] = df['Categories'].apply(lambda x: x[0] if x else '')

# Encode the categories
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(df['Category'])

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(tfidf_df, y, test_size=0.2, random_state=42)

# Initialize the classifier
clf = MultinomialNB()

# Train the classifier
clf.fit(X_train, y_train)

In [6]:
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
# Predict the categories of the test set
y_pred = clf.predict(X_test)

# Print the accuracy score
print("Accuracy:", accuracy_score(y_test, y_pred))

# Get the unique classes in y_test and y_pred
unique_classes = np.unique(np.concatenate((y_test, y_pred)))

# Convert the unique classes to their original labels
unique_classes = le.inverse_transform(unique_classes)

# Print the classification report
print(classification_report(y_test, y_pred, target_names=unique_classes))

Accuracy: 0.5
                    precision    recall  f1-score   support

           Fiction       0.50      1.00      0.67         1
Literary Criticism       0.00      0.00      0.00         1

          accuracy                           0.50         2
         macro avg       0.25      0.50      0.33         2
      weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
def predict_category(description):
    # Preprocess the text
    description = preprocess_text(description)
    # Convert the text to TF-IDF scores
    X = vectorizer.transform([description])
    # Convert the matrix to a DataFrame with feature names
    X = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
    # Predict the category
    category = clf.predict(X)
    # Convert the category back to its original label
    category = le.inverse_transform(category)
    return category[0]

# Test the function with a book description
description = "A thrilling new novel from the best-selling author of 'The Martian.'"
print("Predicted category:", predict_category(description))

Predicted category: Fiction
