Do not use any of the functions defined in this notebook; use `RecommendGenres.py` instead.

#### Data Processing and Cleaning

In [2]:
import pandas as pd

In [2]:
# Load the Goodreads dataset from a CSV file addressed relative to the working directory.
df = pd.read_csv('goodreads_data.csv')

In [3]:
# Print how often each distinct value of the 'Genres' column occurs.
print(df['Genres'].value_counts())

Genres
[]                                                                                                                     960
['Fiction']                                                                                                             49
['Fantasy']                                                                                                             42
['Nonfiction']                                                                                                          24
['Romance']                                                                                                             20
                                                                                                                      ... 
['Nonfiction', 'Self Help', 'Personal Development', 'Audiobook', 'Psychology', 'Christian', 'Philosophy']                1
['Nonfiction', 'Epic']                                                                                                   1
['Young A

In [4]:
# Remove the three columns that no later cell in this notebook references.
# One non-mutating drop() replaces repeated inplace=True calls: the result is
# rebound to `df`, which keeps the step chainable and easier to reason about.
df = df.drop(columns=['uselessNumbers', 'Num_Ratings', 'URL'])

In [5]:
# Keep only the rows whose 'Genres' value, rendered as text, is not the
# literal '[]' (i.e. discard books that list no genre at all).
df = df.loc[df['Genres'].astype(str).ne('[]')]

In [6]:
# Renumber the rows from 0; drop=True discards the old index instead of
# keeping it as an extra column.
df = df.reset_index(drop=True)

In [7]:
# Bare expression: the notebook renders the DataFrame as this cell's output.
df

Unnamed: 0,Book,Author,Description,Genres,Avg_Rating
0,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,"['Classics', 'Fiction', 'Historical Fiction', ...",4.27
1,Harry Potter and the Philosopher’s Stone (Harr...,J.K. Rowling,Harry Potter thinks he is an ordinary boy - un...,"['Fantasy', 'Fiction', 'Young Adult', 'Magic',...",4.47
2,Pride and Prejudice,Jane Austen,"Since its immediate success in 1813, Pride and...","['Classics', 'Fiction', 'Romance', 'Historical...",4.28
3,The Diary of a Young Girl,Anne Frank,Discovered in the attic in which she spent the...,"['Classics', 'Nonfiction', 'History', 'Biograp...",4.18
4,Animal Farm,George Orwell,Librarian's note: There is an Alternate Cover ...,"['Classics', 'Fiction', 'Dystopia', 'Fantasy',...",3.98
...,...,...,...,...,...
9035,Call To Crusade,Tom Vetter,"In Call To Crusade, Tom Vetter begins the Sieg...","['Historical Fiction', 'Historical']",4.56
9036,"Die Känguru-Chroniken (Die Känguru-Chroniken, #1)",Marc-Uwe Kling,"""Kannst du heute mal bezahlen?"", fragt das Kän...","['Humor', 'Fiction', 'Audiobook', 'German Lite...",4.30
9037,"Breeders (Breeders Trilogy, #1)",Ashley Quigley,How far would you go? If human society was gen...,"['Dystopia', 'Science Fiction', 'Post Apocalyp...",3.44
9038,The Republic of Trees,Sam Taylor,This dark fable tells the story of four Englis...,"['Fiction', 'Horror', 'Dystopia', 'Coming Of A...",3.29


In [8]:
# Write the current DataFrame to disk; index=False leaves the row index out of the file.
df.to_csv('cleaned_goodreads_data.csv', index=False)

#### Making the Model

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [10]:
# Text Vectorization
# Fit a TF-IDF vocabulary on the 'Genres' column and encode every row with it.
# NOTE(review): with TfidfVectorizer's default settings the text is lowercased
# and split into word tokens, so a multi-word genre such as 'Historical Fiction'
# presumably contributes the separate terms 'historical' and 'fiction' --
# confirm this is intended.

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Genres']) 

In [11]:
# Training Model
# X is produced by TfidfVectorizer and is therefore a SciPy sparse matrix.
# scikit-learn cannot build a ball tree on sparse input: it emits a warning
# and falls back to brute-force search. Requesting 'brute' explicitly gives
# the same neighbours without the misleading setting or the warning.

model = NearestNeighbors(n_neighbors=5, algorithm='brute')
model.fit(X)



In [12]:
# Predictions

def recommend_genres(genre, n_neighbors=None):
    """Return the 'Genres' entries of the rows closest to a genre query.

    The query is encoded with the module-level ``vectorizer`` and looked up
    in the module-level nearest-neighbour ``model``; the matching rows are
    read from the module-level ``df``.

    Parameters
    ----------
    genre : str
        Text describing the genre(s) to look for, e.g. ``'Dystopia'``.
    n_neighbors : int, optional
        Number of rows to return. ``None`` (the default) uses the value the
        model was constructed with.

    Returns
    -------
    pandas.Series
        The selected entries of ``df['Genres']``, in the order reported by
        ``model.kneighbors``. Empty when none of the query's terms occur in
        the vectorizer's vocabulary.
    """
    query = vectorizer.transform([genre])

    # An all-zero query vector carries no information about the request, so
    # any "neighbours" found for it would be arbitrary rows. Return an empty
    # selection rather than misleading recommendations.
    if query.nnz == 0:
        return df['Genres'].iloc[[]]

    # Only the row positions are needed; the distances are discarded.
    _, indices = model.kneighbors(query, n_neighbors=n_neighbors)
    return df['Genres'].iloc[indices[0]]

In [14]:
# Test: query the recommender with a single genre name and print the
# 'Genres' entries it returns.

print(recommend_genres('Dystopia'))

6904    ['Dystopia']
6114    ['Dystopia']
478     ['Dystopia']
6034    ['Dystopia']
6499    ['Dystopia']
Name: Genres, dtype: object


## Fixing descriptions

In [3]:
# Load the CSV again into a separate frame. Note that this reads
# 'goodreads_data.csv' itself, not the 'cleaned_goodreads_data.csv' file
# written earlier in this notebook.
df1 = pd.read_csv('goodreads_data.csv')

def truncate_description(description, max_length=380, suffix="..."):
    """Shorten an over-long description and mark the cut with ``suffix``.

    Parameters
    ----------
    description : object
        The value to shorten. Anything that is not a ``str`` (for example a
        missing value) is returned unchanged.
    max_length : int, optional
        Maximum number of characters kept from ``description`` before the
        suffix is appended. Defaults to 380.
    suffix : str, optional
        Text appended to a shortened description. Defaults to ``"..."``.

    Returns
    -------
    object
        ``description`` itself when it is not a string or has at most
        ``max_length`` characters; otherwise its first ``max_length``
        characters followed by ``suffix``.

    Raises
    ------
    ValueError
        If ``max_length`` is negative.
    """
    if max_length < 0:
        # A negative bound would slice from the end of the string instead of
        # limiting its length.
        raise ValueError("max_length must be non-negative")

    # Non-strings and strings that already fit pass through untouched.
    if not isinstance(description, str) or len(description) <= max_length:
        return description

    return description[:max_length] + suffix

# Run truncate_description over every entry of the 'Description' column
# and store the shortened values back in that column.
df1['Description'] = df1['Description'].map(truncate_description)

In [7]:
# Save the modified dataframe to a new CSV file; index=False leaves the
# row index out of the file.
df1.to_csv('data_with_shorter_descriptions.csv', index=False)