## Book recommandation system

In [237]:
import json 
import pandas as pd 
import pprint
import re
import multiprocessing as mp
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords', quiet=True)
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to /Users/anavekua/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Data books

In [240]:
# Load books data
file_paths = "data/bestsellers_with_abstract_and_genre_3350.json"
with open(file_paths, 'r') as file:
        data = json.load(file)

df = pd.DataFrame(data)

In [241]:
# Books per category
grouped_df = df.groupby('genre').count()
print(grouped_df['title'])

genre
Arts & Entertainment             55
Biographies & Memoirs          1237
Business & Personal Finance     166
Comics & Graphic Novels           1
Computers & Internet              3
Cookbooks, Food & Wine            4
Fiction & Literature             32
Health, Mind & Body             113
History                         387
Humor                           121
January 30                        1
Kids                              1
Lifestyle & Home                 22
Mysteries & Thrillers             1
N/A                             203
Nonfiction                      199
Parenting                        22
Politics & Current Events       432
Professional & Technical         48
Reference                         2
Religion & Spirituality          76
Romance                           3
Sci-Fi & Fantasy                  1
Science & Nature                119
Sports & Outdoors                80
Travel & Adventure               19
Young Adult                       2
Name: title, dtype: in

In [185]:
# Remove N/A category
df = df[df['genre'] != 'N/A']

# Select only books in 'Politics & Current Events'
corpus_book = df[df['genre'] == 'Politics & Current Events']

#### Data Articles


In [242]:
# Load data articles archives
file_paths = "/Users/anavekua/Documents/DataScienTest/API_nyt_json/nytimes_archives_elections_articles.json"

with open(file_paths, 'r') as file:
        data = json.load(file)

articles = pd.DataFrame(data)

In [245]:
#Load an article from archive
a = 100000 # article number
article = articles.iloc[a, :]
print("Headline:", article['headline_main'])
pprint.pprint(article['abstract'])

Headline: Republicans Champion ‘Voluntary Taxes’
('Republicans will do anything to avoid raising taxes, and their latest bill '
 'to encourage voluntary contributions is nothing more than a distraction, an '
 'economist writes.')


#### Data combination

In [246]:
# Combine all books abstracts with article's abstract
corpus_article = article['abstract']
corpus_books_article = pd.DataFrame(corpus_book['abstract'])
corpus_books_article.loc[-1] = corpus_article # add article's abstract at the end of the df
corpus_books_article.tail(3)

Unnamed: 0,abstract
3344,"NEW YORK TIMES BESTSELLER • From bestselling author and longtime New York Times columnist Frank Bruni comes a lucid, powerful examination of the ways in which grievance has come to define our current culture and politics, on both the right and left.The twists and turns of American politics are unpredictable, but the tone is a troubling given. It’s one of grievance. More and more Americans are convinced that they’re losing because somebody else is winning. More and more tally their slights, measure their misfortune, and assign particular people responsibility for it. The blame game has become the country’s most popular sport and victimhood its most fashionable garb.Grievance needn’t be bad. It has done enormous good. The United States is a nation born of grievance, and across the nearly two hundred and fifty years of our existence as a country, grievance has been the engine of morally urgent change. But what happens when all sorts of grievances—the greater ones, the lesser ones, the authentic, the invented—are jumbled together? When people take their grievances to lengths that they didn’t before? A violent mob storms the US Capitol, rejecting the results of a presidential election. Conspiracy theories flourish. Fox News knowingly peddles lies in the service of profit. College students chase away speakers, and college administrators dismiss instructors for dissenting from progressive orthodoxy. Benign words are branded hurtful; benign gestures are deemed hostile. And there’s a potentially devastating erosion of the civility, common ground, and compromise necessary for our democracy to survive.How did we get here? What does it say about us, and where does it leave us? The Age of Grievance examines these critical questions and charts a path forward."
3349,"A NEW YORK TIMES, USA TODAY BESTSELLER! The New York Times bestselling author, governor of South Dakota, and former congresswoman tells eye-opening stories of DC dysfunction, shares lessons from leading her state through unprecedented challenge, and explains how we seize this moment to move America forward. Any elected official can talk about how broken our government is. But their solutions always seem to involve more money, new programs—and reelection to another term. Few offer an unfiltered glimpse into how government actually works, empowering citizens with the knowledge to be part of the solution. Governor Kristi Noem never planned on being in politics. But her concern for our nation compelled her, on a local, national, and global level. Because she took a different path into public service, as a concerned mom and rancher, her insights help every citizen understand how positive change really happens, despite the dysfunction in Washington DC. Governor Noem explains how the country is not going back to the Republican party of the 2000s. And that’s a good thing. This book is packed with surprising stories and practical lessons from the front lines of the battle. And she names names. ​ A lot has changed since 2016, and based on her accomplishments in Congress and as Governor, no one is better equipped than Kristi Noem to explain the tremendous opportunities this opens up for every American."
-1,"Republicans will do anything to avoid raising taxes, and their latest bill to encourage voluntary contributions is nothing more than a distraction, an economist writes."


# TF-IDF and similarity scores

### Data preprocessing

In [247]:
def text_cleaning(text): 
    
    import re 
    # Remove #1 from, for exemple, #1 NATIONAL BESTSELLER
    remove_hastag1 = re.sub(r'\#\d', '', text)
    # Remove all numbers 
    remove_numbers = re.sub(r'\d+', '', remove_hastag1)
    # Regular expression to match fully uppercase words and words containing uppercase letters
    remove_full_upper = re.sub(r'\b[A-Z]+\b', '', remove_numbers)
    # lowercasing
    lowercased_text = remove_full_upper.lower()
    # remove everything that is not a word character (including letters, digits and underscore) or a blank space. 
    remove_punctuation = re.sub(r'[^\w\s]', '', lowercased_text)
    # Remove any sequence of one or more white space by one white space. Removes white spaces at begining and end of word. It also removes '\xa0', Unicode for non-breaking space. 
    remove_white_space = re.sub(r'\s+', ' ', remove_punctuation).strip()

    return (remove_white_space)

def tokenization(text_clean):
    # Tokenization = Breaking down each text into words put in an array based on blank spaces.
    from nltk.tokenize import word_tokenize
    tokenized_text = word_tokenize(text_clean)
    return tokenized_text

def remove_stop_words(abstract_token):
# Stop Words/filtering = Removing irrelevant words
    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('english'))
    stopwords_removed = [word for word in abstract_token if word not in stopwords]
    return stopwords_removed

# Stemming = Transforming words into their base form
def stemming(abstract_stop_words):
    from nltk.stem import PorterStemmer
    ps = PorterStemmer()
    stemmed_text = [ps.stem(word) for word in abstract_stop_words]
    return stemmed_text

def preprocessing_abstract(abstract):
    abstract_clean = text_cleaning(abstract)
    abstract_token = tokenization(abstract_clean)
    abstract_stop_words = remove_stop_words(abstract_token) 
    abstract_stemming = stemming(abstract_stop_words)
    return abstract_stemming

In [253]:
corpus_books_article['abstract_preprocessed'] = None
col_index = corpus_books_article.columns.get_loc('abstract_preprocessed')

# Loop through each abstract, preprocess it, transform list in string, update the DataFrame
for index, abstract in enumerate(corpus_books_article['abstract']):
    abstract_preprocessed = preprocessing_abstract(abstract)
    string = ' '.join([str(item)for item in abstract_preprocessed])
    corpus_books_article.iloc[index, col_index] = string

### TF - IDF computation

In [254]:
# Count TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus_books_article['abstract_preprocessed']) # return a document-term matrix

# Get words 
feature_names = vectorizer.get_feature_names_out()

# Combine corpus with the weighted word matrix by creating 'id' index and merge
corpus_books_article['id'] = range(0, len(corpus_books_article))
df_tfidf_prev = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
df_tfidf_prev['id'] = range(0, len(df_tfidf_prev))
df_tfidf = pd.merge(corpus_books_article, df_tfidf_prev, on='id')

In [255]:
def get_top_10_most_important_words(row_num):
    """
    return :
    - the 10 top words with the highest weight based of a book abstract based on the row number
    - the abstract
    """
    abstract = df_tfidf.iloc[row_num, df_tfidf.columns.get_loc("abstract")]
    row = df_tfidf.iloc[row_num, df_tfidf.columns.get_loc("id"):]
    row_sort = row[1:].sort_values(ascending = False)
    top_10_words = row_sort[:10]
    return print(top_10_words), pprint.pprint(abstract)

### Cosine similarity computation

In [256]:
# Computatuon of Cosine similarity. The higher the cosim value, the more similar the elements are. 
cosim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosim = pd.DataFrame(cosim)
cosim_article = cosim.tail(1)

### Books to article selection

In [279]:
# Select the last row of the cosine matrix
cosim_artile_sort = cosim_article.iloc[-1,:].sort_values(ascending = False)

# Select the 5 books that are the most similare to the article
top_5_books = cosim_artile_sort[1:5]
top_5_books = pd.DataFrame(top_5_books)
top_5_books.columns = ['cosine']
top_5_books = top_5_books.reset_index()
top_5_books

Unnamed: 0,index,cosine
0,230,0.075591
1,101,0.062915
2,403,0.060432
3,335,0.058875
