In [47]:
import json 
import pandas as pd 
import pprint
import re
import multiprocessing as mp
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords', quiet=True)
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to /Users/anavekua/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
file_paths = "data/bestsellers_with_abstract_and_genre_3350.json"
with open(file_paths, 'r') as file:
        data = json.load(file)

df = pd.DataFrame(data)
df = df[df['genre'] != 'N/A']
books_political = df[df['genre'] == 'Politics & Current Events']

In [4]:
def text_cleaning(text): 
    
    import re 
    # Remove #1 from, for exemple, #1 NATIONAL BESTSELLER
    remove_hastag1 = re.sub(r'\#\d', '', text)
    # Remove all numbers 
    remove_numbers = re.sub(r'\d+', '', remove_hastag1)
    # Regular expression to match fully uppercase words and words containing uppercase letters
    remove_full_upper = re.sub(r'\b[A-Z]+\b', '', remove_numbers)
    # lowercasing
    lowercased_text = remove_full_upper.lower()
    # remove everything that is not a word character (including letters, digits and underscore) or a blank space. 
    remove_punctuation = re.sub(r'[^\w\s]', '', lowercased_text)
    # Remove any sequence of one or more white space by one white space. Removes white spaces at begining and end of word. It also removes '\xa0', Unicode for non-breaking space. 
    remove_white_space = re.sub(r'\s+', ' ', remove_punctuation).strip()

    return (remove_white_space)

def tokenization(text_clean):
    # Tokenization = Breaking down each text into words put in an array based on blank spaces.
    from nltk.tokenize import word_tokenize
    tokenized_text = word_tokenize(text_clean)
    return tokenized_text

def remove_stop_words(abstract_token):
# Stop Words/filtering = Removing irrelevant words
    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('english'))
    stopwords_removed = [word for word in abstract_token if word not in stopwords]
    return stopwords_removed

# Stemming = Transforming words into their base form
def stemming(abstract_stop_words):
    from nltk.stem import PorterStemmer
    ps = PorterStemmer()
    stemmed_text = [ps.stem(word) for word in abstract_stop_words]
    return stemmed_text

def preprocessing_abstract(abstract):
    abstract_clean = text_cleaning(abstract)
    abstract_token = tokenization(abstract_clean)
    abstract_stop_words = remove_stop_words(abstract_token) 
    abstract_stemming = stemming(abstract_stop_words)
    return abstract_stemming

In [5]:
# Initialize the new column for preprocessed abstracts
books_political['abstract_preprocessed'] = None

# Loop through each abstract, preprocess it, transform list in string, update the DataFrame
for index, abstract in enumerate(books_political['abstract']):
    abstract_preprocessed = preprocessing_abstract(abstract)
    string = ' '.join([str(item)for item in abstract_preprocessed])
    books_political.iloc[index, 7] = string


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_political['abstract_preprocessed'] = None


In [21]:
# Count TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(books_political['abstract_preprocessed'])

# Get words from stopwords array to use as headers
feature_names = vectorizer.get_feature_names_out()

# Combine header titles and weights
books_political['id'] = range(1, len(books_political) + 1)
df_tfidf_prev = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
df_tfidf_prev['id'] = range(1, len(df_tfidf_prev) + 1)

# Merge the original books_political DataFrame with the df_tfidf_prev DataFrame based on 'id'
df_tfidf = pd.merge(books_political, df_tfidf_prev, on='id')


In [57]:
# Get the TF-IDF vector for the first item (index 0)
vector1 = df_tfidf.iloc[0, 9:].values.reshape(1, -1)

# Get the TF-IDF vector for all items except the first item
vectors = df_tfidf.iloc[:, 9:].values

# Calculate cosine similarity between the first item and all other items
cosim = cosine_similarity(vector1, vectors)
cosim = pd.DataFrame(cosim)

# Convert the DataFrame into a one-dimensional array
cosim = cosim.values.flatten()

# Convert the cosine similarity result into a DataFrame
df_cosim = pd.DataFrame(cosim, columns=['COSIM'])

# Combine the TF-IDF array with the cosine similarity result
df_cosim = pd.concat([df_tfidf, df_cosim], axis=1)

In [61]:
df_cosim.sort_values(by = 'COSIM', ascending=False)

Unnamed: 0,title,author_x,publisher,book_uri,buy_links,genre,abstract,abstract_preprocessed,id,abandon,...,zinn,zionism,zombi,zone,zoo,zuccotti,zuck,zuckerberg,émigré,COSIM
0,ASSAULT ON REASON,Albert Gore,Penguin Group (USA) Incorporated,nyt://book/4f0ac908-b9cc-5be8-8e5d-26e3e9ee11dd,https://goto.applebooks.apple/9780143113621?at...,Politics & Current Events,Now With a New Preface and Final Chapter: “Pos...,new prefac final chapter posttruth donald trum...,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000
264,THE ASSAULT ON INTELLIGENCE,Michael V Hayden,Penguin Press,nyt://book/947cc090-0f61-5f06-9c75-8ae85ab956b2,https://goto.applebooks.apple/9780525558583?at...,Politics & Current Events,A blistering critique of the forces threatenin...,blister critiqu forc threaten american intelli...,265,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.126556
253,HOW DEMOCRACIES DIE,Steven Levitsky and Daniel Ziblatt,Crown,nyt://book/b4f126e3-d166-50c7-8c4f-bf49b48bd857,https://goto.applebooks.apple/9781524762933?at...,Politics & Current Events,"NEW YORK TIMES BESTSELLER • “Comprehensive, en...",comprehens enlighten terrifyingli timelyth new...,254,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.118198
231,UNDERSTANDING TRUMP,Newt Gingrich,Center Street,nyt://book/b561581a-8651-5112-85bf-1ba5e657b339,https://goto.applebooks.apple/9781478923084?at...,Politics & Current Events,Learn how Trump is making America great again ...,learn trump make america great govern media el...,232,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111212
229,DEMOCRACY,Condoleezza Rice,Twelve,nyt://book/1aec283b-6fcc-5e27-a606-bfe786e4fc41,https://goto.applebooks.apple/9781455540181?at...,Politics & Current Events,From the former secretary of state and bestsel...,former secretari state bestsel author sweep lo...,230,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.108251
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,SECRET WEAPON,Kevin D Freeman,Regnery Publishing,nyt://book/c0850855-fde5-5cea-a32f-e279142d5b18,https://goto.applebooks.apple/9781596983113?at...,Politics & Current Events,A New York Times bestseller!Who’s really to bl...,new york time bestsellerwho realli blame ameri...,117,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007745
227,OLD SCHOOL,Bill O'Reilly and Bruce Feirstein,Holt,nyt://book/2223999d-df2e-5f44-a18e-21f6e07ab3ab,https://goto.applebooks.apple/9781250135797?at...,Politics & Current Events,Old School is in session....You have probably ...,old school sessiony probabl heard term old sch...,228,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007019
126,DO NOT ASK WHAT GOOD WE DO,Robert Draper,Free Press,nyt://book/d078d6de-7f39-5897-b366-3bd4370810ef,https://goto.applebooks.apple/9781451642100?at...,Politics & Current Events,When the Tea Party Came to Town demonstrates R...,tea parti came town demonstr robert draper unc...,127,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006486
47,A SLOBBERING LOVE AFFAIR,Bernard Goldberg,Regnery,nyt://book/bf059719-0ccb-58cd-bcac-02fc86b2b2e3,https://goto.applebooks.apple/9781596980907?at...,Politics & Current Events,"In 2008, the mainstream media crossed a line. ...",mainstream media cross line old liber bia weve...,48,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006301


In [68]:
books_political[(books_political['id'] == 265) | (books_political['id'] == 1)]

Unnamed: 0,title,author,publisher,book_uri,buy_links,genre,abstract,abstract_preprocessed,id
32,ASSAULT ON REASON,Albert Gore,Penguin Group (USA) Incorporated,nyt://book/4f0ac908-b9cc-5be8-8e5d-26e3e9ee11dd,https://goto.applebooks.apple/9780143113621?at...,Politics & Current Events,Now With a New Preface and Final Chapter: “Pos...,new prefac final chapter posttruth donald trum...,1
2443,THE ASSAULT ON INTELLIGENCE,Michael V Hayden,Penguin Press,nyt://book/947cc090-0f61-5f06-9c75-8ae85ab956b2,https://goto.applebooks.apple/9780525558583?at...,Politics & Current Events,A blistering critique of the forces threatenin...,blister critiqu forc threaten american intelli...,265


In [71]:
pd.set_option('display.max_colwidth', None)

books_political.loc[books_political['id'] == 265, 'abstract']


Name: abstract, dtype: object