In [None]:
# Necessary library importation 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from PIL import Image
import pandas as pd
from wordcloud import WordCloud
import nltk
import re
import spacy

In [None]:
import ssl

import en_core_web_sm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Load the small English spaCy pipeline a single time; loading it repeatedly
# into the same name only costs start-up time and memory.
nlp = en_core_web_sm.load()
# Second pipeline instance kept under its own name.
NER = spacy.load("en_core_web_sm")

# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()

# NOTE(review): the next line disables HTTPS certificate verification for every
# default SSL context created in this process -- presumably a workaround for
# certificate errors during nltk.download(). Confirm it is still needed; it
# weakens security for all later HTTPS requests made through the default context.
ssl._create_default_https_context = ssl._create_unverified_context
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Load the book data set from the CSV file into the working DataFrame `df`.
df = pd.read_csv("Data/data_science_book.csv")

# Data cleaning 

In [None]:
# Parse the rating strings into numeric columns for later use.
# "Rating stars": keep the first decimal number found in the text; rows without
# one become 0.
df["Rating stars"] = (
    df["Rating stars"]
    .astype(str)
    .str.extract(r"(\d+\.\d+)", expand=False)
    .astype(float)
    .fillna(0)
)
# "Rating count": strip the thousands separators, then convert to int. Values
# that are missing or not numeric become 0 instead of raising ValueError.
df["Rating count"] = (
    pd.to_numeric(
        df["Rating count"].astype(str).str.replace(",", "", regex=False),
        errors="coerce",
    )
    .fillna(0)
    .astype(int)
)

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Drop the "Unnamed: 0" column and rename the misspelled title column.
# errors="ignore" keeps this cell re-runnable: once the column has been dropped,
# running the cell again is a no-op instead of raising KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")
df = df.rename(columns={"Nme of book": "Name of book"})

In [None]:
# Remove rows that are exact duplicates of an earlier row.
df = df.drop_duplicates()

In [None]:
# Descriptive information about the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# (number of rows, number of columns) after cleaning and de-duplication.
df.shape

# Top 20 most-rated books on Amazon

In [None]:
# Keep the 20 rows with the largest "Rating count" values.
df1 = df.nlargest(20, ["Rating count"])

In [None]:
# Show the titles of the rows selected in df1 as a plain list.
list(df1["Name of book"])

The list of most-rated books contains a few titles that are not related to data science. This happened because the search keywords I used were not very specific: I simply searched for "Data science book". Using more specific keywords when searching for products (or anything else) before scraping would give more relevant results.

# The 20 most-rated books on Amazon, ordered by star rating

In [None]:
# df1 was built with nlargest(20, ...) and therefore holds at most 20 rows, so
# this call keeps every one of them and only orders them by "Rating stars",
# highest first.
df2 = df1.nlargest(20, ["Rating stars"])

In [None]:
# Show the titles of the rows in df2 as a plain list.
list(df2["Name of book"])

In [None]:
#Cleaning the scraped_text
def text_cleaning (input_text):
    """Normalise a value to lowercase text containing only letters and single spaces.

    The value is converted with ``str()``, lower-cased, every character that is
    not an ASCII letter is replaced by a space, and runs of whitespace are
    collapsed to one space. Leading/trailing spaces are not stripped.
    """
    lowered = str(input_text).lower()
    letters_only = re.sub('[^a-zA-Z]', ' ', lowered)
    return re.sub(r'\s+', ' ', letters_only)
#input the text 
# processed_text = text_cleaning (doc)

# Preparing the text
def prepare_text(processed_text):
    """Split text into sentences and tokenize each sentence into words.

    Returns a list with one list of word tokens per sentence, using NLTK's
    sentence and word tokenizers.
    """
    tokenized_sentences = []
    for sentence in nltk.sent_tokenize(processed_text):
        tokenized_sentences.append(nltk.word_tokenize(sentence))
    return tokenized_sentences

# Removing Stop Words
def remove_stopwords(words, stop_words=None):
    """Return the token lists in ``words`` with stop words filtered out.

    Parameters
    ----------
    words : list of list of str
        One list of tokens per document.
    stop_words : iterable of str, optional
        Tokens to drop. When omitted, NLTK's English stop-word list is used.

    Returns
    -------
    list of list of str
        Newly built lists, in the same order as ``words``. The input lists are
        not modified.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once: fetching the stop-word list for every single token
    # is slow, and a set gives constant-time membership tests.
    stop_set = set(stop_words)
    return [[w for w in tokens if w not in stop_set] for tokens in words]

In [None]:
# Clean every book title, tokenize it into words, then drop the stop words.
# `filterd` ends up with one list of remaining tokens per title.
processed_text = list(map(text_cleaning, df["Name of book"]))
words = list(map(nltk.word_tokenize, processed_text))
filterd = remove_stopwords(words)
def flatten(lst):
    """Concatenate the items of every sub-iterable in ``lst`` into one flat list."""
    flat = []
    for sublist in lst:
        flat.extend(sublist)
    return flat
# One flat list holding the remaining tokens of all titles.
flattened_lst = flatten(filterd)

In [None]:
# Generate a word cloud from the book-title tokens.
# WordCloud takes a single string, so join all tokens with spaces first.
listToStr = ' '.join(str(token) for token in flattened_lst)
cloud_builder = WordCloud(width=1600, height=800, background_color="white")
wordcloud = cloud_builder.generate(listToStr)

# Draw the cloud without axes and without padding around the image.
plt.figure(figsize=(20, 10), facecolor='k')
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

# TF-IDF

TF-IDF (Term Frequency-Inverse Document Frequency) is a way of measuring how relevant a word is to a document in a collection of documents.

1. Term Frequency (TF): how many times a word appears in a document.
2. Inverse Document Frequency (IDF): the inverse document frequency of the word across a collection of documents. Rare words have high scores, common words have low scores.

* Use cases of TF-IDF: TF-IDF has many uses, such as information retrieval, text analysis, keyword extraction, and obtaining numeric features from text for machine learning algorithms.

In [None]:
# NOTE(review): this import rebinds the name `Image`, which the first cell
# already imported from PIL. From this cell on, `Image` refers to IPython's
# display class -- confirm nothing later expects PIL's `Image` module.
from IPython.display import Image
# Display the image stored at Img/tfidf.png as the cell output.
Image("Img/tfidf.png")

# One word level Term frequency

In [None]:
#function to calculate the term frequency in the name of the book 
def term_frequency_calculator(lst_words):
    """Compute the relative frequency of every distinct item in ``lst_words``.

    Returns a list of ``(item, count / total_count)`` tuples in order of first
    appearance; an empty input yields an empty list.
    """
    counts = {}
    for token in lst_words:
        counts[token] = counts.get(token, 0) + 1
    # Total number of counted items, used to normalise each count.
    total = sum(counts.values())
    return [(token, count / total) for token, count in counts.items()]

In [None]:
# List of (word, relative frequency) tuples for all tokens in the book titles.
lst_tuples = term_frequency_calculator(flattened_lst)
# Create a dataframe from the list of tuples, one row per distinct word.
tf = pd.DataFrame(lst_tuples, columns = ["Words", "Frequency"])
#tf.head()

In [None]:
# Bar chart of the 20 words with the highest term frequency, largest first.
tf_frequnt = tf.sort_values(by=['Frequency'], ascending=False).head(20)
fig = plt.figure(figsize=(12, 8))
sns.barplot(x="Frequency", y="Words", data=tf_frequnt, color="b")

# Ngram level TFIDF (Tri-gram)

In [None]:
# Re-join each title's remaining tokens into one string, the input format the
# scikit-learn vectorizers expect (one string per document).
text = [' '.join(str(token) for token in tokens) for tokens in filterd]

In [None]:
# Ngrams: count every trigram (three consecutive tokens) in each cleaned title.
vectorizer = CountVectorizer(ngram_range = (3,3))
X1 = vectorizer.fit_transform(text)
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement when
# available and fall back to the old method on older installations. list(...)
# keeps `features` a plain list, as before.
if hasattr(vectorizer, "get_feature_names_out"):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()
#print("\n\nFeatures : \n", features)
#print("\n\nX1 : \n", X1.toarray())

# Calculate TF-IDF scores using the TfidfVectorizer class from sklearn

In [None]:
# TF-IDF vectorizer restricted to trigrams.
vect = TfidfVectorizer(ngram_range = (3,3))
# Fit the model on the cleaned titles; X has one row per title and one column
# per trigram.
X = vect.fit_transform(text)
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement when
# available and fall back to the old method on older installations.
if hasattr(vect, "get_feature_names_out"):
    tfidf_terms = list(vect.get_feature_names_out())
else:
    tfidf_terms = vect.get_feature_names()
# Sum each trigram's TF-IDF weight over all titles and round to one decimal.
tfidf_totals = np.asarray(X.sum(axis=0)).ravel()
features_rank = [
    (term, round(float(total), 1)) for term, total in zip(tfidf_terms, tfidf_totals)
]
# Store the result under its own names. Reusing `df` and `tf` here would
# overwrite the book table and the unigram frequency table, so earlier cells
# could no longer be re-run after this one.
tfidf_scores = pd.DataFrame(features_rank, columns =['Words', 'Frequency'])
# Sort ascending and keep the last 20 rows, i.e. the 20 highest-scoring trigrams.
tfidf_top = tfidf_scores.sort_values(by=['Frequency'], ascending=True).tail(20)
fig = plt.figure(figsize =(12,8))
# Matplotlib only accepts lowercase single-letter colour codes, so use "r"
# rather than "R".
sns.barplot(x="Frequency", y="Words", data=tfidf_top,
            label="Total", color="r")