In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import json
import pickle
import wordcloud
import re
from dateutil.parser import parse

In [None]:
# Mount Google Drive in the Colab runtime; the data files read and written
# below are all addressed under '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


### **Book info from Goodreads**

In [None]:
# Every line of the Goodreads dump is parsed as its own JSON document.
# Reading inside a context manager closes the file handle as soon as the
# records are loaded (the bare open() in a for-loop never closed it).
with open('/content/drive/My Drive/book_reviews/goodreads_books_fantasy_paranormal.json', 'r') as books_file:
  book_info = [json.loads(line) for line in books_file]

# One row per parsed record, one column per JSON key.
book_info_df = pd.DataFrame.from_dict(book_info, orient='columns')



## Load the pickled book ids to keep (english_filtered_book_ids)
# NOTE: pickle.load can execute arbitrary code while loading; only unpickle
# files that you created yourself or otherwise trust.
with open("/content/drive/My Drive/english_filtered_book_ids.txt", "rb") as fp:
  english_filtered_book_ids = pickle.load(fp)

# Keep only the books whose book_id is in the loaded ids. The explicit .copy()
# makes the filtered result an independent frame, so the column assignment
# below does not write into a slice (avoids pandas' SettingWithCopyWarning).
book_info_df = book_info_df.loc[book_info_df.book_id.isin(english_filtered_book_ids)].copy()
del book_info  # the raw record list is no longer needed once the frame exists
book_info_df["language_code"] = "english_merged"

### **Author info from Goodreads**

In [None]:
# Every line of the author dump is parsed as its own JSON document. The
# context manager closes the file handle once all records are read.
with open('/content/drive/My Drive/book_reviews/goodreads_book_authors.json', 'r') as authors_file:
  author_records = [json.loads(line) for line in authors_file]

# The raw records get their own name so `good_reads_authors` only ever refers
# to the DataFrame.
good_reads_authors = pd.DataFrame.from_dict(author_records, orient='columns')

In [None]:
def _first_author_id(authors):
  """Return the author_id of the first entry in `authors`, or "" when it is empty."""
  if len(authors) == 0:
    return ""
  return authors[0]["author_id"]

book_info_df["author_id"] = book_info_df["authors"].map(_first_author_id)

# Show the rows that ended up with the empty-string placeholder as author_id
book_info_df[book_info_df["author_id"] == ""]

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,similar_books,description,format,link,authors,publisher,num_pages,publication_day,isbn13,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series,author_id
72992,,1,[659982],US,english_merged,"[{'count': '41', 'name': 'to-read'}, {'count':...",,True,3.89,,[],,,https://www.goodreads.com/book/show/7520314-th...,[],,,,2940000088760.0,,,,https://www.goodreads.com/book/show/7520314-th...,https://s.gr-assets.com/assets/nophoto/book/11...,7520314,1,1025915,The Werewolf of Plonkert,The Werewolf of Plonkert,
211618,,1,[],US,english_merged,"[{'count': '17', 'name': 'to-read'}, {'count':...",B004UKF2MM,True,3.04,B004UKF2MM,"[22609311, 856232, 18815796, 7861365, 18404125...",,,https://www.goodreads.com/book/show/11696786-t...,[],,,,,,,,https://www.goodreads.com/book/show/11696786-t...,https://s.gr-assets.com/assets/nophoto/book/11...,11696786,76,6450316,The Best Ghost Stories,The Best Ghost Stories,


In [None]:
## Attach the author fields to every book (left join on author_id), then
## expose the author table's `name` column as `author_name`.
book_info_df = (
    book_info_df
    .merge(good_reads_authors, how="left", on="author_id")
    .rename(columns={"name": "author_name"})
)

## Saving the book_info with author name

In [None]:
# Checkpoint the merged book + author frame to Drive as a pickle file.
book_info_df.to_pickle("/content/drive/My Drive/book_reviews/book_info_with_authors_df.pkl")

## **Extracting Amazon book info**

In [None]:
# Every line of the Amazon metadata file is parsed as its own JSON document.
# The context manager closes the file handle once all records are read.
with open('/content/drive/My Drive/amazon_reviews/meta_Books.json', 'r') as amazon_file:
  amazon_books = [json.loads(line) for line in amazon_file]

# One row per parsed record, one column per JSON key.
amazon_books_df = pd.DataFrame.from_dict(amazon_books, orient='columns')

## **Extracting Goodreads book info**

In [None]:
# Reload the book + author frame from the pickle checkpoint (the same path
# that the to_pickle call earlier in the notebook writes to).
book_info_df = pd.read_pickle("/content/drive/My Drive/book_reviews/book_info_with_authors_df.pkl")

In [None]:
## Prefix every Amazon column with "amazon_" and every Goodreads column with
## "goodreads_" so the source of each column stays obvious after the merge.
amazon_books_df.columns = ["amazon_" + name for name in amazon_books_df.columns.values]

book_info_df.columns = ["goodreads_" + name for name in book_info_df.columns.values]

In [None]:
# Lower-cased copies of both title columns; these are the join keys of the
# title merge further down.
book_info_df["goodreads_title_lower"] = book_info_df["goodreads_title"].map(lambda title: title.lower())
amazon_books_df["amazon_title_lower"] = amazon_books_df["amazon_title"].map(lambda title: title.lower())


def convert_lower_case(x):
  """Return `x.lower()` when `x` has a `lower` method, otherwise `x` unchanged.

  Values without a `.lower()` method (for example None, numbers, or a float
  NaN) are passed through untouched.
  """
  try:
    return x.lower()
  except AttributeError:
    # Only "no .lower() method" is expected here. A bare `except:` would also
    # swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
    return x

# Lower-cased author / brand columns; convert_lower_case leaves values that
# have no .lower() method as they are.
book_info_df["goodreads_author_name_lower"] = book_info_df["goodreads_author_name"].map(convert_lower_case)
amazon_books_df["amazon_brand_lower"] = amazon_books_df["amazon_brand"].map(convert_lower_case)

In [None]:
# Pair Goodreads and Amazon records that share the same lower-cased title.
# The `goodreads_title_lower` / `amazon_title_lower` key columns are already
# built in the previous cell, so they are not recomputed here.
common_df = pd.merge(
    book_info_df,
    amazon_books_df,
    left_on="goodreads_title_lower",
    right_on="amazon_title_lower",
    how="inner",
)

In [None]:
# Peek at 5 random title matches whose Goodreads author name is not equal to
# the Amazon brand.
author_differs = common_df["goodreads_author_name"] != common_df["amazon_brand"]
preview_columns = ["goodreads_author_name", "amazon_brand", "goodreads_title", "amazon_title"]
common_df.loc[author_differs, preview_columns].sample(5)

Unnamed: 0,goodreads_author_name,amazon_brand,goodreads_title,amazon_title
54473,Sierra Woods,Visit Amazon's Jack O'Connell Page,The Resurrectionist,The Resurrectionist
33754,Chanda Hahn,Visit Amazon's Lauren Burd Page,Forever,Forever
23865,G. Willow Wilson,Visit Amazon's G. Willow Wilson Page,Alif the Unseen,Alif the Unseen
90169,Pierdomenico Baccalario,Visit Amazon's Lawrence Blair Page,Ring of Fire,Ring of Fire
113763,Kim Fielding,John Wardlaw,Phoenix,Phoenix


In [None]:
def check_match(x):
  """Check whether the Goodreads author name occurs inside the Amazon brand.

  Args:
    x: A row exposing `goodreads_author_name` and `amazon_brand` attributes.

  Returns:
    True when the author name is contained in the brand (case-sensitive
    substring test for strings), False when it is not, and the string "NAN"
    when the two values cannot be compared with `in` (TypeError, e.g. one of
    them is a float NaN).
  """
  try:
    return x.goodreads_author_name in x.amazon_brand
  except TypeError:
    # Narrowed from a bare `except:` so that genuine problems (a missing
    # column, an interrupt) surface instead of being reported as "NAN".
    return "NAN"

# Evaluate check_match on every row (axis=1); the new column holds True,
# False, or the string "NAN" that check_match returns on failure.
common_df["authors_match"] = common_df.apply(check_match, axis=1)

In [None]:
# Author-matched rows of the merged frame as a percentage of the Goodreads rows.
# NOTE(review): the numerator counts rows of `common_df`; a title shared by
# several Amazon listings yields several merged rows, so this may overcount
# relative to the number of Goodreads books — confirm this ratio is intended.
matched_row_count = int((common_df["authors_match"] == True).sum())
goodreads_row_count = book_info_df.shape[0]
print ("% of match {}".format(matched_row_count / goodreads_row_count * 100))

% of match 16.107545000488756


In [None]:
# Display the Goodreads author name next to the Amazon brand for the
# title-matched rows.
common_df[["goodreads_author_name", "amazon_brand"]]

Unnamed: 0,goodreads_author_name,amazon_brand
0,Lindsey Schussman,Visit Amazon's Lindsey Schussman Page
1,Andrzej Sapkowski,Visit Amazon's Nathan M. Greenfield Page
2,Andrzej Sapkowski,Visit Amazon's Frank Collins Page
3,Andrzej Sapkowski,Visit Amazon's Linda Grant De Pauw Page
4,Andrzej Sapkowski,Visit Amazon's Nathan M. Greenfield Page
...,...,...
131107,Jessica Day George,Visit Amazon's Jessica Day George Page
131108,Jessica Day George,Visit Amazon's Jessica Day George Page
131109,S.J. West,Visit Amazon's S.J. West Page
131110,David Borgenicht,Visit Amazon's David Borgenicht Page


### Saving common Amazon and Goodreads book info

In [None]:
# Goodreads rows whose author name and lower-cased title both occur in the
# title-matched frame (the two membership tests are applied independently).
author_in_common = book_info_df.goodreads_author_name.isin(common_df.goodreads_author_name)
title_in_common = book_info_df.goodreads_title_lower.isin(common_df.goodreads_title_lower)
book_info_with_authors_df_common = book_info_df.loc[author_in_common & title_in_common]

# Save the frame built above. The previous code called to_pickle on
# `book_info_df_filtered`, a name that is not defined in the visible notebook
# and therefore raises NameError on a fresh kernel.
book_info_with_authors_df_common.to_pickle("/content/drive/My Drive/book_reviews/book_info_with_authors_df_common.pkl")

In [None]:
# Amazon rows whose brand and title both occur in the title-matched frame
# (the two membership tests are applied independently), saved as a pickle.
brand_in_common = amazon_books_df["amazon_brand"].isin(common_df["amazon_brand"])
title_in_common = amazon_books_df["amazon_title"].isin(common_df["amazon_title"])
amazon_books_df_common = amazon_books_df.loc[brand_in_common & title_in_common]
amazon_books_df_common.to_pickle("/content/drive/My Drive/amazon_reviews/amazon_books_df_common.pkl")