In [32]:
# Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import csr_matrix
import gzip
import json
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
import itertools
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from collections import Counter, defaultdict
%matplotlib inline

In [2]:
user_clustid = pd.read_csv('user_clustid_k25.csv')
ratings_books = pd.read_csv('ratings_books_u80_b10.csv')
usermap = pd.read_csv('user_id_map.csv')

In [3]:
ratings_books.head(5)

Unnamed: 0,user_id,is_read,rating,is_reviewed,book_id_gr,user_counts,book_counts,user_idx,book_idx
0,1073,1,3,1,47970,146,157,0,624
1,1073,1,4,1,304687,146,49,0,1867
2,1073,1,3,1,149267,146,1954,0,1322
3,1073,1,3,1,140098,146,136,0,1276
4,1073,1,3,1,1773616,146,71,0,3473


In [4]:
len(set(ratings_books['user_id']))

787

In [33]:
#set(ratings_books['user_id'])

In [5]:
usermap.head(4)

Unnamed: 0,user_id_csv,user_id
0,0,8842281e1d1347389f2ab93d60773d4d
1,1,72fb0d0087d28c832f15776b0d936598
2,2,ab2923b738ea3082f5f3efcbbfacb218
3,3,d986f354a045ffb91234e4af4d1b12fd


In [6]:
user_dict = usermap.set_index('user_id_csv').T.to_dict('list')
ratings_books['user_id_gr'] = ratings_books['user_id'].map(user_dict)

In [7]:
usermap[usermap['user_id_csv'] == 1272]

Unnamed: 0,user_id_csv,user_id
1272,1272,8bf745a1e2b3ec721ad079990111f114


In [8]:
ratings_books['user_id_gr'] = ratings_books['user_id_gr'].str.get(0)

In [29]:
# ratings_books[ratings_books['user_id'] == 1272]

In [9]:
ratings_books.head(5)

Unnamed: 0,user_id,is_read,rating,is_reviewed,book_id_gr,user_counts,book_counts,user_idx,book_idx,user_id_gr
0,1073,1,3,1,47970,146,157,0,624,b5f937e40b9cb0e2eecb4ba2365e2019
1,1073,1,4,1,304687,146,49,0,1867,b5f937e40b9cb0e2eecb4ba2365e2019
2,1073,1,3,1,149267,146,1954,0,1322,b5f937e40b9cb0e2eecb4ba2365e2019
3,1073,1,3,1,140098,146,136,0,1276,b5f937e40b9cb0e2eecb4ba2365e2019
4,1073,1,3,1,1773616,146,71,0,3473,b5f937e40b9cb0e2eecb4ba2365e2019


In [10]:
idx_idgr = ratings_books.drop(ratings_books.columns[[1, 2, 3, 5, 6]], axis=1)

In [11]:
idx_idgr.head(4)

Unnamed: 0,user_id,book_id_gr,user_idx,book_idx,user_id_gr
0,1073,47970,0,624,b5f937e40b9cb0e2eecb4ba2365e2019
1,1073,304687,0,1867,b5f937e40b9cb0e2eecb4ba2365e2019
2,1073,149267,0,1322,b5f937e40b9cb0e2eecb4ba2365e2019
3,1073,140098,0,1276,b5f937e40b9cb0e2eecb4ba2365e2019


In [12]:
list_user_gr = list(set(idx_idgr['user_id_gr']))

In [36]:
list_book_gr = list(set(idx_idgr['book_id_gr']))

In [37]:
len(list_book_gr)

10268

In [13]:
len(list_user_gr)

787

In [15]:
# Some useful methods related to json files

# Reading the input json file
def load_data(inputdata):
    #books = []
    with gzip.open(inputdata) as json_file:
        for line in json_file:
            book = json.loads(line)
            books.append(book)
    return books

# Extracting subset of the data
def split_data(inputdata, cutoff):
    books = []
    count = 0
    with gzip.open(inputdata) as json_file:
        for line in json_file:
            if count <= cutoff:
                book = json.loads(line)
                count += 1
                books.append(book)
    return books

# get data needed from json
def get_reviews_data(inputdata, list_users):
    reviews = []
    with gzip.open(inputdata) as json_file:
        for lines in json_file:
            review = {}
            line = json.loads(lines)
            if line['user_id'] in list_user_gr:
                review['book_id'] = line['book_id']
                review['user_id'] = line['user_id']
                review['rating'] = line['rating']
                review['review_text'] = line['review_text']
                reviews.append(review)
    return reviews



In [16]:
reviews = get_reviews_data('goodreads_reviews_dedup.json.gz', list_user_gr)

In [17]:
reviews[0]

{'book_id': '399550',
 'user_id': 'b5f937e40b9cb0e2eecb4ba2365e2019',
 'rating': 2,
 'review_text': 'I think Claremont had too big of a nerdgasm writing this ... seemed like he was trying to pull a lot of cannonical storylines together into a somewhat uneven mess.'}

In [18]:
len(reviews)

909713

In [19]:
dfreviews = pd.DataFrame(reviews)

In [21]:
dfreviews[dfreviews['user_id'] == 'b5f937e40b9cb0e2eecb4ba2365e2019'].head()

Unnamed: 0,book_id,rating,review_text,user_id
0,399550,2,I think Claremont had too big of a nerdgasm wr...,b5f937e40b9cb0e2eecb4ba2365e2019
1,30896651,3,I really think this series is the best Star Tr...,b5f937e40b9cb0e2eecb4ba2365e2019
2,32052243,3,"3 story, 5 presentation!",b5f937e40b9cb0e2eecb4ba2365e2019
3,17333340,2,I'm sure much of this volume can be written of...,b5f937e40b9cb0e2eecb4ba2365e2019
4,32761530,3,Zing-y! I think Visaggio has a way with the qu...,b5f937e40b9cb0e2eecb4ba2365e2019


In [41]:
bookssf = pd.read_csv('books-authors.csv')

In [42]:
bookssf.head(4)

Unnamed: 0,author,average_rating,book_id,count_shelves,isbn,num_pages,popular_shelves,publication_year,ratings_count,title
0,604031.0,4.0,5333265,"['3', '1', '1', '1', '1']",312853122,256.0,"['to-read', 'p', 'collection', 'w-c-fields', '...",1984.0,3.0,W.C. Fields: A Life on Film
1,626222.0,3.23,1333909,"['2634', '160', '92', '25', '22', '19', '17', ...",743509986,,"['to-read', 'fiction', 'currently-reading', 'c...",2001.0,10.0,Good Harbor
2,149918.0,3.4,287140,"['32', '3', '2', '2', '1', '1', '1', '1', '1',...",850308712,,"['to-read', 'runes', 'owned', 'nonfiction', 'k...",,15.0,Runic Astrology: Starcraft and Timekeeping in ...
3,3041852.0,4.13,287141,"['56', '10', '4', '3', '2', '2', '2', '2', '2'...",1599150603,162.0,"['to-read', 'currently-reading', 'history', 'c...",2006.0,46.0,The Aeneid for Boys and Girls


In [47]:
user_book = bookssf[bookssf['book_id'].isin(list_book_gr)]

In [48]:
len(set(test['book_id']))

10268

In [51]:
type(dfreviews.iloc[0][0])

str

In [52]:
user_book['book_id'] = user_book['book_id'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [53]:
#user_book_rev = dfreviews[dfreviews['book_id'].isin(list_book_gr)]
u_b_r = pd.merge(user_book, dfreviews, on='book_id')

In [40]:
dfreviews.head(4)

Unnamed: 0,book_id,rating,review_text,user_id
0,399550,2,I think Claremont had too big of a nerdgasm wr...,b5f937e40b9cb0e2eecb4ba2365e2019
1,30896651,3,I really think this series is the best Star Tr...,b5f937e40b9cb0e2eecb4ba2365e2019
2,32052243,3,"3 story, 5 presentation!",b5f937e40b9cb0e2eecb4ba2365e2019
3,17333340,2,I'm sure much of this volume can be written of...,b5f937e40b9cb0e2eecb4ba2365e2019


In [54]:
u_b_r.head(4)

Unnamed: 0,author,average_rating,book_id,count_shelves,isbn,num_pages,popular_shelves,publication_year,ratings_count,title,rating,review_text,user_id
0,20560.0,3.75,89583,"['550', '89', '57', '42', '31', '20', '16', '1...",380791986,544.0,"['to-read', 'science-fiction', 'sci-fi', 'fant...",1999.0,1321.0,Six Moon Dance,4,Anyone familiar with Ms. Tepper's works is fam...,ca1c301fff032671a4fd555429db2298
1,20560.0,3.75,89583,"['550', '89', '57', '42', '31', '20', '16', '1...",380791986,544.0,"['to-read', 'science-fiction', 'sci-fi', 'fant...",1999.0,1321.0,Six Moon Dance,2,When I discovered Sheri S. Tepper in high scho...,fe7d2f1ecc244521ef709c1e7e8cfadd
2,20560.0,3.75,89583,"['550', '89', '57', '42', '31', '20', '16', '1...",380791986,544.0,"['to-read', 'science-fiction', 'sci-fi', 'fant...",1999.0,1321.0,Six Moon Dance,4,"One of Tepper's better books, I think. And it ...",5c6a039f39fe09545af84d817af170c9
3,20560.0,3.75,89583,"['550', '89', '57', '42', '31', '20', '16', '1...",380791986,544.0,"['to-read', 'science-fiction', 'sci-fi', 'fant...",1999.0,1321.0,Six Moon Dance,4,"I had read this before, as I realised after th...",8de32c077954e6080a94f2e2ced6116f


In [55]:
ubr_filtered = u_b_r.drop(u_b_r.columns[[3, 4, 5, 6, 7, 8, 10]], axis=1)

In [56]:
ubr_filtered.head(4)

Unnamed: 0,author,average_rating,book_id,title,review_text,user_id
0,20560.0,3.75,89583,Six Moon Dance,Anyone familiar with Ms. Tepper's works is fam...,ca1c301fff032671a4fd555429db2298
1,20560.0,3.75,89583,Six Moon Dance,When I discovered Sheri S. Tepper in high scho...,fe7d2f1ecc244521ef709c1e7e8cfadd
2,20560.0,3.75,89583,Six Moon Dance,"One of Tepper's better books, I think. And it ...",5c6a039f39fe09545af84d817af170c9
3,20560.0,3.75,89583,Six Moon Dance,"I had read this before, as I realised after th...",8de32c077954e6080a94f2e2ced6116f


In [63]:
ubr_filtered.to_csv('ubr_filtered.csv', index=False)

In [59]:
bookssf[bookssf['book_id'] == 89583] # for testing later

Unnamed: 0,author,average_rating,book_id,count_shelves,isbn,num_pages,popular_shelves,publication_year,ratings_count,title
152,20560.0,3.75,89583,"['550', '89', '57', '42', '31', '20', '16', '1...",380791986,544.0,"['to-read', 'science-fiction', 'sci-fi', 'fant...",1999.0,1321.0,Six Moon Dance


In [60]:
ubr_filtered.shape

(66472, 6)

In [61]:
analyser = SentimentIntensityAnalyzer()

def sentiment_scores(dfin, analyser):
    user_sentiment = defaultdict(list)
    for i in range(dfin.shape[0]):
        sentiment_dict = analyser.polarity_scores(dfin.iloc[i][4])
        user_sentiment[dfin.iloc[i][5]].append(sentiment_dict['compound'])
    
    return user_sentiment
            


In [67]:
user_sentiment = sentiment_scores(ubr_filtered, analyser)

In [73]:
def get_average_sentiment(inputdict):
    outputdict = {}
    for k,v in inputdict.items():
        outputdict[k] = np.mean(v)
    
    return outputdict
    

In [74]:
user_sentiment_av = get_average_sentiment(user_sentiment)

In [72]:
np.mean(user_sentiment['ca1c301fff032671a4fd555429db2298'])

0.684157731958763

In [75]:
user_sentiment_av['ca1c301fff032671a4fd555429db2298']

0.684157731958763

In [87]:
def get_nature_avsent(inputdict):
    outputdict = {}
    for k, v in inputdict.items():
        if v >= 0.05:
            outputdict[k] = 'positive'
        elif v <= -0.05:
            outputdict[k] = 'negative'
        else:
            outputdict[k] = 'neutral'
            
    return outputdict

In [88]:
user_sent_nature = get_nature_avsent(user_sentiment_av)

In [89]:
user_sent = pd.DataFrame(zip(list(user_sent_nature.keys()), list(user_sent_nature.values())), columns=['User', 'Average sentiment'])

In [90]:
user_sent.head(4)

Unnamed: 0,User,Average sentiment
0,ca1c301fff032671a4fd555429db2298,positive
1,fe7d2f1ecc244521ef709c1e7e8cfadd,positive
2,5c6a039f39fe09545af84d817af170c9,positive
3,8de32c077954e6080a94f2e2ced6116f,positive


In [92]:
user_sent['Average sentiment'].value_counts()

positive    759
neutral      15
negative     12
Name: Average sentiment, dtype: int64