In [21]:
import csv
import re
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import os
import time
from glob import glob
from multiprocessing import Pool
from amazon.api import AmazonAPI
from secrets import AMAZON
%matplotlib inline

In [15]:
BOOK_COMMENTS_CSV_PATH = os.path.join('/Users', 'wcwagner', 'projects', 'readersindex', 
                                      'data', 'book_comments.csv')
COLUMNS = [
    'id', 'created_utc', 'subreddit', 'subreddit_id', 'author',
    'url', 'product_id', 'score', 'link_id', 'parent_id', 'body'
]

# set up Amazon Product Advertising Api object
amzn = AmazonAPI(AMAZON['PAAPI_ACCESS_KEY'], AMAZON['PAAPI_SECRET_KEY'],
                 AMAZON['ASSOCIATE_TAG'])

In [17]:
df = pd.read_csv(BOOK_COMMENTS_CSV_PATH, names=COLUMNS)
df[:5]

Unnamed: 0,id,created_utc,subreddit,subreddit_id,author,url,product_id,score,link_id,parent_id,body
0,d4c1rmg,1466106790,nsfw,t5_vf2,Delet3r,amazon.com/Free-Will-Sam-Harris/dp/1451683405,1451683405,0.0,t3_4odc4j,t1_d4c0vmr,Dude... people don't make free choices. I mean...
1,da86yyx,1479654123,de,t5_22i0,crzdr1683,amazon.com/Call-Mild-Learning-Hunt-Dinner/dp/1...,1455500747,2.0,t3_5dsj0a,t1_da83e6r,"Ich habe noch keinen Namen dafür gehört, aller..."
2,dbsmhnt,1483107238,de,t5_22i0,not_perfect_yet,amazon.com/PR-Social-History-Stuart-Ewen/dp/04...,465061796,1.0,t3_5l2y2l,t3_5l2y2l,Knigge ist interessant. Dürrenmatt finde ich l...
3,dhub3nj,1495361396,de,t5_22i0,Drenmar,amazon.com/Homo-Deus-Brief-History-Tomorrow/dp...,62464310,5.0,t3_6cft3x,t3_6cft3x,Wer das Thema in Buchform haben will kann sich...
4,daw41pz,1481101071,de,t5_22i0,everestmntntop,amazon.com/Jesus-Eyewitnesses-Gospels-Eyewitne...,310339308,-6.0,t3_5gvqfw,t1_daw3s5f,&gt; Fordert Nachweise über irgendwas\n\nStimm...


In [42]:
# Gets titles for list of isbns
def add_titles(df):
    ids = df.product_id
    titles = []
    for id_ in ids:
        try:
            prod = amzn.lookup(ItemId=id_)
        except:
            print('Couldnt get {0}'.format(id_))
            titles.append("N/A")
            continue
        titles.append(prod.title)
        time.sleep(1.5)
    df['title'] = titles
    return df

In [18]:
grouped_asin = df.groupby('product_id') \
.agg({'url': 'count', 'score': 'sum', 'id': 'first' }) \
.reset_index() \
.rename(columns={'url': 'Url Count', 'score': 'Cumulative Score', 'id': 'Sample ID'})

#### Top books by number of mentions, non-unique

In [38]:
top_by_mentions = grouped_asin.sort_values('Url Count', ascending=False)[:15]
add_titles(top_by_mentions)

Unnamed: 0,product_id,Url Count,Cumulative Score,Sample ID,title
45023,786965592,532,2114.0,d1mz6uz,Dungeons & Dragons Starter Set
12833,273785370,414,907.0,dj1bwvr,Smarter Investing (Financial Times)
1364,60881909,404,1211.0,czisesr,"Taking Charge of Your Fertility, 10th Annivers..."
19724,380810336,396,1097.0,dj57061,Feeling Good: The New Mood Therapy
5563,132350882,394,2419.0,d9ws40g,Clean Code: A Handbook of Agile Software Craft...
33684,671027034,362,1808.0,dkwfv8e,How to Win Friends & Influence People
54487,879756071,335,1065.0,d1szdhb,"Maybe Yes, Maybe No: A Guide for Young Skeptics"
41155,762415339,302,1350.0,d221z7b,No More Mr Nice Guy: A Proven Plan for Getting...
61113,982514379,288,1637.0,dirz9rd,Where's MY Book? A Guide for Transgender and G...
75190,1481227041,274,1943.0,d9lt4gp,Natural Harvest: A collection of semen-based r...


#### Top books by number of mentions, unique

In [49]:
top_by_mentions_unique = df.groupby('product_id') \
    .agg({'author': 'nunique', 'score': 'sum', 'id': 'first' }) \
    .reset_index() \
    .rename(columns={'author': 'Num unique', 'score': 'Cumulative Score', 'id': 'Sample ID'}) \
    .sort_values('Num unique', ascending=False)
add_titles(top_by_mentions_unique[:15])

Couldnt get 0870334336


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,product_id,Num unique,Cumulative Score,Sample ID,title
33684,0671027034,342,1808.0,dkwfv8e,How to Win Friends & Influence People
45023,0786965592,327,2114.0,d1mz6uz,Dungeons & Dragons Starter Set
5563,0132350882,279,2419.0,d9ws40g,Clean Code: A Handbook of Agile Software Craft...
75190,1481227041,251,1943.0,d9lt4gp,Natural Harvest: A collection of semen-based r...
19724,0380810336,249,1097.0,dj57061,Feeling Good: The New Mood Therapy
41155,0762415339,203,1350.0,d221z7b,No More Mr Nice Guy: A Proven Plan for Getting...
38093,0735611319,195,1137.0,d23n961,Code: The Hidden Language of Computer Hardware...
61117,0982522738,190,2238.0,dcq2eid,"Starting Strength: Basic Barbell Training, 3r..."
53162,0870334336,184,3335.0,ddn7pgr,
87011,1594035229,184,1092.0,d00b9l3,Three Felonies A Day: How the Feds Target the ...


In [43]:
top_by_score = grouped_asin.sort_values('Cumulative Score', ascending=False)[:15]
add_titles(top_by_score)

Couldnt get 0870334336


Unnamed: 0,product_id,Url Count,Cumulative Score,Sample ID,title
19077,0374533555,178,15598.0,d9fgd4m,"Thinking, Fast and Slow"
73990,1468579339,2,10132.0,dhv5oco,Here I Am! Who are You?: Resolving Conflicts B...
102962,193746007X,4,7721.0,dqjl3vs,Law and Government: An Introductory Study Course
7642,0143114247,6,5187.0,dio8bq4,The Stuff of Thought: Language as a Window int...
29287,0521639905,5,4614.0,crcx5bp,The Measure of Reality: Quantification and Wes...
23430,0425153436,2,4445.0,ddo2d91,Our Children Forever: George Anderson's Messag...
51075,0825307465,2,4129.0,cvfr571,You Look Like That Girl: A Child Actor Stops P...
25222,0452290082,6,3987.0,d6u9rvy,Banana: The Fate of the Fruit That Changed the...
31268,0553804723,39,3658.0,cwppajv,The Definitive Book of Body Language: The Hidd...
1431,0060919906,3,3655.0,df4z0db,"Our Kind: Who We Are, Where We Came From, Wher..."


In [56]:
len(df['product_id'].unique())

109838