# Install packages

In [3]:
# Install your required packages here
!pip install --user pandas numpy matplotlib sklearn



In [4]:
# Mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import numpy as np
import pandas as pd
import sklearn
import gzip
import json
from tqdm import tqdm
import os
from collections import Counter
from datetime import datetime
import math
tqdm.pandas() #for progres_apply etc.

In [6]:
#read file line-by-line and parse json, returns dataframe
def parse_json(filename_gzipped_python_json, read_max=-1):
  #read gzipped content
  f=gzip.open(filename_gzipped_python_json,'r')
  
  #parse json
  parse_data = []
  for line in tqdm(f): #tqdm is for showing progress bar, always good when processing large amounts of data
    line = line.decode('utf-8')
    line = line.replace('true','True') #difference json/python
    line = line.replace('false','False')
    parsed_result = eval(line) #load python nested datastructure
    parse_data.append(parsed_result)
    if read_max !=-1 and len(parse_data) > read_max:
      print(f'Break reading after {read_max} records')
      break
  print(f"Reading {len(parse_data)} rows.")

  #create dataframe
  df= pd.DataFrame.from_dict(parse_data)
  return df


# 1. Load Goodreads data

In [7]:
goodreads_path = '/content/drive/MyDrive/AI Project/datasets/Goodreads/'
books = 'goodreads_books_comics_graphic.json.gz'
interactions = 'goodreads_interactions_comics_graphic.json.gz'
reviews = 'goodreads_reviews_comics_graphic.json.gz'

In [8]:
for dataset in [books, interactions, reviews]:
  print(f"----- {dataset}-----")
  size = os.path.getsize(goodreads_path + dataset) 
  print(f'Size of file is {size / 1000000}MB')
  #df = pd.read_json(goodreads_path + dataset, lines=True, nrows=1000)
  df = parse_json(goodreads_path + dataset, read_max=100000)
  pd.set_option('display.max_colwidth', None)
  display(df.head(5))

----- goodreads_books_comics_graphic.json.gz-----
Size of file is 68.39151MB


89411it [01:03, 1416.75it/s]


Reading 89411 rows.


Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,similar_books,description,format,link,authors,publisher,num_pages,publication_day,isbn13,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
0,,1,[],US,,"[{'count': '228', 'name': 'to-read'}, {'count': '2', 'name': 'graphic-novels'}, {'count': '1', 'name': 'ff-re-2011-till-2015'}, {'count': '1', 'name': 'calibre-list'}, {'count': '1', 'name': 'linseyschussan'}, {'count': '1', 'name': '1-person-narrative'}, {'count': '1', 'name': 'lgbtq-ya'}, {'count': '1', 'name': 'watchlist'}, {'count': '1', 'name': 'next-to-read'}, {'count': '1', 'name': 'sf'}, {'count': '1', 'name': 'sachiko'}, {'count': '1', 'name': 'giveaway-add'}, {'count': '1', 'name': 'friends-in-mind'}, {'count': '1', 'name': 'free-to-read-or-preview-on-goodread'}, {'count': '1', 'name': 'fantasy'}, {'count': '1', 'name': 'dystopian'}, {'count': '1', 'name': 'ck-library'}, {'count': '1', 'name': '23089-ya-fantasy-sf-w-major-lgbt'}]",B00NLXQ534,True,4.12,,"[25653153, 25699172, 23530486, 12984185, 25538377, 23525552, 18215952, 21412122, 25758901]","Lillian Ann Cross is forced to live the worst nightmare of her life. She is an everyday middle class American, striving to survive in an everyday changing world. Her life was abruptly\nturned upsidedown forever as she was kidnapped and forced into a world called ""Hen Fighting.""\nA world in which women fight and bets are made upon their bloodshed.Lillian is forced to comply due to the threats made upon her mother's life. Being a loving person her whole life, Lillian finds difficulty grasping her new functions. As she is conditioned to live in her new world, she is subjected to an experimental procedure. A procedure which has taken the lives of a few before her. As she survives, she now has to learn how to live with her new ""implants."" Implants which strengthen her bones, giving her strength and an upper ability amongst others. Implants which require weekly sustenance, or she will die.",,https://www.goodreads.com/book/show/25742454-the-switchblade-mamma,"[{'author_id': '8551671', 'role': ''}]",,,,,,,,https://www.goodreads.com/book/show/25742454-the-switchblade-mamma,https://s.gr-assets.com/assets/nophoto/book/111x148-bcc042a9c91a29c1d680899eff700a03.png,25742454,1,42749946,The Switchblade Mamma,The Switchblade Mamma
1,2205073346.0,2,[],US,fre,"[{'count': '2', 'name': 'bd'}, {'count': '2', 'name': 'to-read'}, {'count': '1', 'name': 'french-author'}, {'count': '1', 'name': 'female-author'}, {'count': '1', 'name': 'mars-2017'}, {'count': '1', 'name': 'animals'}, {'count': '1', 'name': 'non-fiction'}, {'count': '1', 'name': 'autobiographical'}, {'count': '1', 'name': 'graphique-bd'}, {'count': '1', 'name': 'graphic-novel'}, {'count': '1', 'name': 'roman-graphique'}, {'count': '1', 'name': 'écrivaines'}]",,False,3.94,,[],"Florence Dupre Latour raconte comment, de son enfance jusqu'a la fin de son adolescence, elle a torture, mutile, tue les petits animaux de compagnie qui lui passaient entre les mains. Version trash des Malheurs de Sophie, ce recit est stupefiant, singulier et plein d'humour. L'auteure est cruelle mais nous renvoie a une verite universelle : un bambin qui joue, c'est aussi un redoutable predateur, un Attila ivre de conquetes et de pouvoir, un savant fou pret a toutes les experiences...",,https://www.goodreads.com/book/show/30128855-cruelle,"[{'author_id': '3274315', 'role': ''}]",Dargaud,,22.0,,1.0,,2016.0,https://www.goodreads.com/book/show/30128855-cruelle,https://images.gr-assets.com/books/1462644346m/30128855.jpg,30128855,16,50558228,Cruelle,Cruelle
2,,5,"[246830, 362583, 362581, 623032]",US,eng,"[{'count': '493', 'name': 'to-read'}, {'count': '113', 'name': 'graphic-novels'}, {'count': '102', 'name': 'comics'}, {'count': '97', 'name': 'marvel'}, {'count': '36', 'name': 'captain-america'}, {'count': '35', 'name': 'graphic-novel'}, {'count': '32', 'name': 'comic-books'}, {'count': '31', 'name': 'currently-reading'}, {'count': '23', 'name': 'superheroes'}, {'count': '22', 'name': 'favorites'}, {'count': '20', 'name': 'marvel-comics'}, {'count': '20', 'name': 'comics-graphic-novels'}, {'count': '19', 'name': 'superhero'}, {'count': '18', 'name': 'comic'}, {'count': '17', 'name': 'fiction'}, {'count': '12', 'name': 'graphic-novels-comics'}, {'count': '11', 'name': 'owned'}, {'count': '10', 'name': 'comics-and-graphic-novels'}, {'count': '10', 'name': 'comics-read'}, {'count': '7', 'name': 'read-in-2014'}, {'count': '5', 'name': 'comics-marvel'}, {'count': '5', 'name': 'read-in-2015'}, {'count': '5', 'name': 'marvel-unlimited'}, {'count': '5', 'name': 'ebook'}, {'count': '5', 'name': 'read-comics'}, {'count': '5', 'name': 'super-hero'}, {'count': '4', 'name': 'comic-book'}, {'count': '4', 'name': 'comixology'}, {'count': '4', 'name': 'books-i-own'}, {'count': '4', 'name': 'graphic-novels-and-comics'}, {'count': '4', 'name': 'graphic'}, {'count': '4', 'name': 'espionage'}, {'count': '3', 'name': 'comics-gn'}, {'count': '3', 'name': 'graphic-novel-comics'}, {'count': '3', 'name': 'series'}, {'count': '3', 'name': 'sci-fi'}, {'count': '3', 'name': 'universo-marvel'}, {'count': '3', 'name': 'comics-graphicnovels'}, {'count': '3', 'name': 'comics-manga-graphic-novels'}, {'count': '3', 'name': 'manga-comics'}, {'count': '3', 'name': 'e-books'}, {'count': '3', 'name': 'e-book'}, {'count': '3', 'name': 'digital'}, {'count': '3', 'name': 'my-books'}, {'count': '3', 'name': 'library-books'}, {'count': '3', 'name': 'comics-to-read'}, {'count': '3', 'name': 'brubaker'}, {'count': '3', 'name': 'english'}, {'count': '2', 'name': 'unfinished'}, {'count': '2', 'name': 'default'}, {'count': '2', 'name': 'kindle'}, {'count': '2', 'name': 'quadrinhos'}, {'count': '2', 'name': '4-stars'}, {'count': '2', 'name': 'graphics'}, {'count': '2', 'name': 'hq'}, {'count': '2', 'name': 'read-in-2016'}, {'count': '2', 'name': 'to-buy'}, {'count': '2', 'name': 'comicbooks'}, {'count': '2', 'name': 'on-my-bookshelf'}, {'count': '2', 'name': 'graphic-novels-read'}, {'count': '2', 'name': 'science-fiction'}, {'count': '2', 'name': 'comics-manga'}, {'count': '2', 'name': 'comic-americano'}, {'count': '2', 'name': 'ed-arg'}, {'count': '2', 'name': 'read-2015'}, {'count': '2', 'name': 'winter-soldier'}, {'count': '2', 'name': 'fumetti'}, {'count': '2', 'name': 'usa'}, {'count': '2', 'name': 'owned-books'}, {'count': '2', 'name': 'comic-books-and-graphic-novels'}, {'count': '2', 'name': '2014-reads'}, {'count': '2', 'name': 'comics-graphic-novels-manga'}, {'count': '2', 'name': 'adventure'}, {'count': '2', 'name': 'male-authors'}, {'count': '2', 'name': 'favorite-books'}, {'count': '2', 'name': 'fantasy'}, {'count': '2', 'name': 'superhero-comics'}, {'count': '2', 'name': 'graphic_novels'}, {'count': '2', 'name': 'tyler'}, {'count': '2', 'name': 'marvel-graphic-novels'}, {'count': '2', 'name': 'sequential-art'}, {'count': '2', 'name': 'series-marvel'}, {'count': '2', 'name': 'borrowed'}, {'count': '1', 'name': 'komiks'}, {'count': '1', 'name': 'srpski'}, {'count': '1', 'name': 'stripovi'}, {'count': '1', 'name': 'kindle-unlimited'}, {'count': '1', 'name': 'organize-in-excel'}, {'count': '1', 'name': '12-modificaçoes-a-fazer'}, {'count': '1', 'name': 'marvel-collection'}, {'count': '1', 'name': 'graphic-read'}, {'count': '1', 'name': '2017-reading-challenge'}, {'count': '1', 'name': 'all-things-marvel-related'}, {'count': '1', 'name': 'marvel-reading-order'}, {'count': '1', 'name': 'books-added-in-2012'}, {'count': '1', 'name': 'stark-spangled-banner'}, {'count': '1', 'name': 'saw-the-movie'}, {'count': '1', 'name': 'read-in-august-2017'}, {'count': '1', 'name': 'read-in-2017'}, {'count': '1', 'name': 'one-day-reads'}]",,False,4.28,,"[13590139, 105963, 207585, 10503130, 4645370, 3995495, 17277814, 9293295, 1443066, 1014569, 670341, 43739, 331205, 844355, 2911494]","The questions plaguing Captain America's dreams and memories have been answered in the most brutal way possible. And in the wake of this brutality, General Lukin makes his first all-out assault - tearing open old wounds and threatening to make new scars that will never heal!",Hardcover,https://www.goodreads.com/book/show/13571772-captain-america,"[{'author_id': '37450', 'role': ''}]",Hachette Partworks Ltd.,146.0,,,,,2012.0,https://www.goodreads.com/book/show/13571772-captain-america,https://images.gr-assets.com/books/1333287305m/13571772.jpg,13571772,51,102217,"Captain America: Winter Soldier (The Ultimate Graphic Novels Collection: Publication Order, #7)","Captain America: Winter Soldier (The Ultimate Graphic Novels Collection: Publication Order, #7)"
3,,1,[],US,eng,"[{'count': '222', 'name': 'to-read'}, {'count': '9', 'name': 'currently-reading'}, {'count': '3', 'name': 'military'}, {'count': '3', 'name': 'war'}, {'count': '2', 'name': 'memoirs'}, {'count': '2', 'name': 'biographical'}, {'count': '1', 'name': 'special-forces'}, {'count': '1', 'name': 'modern-war'}, {'count': '1', 'name': 'military-history'}, {'count': '1', 'name': 'history'}, {'count': '1', 'name': 'general-history'}, {'count': '1', 'name': 'finished-in-2017'}, {'count': '1', 'name': 'american-history'}, {'count': '1', 'name': 'special-operations'}, {'count': '1', 'name': 'military-biography'}, {'count': '1', 'name': 'library'}, {'count': '1', 'name': 'iraq-war'}, {'count': '1', 'name': 'netgalley'}, {'count': '1', 'name': 'nonfiction'}, {'count': '1', 'name': 'biography-memoir'}, {'count': '1', 'name': 'owned'}, {'count': '1', 'name': 'giveaways'}, {'count': '1', 'name': 'favorites'}, {'count': '1', 'name': 'war-books'}, {'count': '1', 'name': 'autobiographical'}, {'count': '1', 'name': 'memoir'}]",B06XKGGSB7,True,4.05,B06XKGGSB7,[],"The fight for Jason Delgado's life and soul began when he was just a boy. He ultimately escaped the death and drugs of a crime-riddled Bronx by way of the United States Marine Corps. However, after earning his way into the esteemed ranks of the service's famed Scout Snipers, Delgado saw that old struggle reignited when he was dumped into the hell of war in Iraq.\nThere Delgado proved not only a participant, but a warrior capable of turning the tide in several of the most harrowing and historically important battles of the evolving war. He took all the hard lessons learned in combat and, as MARSOC's original lead sniper instructor, made himself a pivotal figure in revolutionizing the way special operations snipers trained and operated. But even after accomplishing his mission in the military, Delgado still faced that original fight, struggling to understand and accept the man his experiences had transformed him into.",,https://www.goodreads.com/book/show/35452242-bounty-hunter-4-3,"[{'author_id': '16209952', 'role': ''}, {'author_id': '853385', 'role': ''}]",,,,,,,,https://www.goodreads.com/book/show/35452242-bounty-hunter-4-3,https://s.gr-assets.com/assets/nophoto/book/111x148-bcc042a9c91a29c1d680899eff700a03.png,35452242,6,54276229,Bounty Hunter 4/3: My Life in Combat from Marine Scout Sniper to MARSOC,Bounty Hunter 4/3: My Life in Combat from Marine Scout Sniper to MARSOC
4,930289765.0,6,"[266759, 1096220]",US,en-US,"[{'count': '20', 'name': 'to-read'}, {'count': '8', 'name': 'comics'}, {'count': '4', 'name': 'graphic-novel'}, {'count': '3', 'name': 'superhero'}, {'count': '3', 'name': 'superman'}, {'count': '3', 'name': 'graphic-novels'}, {'count': '2', 'name': 'dc-archives'}, {'count': '1', 'name': 'dc-publishing'}, {'count': '1', 'name': 'archive-edition'}, {'count': '1', 'name': '20th-century'}, {'count': '1', 'name': 'to-get-from-library'}, {'count': '1', 'name': 'science-fiction'}, {'count': '1', 'name': 'dc-comics'}, {'count': '1', 'name': 'series'}, {'count': '1', 'name': 'illustrated'}, {'count': '1', 'name': 'comic-book'}, {'count': '1', 'name': 'read-in-2015'}, {'count': '1', 'name': 'graphical-novel'}, {'count': '1', 'name': 'comics-graphic-novels'}, {'count': '1', 'name': 'read-2013'}, {'count': '1', 'name': 'it-wikipedia'}, {'count': '1', 'name': 'comics-dc-archives'}, {'count': '1', 'name': 'comics-and-comix'}, {'count': '1', 'name': 'superheroes'}, {'count': '1', 'name': 'all'}, {'count': '1', 'name': 'shared-universes'}, {'count': '1', 'name': 'returned-library'}, {'count': '1', 'name': 'cartoons'}]",,False,4.06,,[],"These are the stories that catapulted Superman into the spotlight as one of the world's premier heroes of fiction. These volumes feature his earliest adventures, when the full extent of his powers was still developing and his foes were often bank robbers and crooked politicians.",Hardcover,https://www.goodreads.com/book/show/707611.Superman_Archives_Vol_2,"[{'author_id': '81563', 'role': ''}, {'author_id': '89537', 'role': 'Illustrator'}]",DC Comics,272.0,14.0,9780930289768.0,11.0,,1997.0,https://www.goodreads.com/book/show/707611.Superman_Archives_Vol_2,https://images.gr-assets.com/books/1307838888m/707611.jpg,707611,51,693886,"Superman Archives, Vol. 2","Superman Archives, Vol. 2"


----- goodreads_interactions_comics_graphic.json.gz-----
Size of file is 386.854954MB


100000it [00:06, 15327.22it/s]


Break reading after 100000 records
Reading 100001 rows.


Unnamed: 0,user_id,book_id,review_id,is_read,rating,review_text_incomplete,date_added,date_updated,read_at,started_at
0,8842281e1d1347389f2ab93d60773d4d,836610,6b4db26aafeaf0da77c7de6214331e1e,False,0,,Mon Aug 21 12:11:00 -0700 2017,Mon Aug 21 12:11:00 -0700 2017,,
1,8842281e1d1347389f2ab93d60773d4d,7648967,99b27059f711c37de8f90ee8e4dc0d1b,False,0,,Fri Feb 24 08:59:44 -0800 2017,Fri Feb 24 08:59:44 -0800 2017,,
2,8842281e1d1347389f2ab93d60773d4d,15704307,cb944d94854df5afd22210bb0aa0c903,False,0,,Wed May 20 21:28:56 -0700 2015,Wed May 20 21:28:57 -0700 2015,,
3,8842281e1d1347389f2ab93d60773d4d,6902644,2711bac2a8cc600dae1590a6ca0edb34,False,0,,Sun Jun 01 17:25:23 -0700 2014,Sun Jun 01 17:25:23 -0700 2014,,
4,8842281e1d1347389f2ab93d60773d4d,9844623,b72979076d1cded25dded922195e5b1c,False,0,,Sun Sep 02 08:45:08 -0700 2012,Sun Sep 02 08:45:08 -0700 2012,,


----- goodreads_reviews_comics_graphic.json.gz-----
Size of file is 146.582543MB


100000it [00:07, 13815.90it/s]


Break reading after 100000 records
Reading 100001 rows.


Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,dc3763cdb9b2cae805882878eebb6a32,18471619,66b2ba840f9bd36d6d27f46136fe4772,3,"Sherlock Holmes and the Vampires of London \n Release Date: April 2014 \n Publisher: Darkhorse Comics \n Story by: Sylvain Cordurie \n Art by: Laci \n Colors by: Axel Gonzabo \n Cover by: Jean Sebastien Rossbach \n ISDN: 9781616552664 \n MSRP: $17.99 Hardcover \n ""Sherlock Holmes died fighting Professor Moriarty in the Reichenbach Falls. \n At least, that's what the press claims. \n However, Holmes is alive and well and taking advantage of his presumed death to travel the globe. \n Unfortunately, Holmes's plans are thwarted when a plague of vampirism haunts Britain. \n This book collects Sherlock Holmes and the Vampires of London Volumes 1 and 2, originally created by French publisher Soleil."" - Darkhorse Comics \n When I received this copy of ""Sherlock Holmes and the Vampires of London"" I was Ecstatic! The cover art was awesome and it was about two of my favorite things, Sherlock Holmes and Vampires. I couldn't wait to dive into this! \n Unfortunately, that is where my excitement ended. The story takes place a month after Sherlock Holmes supposed death in his battle with Professor Moriarty. Sherlock's plan to stay hidden and out of site are ruined when on a trip with his brother Mycroft, they stumble on the presence of vampires. That is about as much of Sherlock's character that comes through the book. I can't even tell you the story really because nothing and I mean nothing stuck with me after reading it. I never, ever got the sense of Sherlock Holmes anywhere in this graphic novel, nor any real sense of mystery or crime. It was just Sherlock somehow battling vampires that should have had absolutely no trouble snuffing him out in a fight, but somehow always surviving and holding his own against supernatural, super powerful, blazingly fast creatures. \n The cover art is awesome and it truly made me excited to read this but everything else feel completely flat for me. I tried telling myself that ""it's a graphic novel, it would be hard to translate mystery, details, emotion"" but then I remembered reading DC Comic's ""Identity Crisis"" and realized that was a load of crap. I know it's unfair to compare the two as ""Identity Crisis"" had popular mystery author Brad Meltzer writing it right? Yeah....no. The standard was set that day and there is more than enough talent out there to create a great story in a graphic novel. \n That being said, it wasn't a horrible story, it just didn't grip me for feel anything like Sherlock Holmes to me. It was easy enough to follow but I felt no sense of tension, stakes or compassion for any of the characters. \n As far as the vampires go, it's hard to know what to expect anymore as there are so many different versions these days. This was the more classic version which I personally prefer, but again I didn't find anything that portrayed their dominance, calm confidence or sexuality. There was definitely a presence of their physical prowess but somehow that was lost on me as easily as Sherlock was able to defend himself. I know it, wouldn't do to kill of the main character, but this would have a been a great opportunity to build around the experience and beguiling nature of a vampire that had lived so many years of experience. Another chance to showcase Sherlock's intellect in a battle of wits over strength in something more suitable for this sort of story as apposed to trying to make it feel like an action movie. \n Maybe I expected to much and hoped to have at least a gripping premise or some sort of interesting plot or mystery but I didn't find it here. This may be a must have for serious Sherlock Holmes fans that have to collect everything about him, but if you are looking for a great story inside a graphic novel, I would have to say pass on this one. \n That artwork is good, cover is great, story is lacking so I am giving it 2.5 out of 5 stars.",Thu Dec 05 10:44:25 -0800 2013,Thu Dec 05 10:45:15 -0800 2013,Tue Nov 05 00:00:00 -0800 2013,,0,0
1,bafc2d50014200cda7cb2b6acd60cd73,6315584,72f1229aba5a88f9e72f0dcdc007dd22,4,"I've never really liked Spider-Man. I am, however, a huge fan of the Dresden Files. Jim Butcher is clever and sarcastic and probably the perfect choice to pen a superhero novel. I really enjoyed this book!",Wed Aug 10 06:06:48 -0700 2016,Fri Aug 12 08:49:54 -0700 2016,Fri Aug 12 08:49:54 -0700 2016,Wed Aug 10 00:00:00 -0700 2016,0,0
2,bafc2d50014200cda7cb2b6acd60cd73,29847729,a75309355f8662caaa5e2c92ab693d3f,4,"A very quick introduction, this is coming out in 6 parts and I think I may wait until the next 5 have been released to read this. It was a bit too brief for my liking. \n This clearly takes place at an earlier time rather than continuing from Skin Game. In fact, this clearly takes place prior to Ghost Story. Thomas is in his own apartment but Harry and Murphy still have some shaky ground between them. \n So far, in this comic we have, Harry of course (and we see Mouse in one strip), Molly, Murphy, Thomas and Butters. I won't give anything away but I do look forward to reading the rest as we wait for the next novel.",Thu Apr 21 07:44:00 -0700 2016,Thu Apr 21 07:59:28 -0700 2016,Thu Apr 21 07:59:28 -0700 2016,Thu Apr 21 00:00:00 -0700 2016,0,0
3,bafc2d50014200cda7cb2b6acd60cd73,18454118,c3cc5a3e1d6b6c9cf1c044f306c8e752,5,"I've been waiting so long for this. I first stumbled upon a partial fan translation a few years back and have been going crazy trying to find more. I was over the moon when I saw that it was getting an official English translation and bought it the second it went up on Akadot. \n The first volume has a lot of background and character establishment. Not a whole lot happens in the first volume but we do learn a lot about the personalities of our main characters Yuto and Dick (and really, I wish he had a different name). I can't wait for the next volume! \n The illustrations are incredible. It is absolutely beautiful. \n The manga is set in a prison so violence is inevitable. This is a little bit like a yaoi manga version of HBO's Oz. I'm not sure just how plausible the cons background stories are but hey, if you're looking for hyper realism why are you reading yaoi manga? \n I'm a huge Saki Aida fan and I am hoping beyond hope that they will choose to translate the novel as well. \n If you like sunshine and puppies then you probably shouldn't pick up a book that takes place in a prison but if you enjoy a bit of violence and some beautiful art, give this a try!",Mon Mar 03 17:45:56 -0800 2014,Mon Mar 03 17:54:11 -0800 2014,Sat Mar 01 00:00:00 -0800 2014,Sat Mar 01 00:00:00 -0800 2014,1,0
4,bafc2d50014200cda7cb2b6acd60cd73,2239435,cc444be37ab0a42bfb4dd818cb5edd10,4,"The only thing more entertaining than this book are the negative reviews of this book! \n I've always been a fan of Batman. Why? Because he's not Superman. He's darker. He's badder. He's an anti-hero superhero. \n I recently discovered that Batman has had a history of abusive behaviour towards Robin. On my quest to hunt down all images I could find of Batman smacking Robin around or otherwise being a jerk to him I stumbled on to ""All Star Batman and Robin"". A ha, thought I, if anything has Batman abusing Robin, it'll be this. Frank Miller you say? I believe at this point I was the one cackling. \n It was a very Sin City take on Batman, I'll give you that. I could hear Mickey Rourke in my head as I read through the narration. Batman is a little (maybe a lot) crazy. It actually makes sense. It makes sense that a man, who was just a child when his parents were murdered right in front of his eyes, went a little crazy. \n If you want sunshine and puppies and warm hugs this isn't for you. If you're looking for violence then you've found it. \n There are some ""issues"" that have been brought up. People talk about it being sexist, frankly it seems more like realistic to me but maybe that's the problem. We don't want to acknowledge that the world is not that different from Gotham and even our heroes turn out to be not so nice. \n Homophobic? Again, realistic. Don't believe me? Go listen to a bunch of young boys talk some time and suddenly Robin calling the Batmobile ""queer"" doesn't seem so far fetched. \n Again, one of the reasons I've always loved Batman is because it's a little more realistic than some of the others. No flying man shooting lasers and becoming a weakling whenever someone waves a glowing rock at him. (Sorry I don't really hate Superman, truly I don't.) Batman is a guy who has trained himself to fight, who has come up with inventions to help him win his fights and who doesn't always do the right thing because he's only human. \n Just because I enjoy the book (or anyone reading it enjoys it) doesn't mean we condone the behaviour of the characters within. But I have a surprise for you because they're fictional, they aren't real, Batman doesn't actually exist and I'm able to take it for what it is, entertainment. \n This is what I like when it comes to comics, no, not child abuse, but that it's not for kids. Not at all. This is for adults. It's not nice. It's not G rated. It's not PG. It shouldn't be. \n Before I go I want to mention the art. That anyone would bash on the art blows my mind. Honestly, I don't get it. The art is stellar. It's not even necessarily what I like but that doesn't diminish how fantastic it is. And someone mentioned Batman getting facial hair when he didn't have it when he started out his night. Some men grow facial hair faster than others. I know men who shave more than once a day because of it. It's really not unrealistic but I guess some folks love to nitpick. \n This book isn't for the faint of heart or those who like their superheroes with milk and cookies. But for the rest of you, it's worth checking out. I mean really, who doesn't want to see a superhero smacking around a 12 year old boy he's just kidnapped?!",Wed Apr 03 12:37:48 -0700 2013,Wed Apr 03 13:03:36 -0700 2013,Wed Apr 03 13:03:36 -0700 2013,,0,0


# 2. Clean data
Example of:
- Merging two files
- tqdm pd.progress_apply
- Example of non-destructive transforms, i.e. keep original data and make re-running cell works
- Parsing dates

In [9]:
#books
books_df = pd.read_json(goodreads_path + books, lines=True)
books_df = books_df[['book_id',	'title','authors',	'publisher',	'num_pages',	'publication_year']]
display(books_df.head(5))

Unnamed: 0,book_id,title,authors,publisher,num_pages,publication_year
0,25742454,The Switchblade Mamma,"[{'author_id': '8551671', 'role': ''}]",,,
1,30128855,Cruelle,"[{'author_id': '3274315', 'role': ''}]",Dargaud,,2016.0
2,13571772,"Captain America: Winter Soldier (The Ultimate Graphic Novels Collection: Publication Order, #7)","[{'author_id': '37450', 'role': ''}]",Hachette Partworks Ltd.,146.0,2012.0
3,35452242,Bounty Hunter 4/3: My Life in Combat from Marine Scout Sniper to MARSOC,"[{'author_id': '16209952', 'role': ''}, {'author_id': '853385', 'role': ''}]",,,
4,707611,"Superman Archives, Vol. 2","[{'author_id': '81563', 'role': ''}, {'author_id': '89537', 'role': 'Illustrator'}]",DC Comics,272.0,1997.0


In [10]:
#get author names (authors metadata is an additional dowload from goodreads)
authors = '/metadata/' + 'goodreads_book_authors.json.gz'
authors_df =  pd.read_json(goodreads_path + authors, lines=True) #829.529 authors (also non-graphic and comics)
display(authors_df.head(5))

Unnamed: 0,average_rating,author_id,text_reviews_count,name,ratings_count
0,3.98,604031,7,Ronald J. Fields,49
1,4.08,626222,28716,Anita Diamant,546796
2,3.92,10333,5075,Barbara Hambly,122118
3,3.68,9212,36262,Jennifer Weiner,888522
4,3.82,149918,96,Nigel Pennick,1740


In [12]:
#merge, but inline for each row, since each book has many authors
author_id_to_name = {}
for idx, row in tqdm(authors_df.iterrows(), total=authors_df.shape[0]):
  author_id_to_name[row['author_id']] = row['name']
display(books_df.head(5))
#important: type of author is np.int64
books_df['author_name'] = books_df['authors'].apply(lambda authors_dct_lst: author_id_to_name.get(np.int64(authors_dct_lst[0]['author_id'])))
display(books_df.head(5))

100%|██████████| 829529/829529 [01:16<00:00, 10904.51it/s]


Unnamed: 0,book_id,title,authors,publisher,num_pages,publication_year
0,25742454,The Switchblade Mamma,"[{'author_id': '8551671', 'role': ''}]",,,
1,30128855,Cruelle,"[{'author_id': '3274315', 'role': ''}]",Dargaud,,2016.0
2,13571772,"Captain America: Winter Soldier (The Ultimate Graphic Novels Collection: Publication Order, #7)","[{'author_id': '37450', 'role': ''}]",Hachette Partworks Ltd.,146.0,2012.0
3,35452242,Bounty Hunter 4/3: My Life in Combat from Marine Scout Sniper to MARSOC,"[{'author_id': '16209952', 'role': ''}, {'author_id': '853385', 'role': ''}]",,,
4,707611,"Superman Archives, Vol. 2","[{'author_id': '81563', 'role': ''}, {'author_id': '89537', 'role': 'Illustrator'}]",DC Comics,272.0,1997.0


Unnamed: 0,book_id,title,authors,publisher,num_pages,publication_year,author_name
0,25742454,The Switchblade Mamma,"[{'author_id': '8551671', 'role': ''}]",,,,Lindsey Schussman
1,30128855,Cruelle,"[{'author_id': '3274315', 'role': ''}]",Dargaud,,2016.0,Florence Dupre la Tour
2,13571772,"Captain America: Winter Soldier (The Ultimate Graphic Novels Collection: Publication Order, #7)","[{'author_id': '37450', 'role': ''}]",Hachette Partworks Ltd.,146.0,2012.0,Ed Brubaker
3,35452242,Bounty Hunter 4/3: My Life in Combat from Marine Scout Sniper to MARSOC,"[{'author_id': '16209952', 'role': ''}, {'author_id': '853385', 'role': ''}]",,,,Jason Delgado
4,707611,"Superman Archives, Vol. 2","[{'author_id': '81563', 'role': ''}, {'author_id': '89537', 'role': 'Illustrator'}]",DC Comics,272.0,1997.0,Jerry Siegel


In [13]:
#interactions
#wc -l interactions is 7.347.630 
#sample first 500.000 interactions
interactions_df = parse_json(goodreads_path + interactions, read_max=500000) #Note: RAM issue if loading with pd.read_json, no issue with parse_json 

500000it [00:21, 23651.48it/s]


Break reading after 500000 records
Reading 500001 rows.


In [14]:
#1) parse date 
interactions_df_new = interactions_df[['user_id', 'book_id', 'rating', 'date_updated']]
format_str = '%a %b %d %H:%M:%S %z %Y' #see https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior
#test: datetime_object = datetime.strptime('Fri Jun 21 10:25:05 -0700 2013', format_str) 
interactions_df_new['date_updated'] = interactions_df_new['date_updated'].progress_apply(lambda s: np.datetime64(datetime.strptime(s,format_str)))

#2) sort on user_id, then date
interactions_df_new = interactions_df_new.sort_values(by=['user_id', 'date_updated'], ascending=[True,True])
display(interactions_df_new.head(20))

  """
100%|██████████| 500001/500001 [00:14<00:00, 34483.69it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,user_id,book_id,rating,date_updated
496200,00153d136ac254437511fad5e10e246d,7619292,3,2012-08-30 18:53:39
496199,00153d136ac254437511fad5e10e246d,29396738,0,2017-08-07 10:40:09
350064,0018ce6296baeccf95c3502deeff0600,472331,5,2012-08-05 16:26:00
350062,0018ce6296baeccf95c3502deeff0600,3058907,4,2012-08-05 20:11:48
350061,0018ce6296baeccf95c3502deeff0600,4280231,4,2012-08-05 20:11:52
350059,0018ce6296baeccf95c3502deeff0600,11470711,5,2012-08-05 20:14:57
350058,0018ce6296baeccf95c3502deeff0600,844355,4,2012-08-05 20:19:32
350057,0018ce6296baeccf95c3502deeff0600,1568491,4,2012-08-05 20:19:39
350056,0018ce6296baeccf95c3502deeff0600,23754,4,2012-08-09 10:53:26
350063,0018ce6296baeccf95c3502deeff0600,13480845,5,2012-08-15 11:26:40


#3. Create consecutive ID's
- Working with numpy types != python types
- Mapping ID's to consecutive integgers for matrix operations (and scipy sparse matrices, see https://docs.scipy.org/doc/scipy/reference/sparse.html) 

In [15]:
#change type book_id to numpy.int64
display(interactions_df_new.dtypes)
interactions_df_new['book_id'] = interactions_df_new['book_id'].astype('int64')
display(interactions_df_new.dtypes)

user_id                 object
book_id                 object
rating                   int64
date_updated    datetime64[ns]
dtype: object

user_id                 object
book_id                  int64
rating                   int64
date_updated    datetime64[ns]
dtype: object

In [16]:
#1) convert user uuid to consecutive integer ID's 
dct = {}
def map_to_consecutive_id(uuid):
  if uuid in dct:
    return dct[uuid]
  else:
    id = len(dct)
    dct[uuid] = id
    return id
interactions_df_new['user_id_int'] = interactions_df_new['user_id'].progress_apply(map_to_consecutive_id)

#2) convert book_id to to consecutive integer ID's 
dct.clear()
books_df['book_id_int'] = books_df['book_id'].progress_apply(map_to_consecutive_id)
interactions_df_new['book_id_int'] = interactions_df_new['book_id'].progress_apply(lambda book_id: dct.get(book_id,-1))
display(books_df.head(10))
display(interactions_df_new.head(10))


100%|██████████| 500001/500001 [00:00<00:00, 696562.32it/s]
100%|██████████| 89411/89411 [00:00<00:00, 513964.70it/s]
100%|██████████| 500001/500001 [00:00<00:00, 532106.90it/s]


Unnamed: 0,book_id,title,authors,publisher,num_pages,publication_year,author_name,book_id_int
0,25742454,The Switchblade Mamma,"[{'author_id': '8551671', 'role': ''}]",,,,Lindsey Schussman,0
1,30128855,Cruelle,"[{'author_id': '3274315', 'role': ''}]",Dargaud,,2016.0,Florence Dupre la Tour,1
2,13571772,"Captain America: Winter Soldier (The Ultimate Graphic Novels Collection: Publication Order, #7)","[{'author_id': '37450', 'role': ''}]",Hachette Partworks Ltd.,146.0,2012.0,Ed Brubaker,2
3,35452242,Bounty Hunter 4/3: My Life in Combat from Marine Scout Sniper to MARSOC,"[{'author_id': '16209952', 'role': ''}, {'author_id': '853385', 'role': ''}]",,,,Jason Delgado,3
4,707611,"Superman Archives, Vol. 2","[{'author_id': '81563', 'role': ''}, {'author_id': '89537', 'role': 'Illustrator'}]",DC Comics,272.0,1997.0,Jerry Siegel,4
5,2250580,"A.I. Revolution, Vol. 1","[{'author_id': '1015982', 'role': ''}]",Go! Comi,206.0,2007.0,Yuu Asami,5
6,27036536,"War Stories, Volume 3","[{'author_id': '14965', 'role': ''}, {'author_id': '3188368', 'role': 'Illustrations'}, {'author_id': '131836', 'role': 'Illustrations'}, {'author_id': '7507599', 'role': ''}]",Avatar Press,224.0,2016.0,Garth Ennis,6
7,27036537,"Crossed, Volume 15","[{'author_id': '24594', 'role': ''}]",Avatar Press,160.0,2016.0,Mike Wolfer,7
8,27036538,"Crossed + One Hundred, Volume 2 (Crossed +100 #2)","[{'author_id': '14155472', 'role': ''}, {'author_id': '8224446', 'role': 'Illustrations'}, {'author_id': '1251983', 'role': 'Illustrator'}, {'author_id': '5808419', 'role': 'Colorist'}, {'author_id': '4346284', 'role': 'Letterer'}]",Avatar Press,160.0,2016.0,Simon Spurrier,8
9,27036539,"War Stories, Volume 4","[{'author_id': '14965', 'role': ''}, {'author_id': '3188368', 'role': 'Illustrations'}]",Avatar Press,144.0,2016.0,Garth Ennis,9


Unnamed: 0,user_id,book_id,rating,date_updated,user_id_int,book_id_int
496200,00153d136ac254437511fad5e10e246d,7619292,3,2012-08-30 18:53:39,0,73347
496199,00153d136ac254437511fad5e10e246d,29396738,0,2017-08-07 10:40:09,0,2624
350064,0018ce6296baeccf95c3502deeff0600,472331,5,2012-08-05 16:26:00,1,2749
350062,0018ce6296baeccf95c3502deeff0600,3058907,4,2012-08-05 20:11:48,1,55521
350061,0018ce6296baeccf95c3502deeff0600,4280231,4,2012-08-05 20:11:52,1,59877
350059,0018ce6296baeccf95c3502deeff0600,11470711,5,2012-08-05 20:14:57,1,20568
350058,0018ce6296baeccf95c3502deeff0600,844355,4,2012-08-05 20:19:32,1,22652
350057,0018ce6296baeccf95c3502deeff0600,1568491,4,2012-08-05 20:19:39,1,88449
350056,0018ce6296baeccf95c3502deeff0600,23754,4,2012-08-09 10:53:26,1,85188
350063,0018ce6296baeccf95c3502deeff0600,13480845,5,2012-08-15 11:26:40,1,24277


# 4. Pre-process interactions
- Drop reconsumption items
- Remove items with fewer than x interactions
- Remove users with fewer than x interactions

In [17]:
def preprocess_classic(df, minsup=5):
    """
    Goal: - Remove reconsumption items
          - Remove items that have less than minsup interactions 
          - Remove users that have less than minsup interactions 
               
    :input df: Dataframe containing user_id, item_id and time
    """
    before = df.shape[0]
    #drop reconsumption items
    df = df.drop_duplicates(subset=["user_id","item_id"])
    print("After drop_duplicates (reconsumption items): {} -> {}".format(before,df.shape[0]))
    #compute item/user counts
    g1 = df.groupby('item_id', as_index=False)['user_id'].size()
    g1 = g1.rename({'size': 'count_item'}, axis='columns')
    g2 = df.groupby('user_id', as_index=False)['item_id'].size()
    g2 = g2.rename({'size': 'count_user'}, axis='columns')
    df = pd.merge(df, g1, how='left', on=['item_id'])
    df = pd.merge(df, g2, how='left', on=['user_id'])
    display(df.head(5))
    #drop items occurring less than minsup times
    before = df.shape[0]
    df = df[df['count_item'] >= minsup]
    print("After dropping items with less than {} interactions: {} -> {}".format(minsup, before,df.shape[0]))
    before = df.shape[0]
    #drop users with less then minsup items in history
    df = df[df['count_user'] >= minsup]
    df = df[['user_id','item_id','datetime']]
    print("After dropping users with less than {} interactions: {} -> {}".format(minsup, before,df.shape[0]))
    return df

#Remark: ignoring rating, considering all reviews as implicit possitive feedback
#print number of users and items
interactions_df_processed = interactions_df_new[['user_id_int','book_id_int','date_updated']]
interactions_df_processed = interactions_df_processed.rename(columns={"user_id_int": "user_id", "book_id_int": "item_id", "date_updated": "datetime"})
print(f"number of unique users: {interactions_df_processed['user_id'].nunique()}")
print(f"number of unique items: {interactions_df_processed['item_id'].nunique()}")
interactions_df_processed = preprocess_classic(interactions_df_processed)
display(interactions_df_processed.head(5))
print(f"number of unique users: {interactions_df_processed['user_id'].nunique()}")
print(f"number of unique items: {interactions_df_processed['item_id'].nunique()}")

number of unique users: 18246
number of unique items: 53635
After drop_duplicates (reconsumption items): 500001 -> 500001


Unnamed: 0,user_id,item_id,datetime,count_item,count_user
0,0,73347,2012-08-30 18:53:39,305,2
1,0,2624,2017-08-07 10:40:09,501,2
2,1,2749,2012-08-05 16:26:00,2900,16
3,1,55521,2012-08-05 20:11:48,44,16
4,1,59877,2012-08-05 20:11:52,45,16


After dropping items with less than 5 interactions: 500001 -> 433879
After dropping users with less than 5 interactions: 433879 -> 417597


Unnamed: 0,user_id,item_id,datetime
2,1,2749,2012-08-05 16:26:00
3,1,55521,2012-08-05 20:11:48
4,1,59877,2012-08-05 20:11:52
5,1,20568,2012-08-05 20:14:57
6,1,22652,2012-08-05 20:19:32


number of unique users: 8924
number of unique items: 17714


# 5. Create train/test split
Different options:
- **Time-based split**, i.e. split interactions before/after certain date. Keep all users with both training and test interactions for evaluations. Repeat for different train/test window, i.e. repeat for a number of key dates.
- **Session-based split**, i.e. split and keep first $x$ interactions for training and $|I_u| - x$  for testing, where $I_u = {i_1,\ldots,i_k}$ represents the user's history sorted on time
- **Random**, i.e. take $x$ random items for training and $|I_u| -x$ for testing
- **Leave-one-out**, i.e. take last (or random) item for testing

Additionaly, use techniques such as cross-validation and create multiple train/test splits and report both average and standard deviation. 

In [18]:
#Session-based split:
sessions_df = interactions_df_processed.groupby(by='user_id', as_index=False)[['item_id','datetime']].agg(list)
display(sessions_df.head(10))

def split(items, percentage_train):
  no_train_items = math.floor(len(items) * percentage_train)
  return items[0:no_train_items], items[no_train_items:]

percentage_train = 0.8
sessions_df['history'] = sessions_df['item_id'].apply(lambda items: split(items, percentage_train)[0])
sessions_df['future'] = sessions_df['item_id'].apply(lambda items: split(items, percentage_train)[1])
pd.set_option("display.max_colwidth", None)
display(sessions_df.head(10))

Unnamed: 0,user_id,item_id,datetime
0,1,"[2749, 55521, 59877, 20568, 22652, 88449, 85188, 24277, 43962, 22625, 38290, 39322, 1138, 34891, 4357, 82031]","[2012-08-05 16:26:00, 2012-08-05 20:11:48, 2012-08-05 20:11:52, 2012-08-05 20:14:57, 2012-08-05 20:19:32, 2012-08-05 20:19:39, 2012-08-09 10:53:26, 2012-08-15 11:26:40, 2012-10-10 10:59:19, 2015-02-07 12:25:49, 2015-02-15 13:37:12, 2015-12-31 23:38:04, 2016-02-11 17:24:43, 2016-05-26 10:59:59, 2016-05-26 21:27:53, 2016-11-04 13:20:03]"
1,2,"[1027, 41468, 16631, 58923, 53726, 46006, 64726, 73752, 14801, 29249, 37807, 19336, 3538, 23144, 49402, 35229, 42349, 63723, 25723, 25102, 12486, 30798, 27287, 7451, 31264, 47066, 72410, 66174, 49698, 42948, 51386, 45277, 25166, 7312, 35230, 33786, 7453, 4638, 74125, 31011, 27020, 2227, 6966, 78927, 43133, 84695, 48499, 9003, 47405, 21629, 26229, 40175, 46051, 67864, 72738, 26055, 3495, 65107, 86659, 59594, 78876, 19669, 12857, 11919, 43665, 12859, 12860, 27594, 13848, 86762, 86763, 12858, 47692, 56805, 73683, 14521, 75808, 61540, 80678, 33784, 65604, 33787, 4637, 33788, 1731, 71432, 62766, 60146, 24566, 15737, 2844, 37218, 33245, 55364, 14941, 50264, 65442, 88536, 5382, 76189, ...]","[2015-04-05 18:03:44, 2015-04-05 18:04:01, 2015-04-05 18:04:35, 2015-04-05 18:04:39, 2015-04-05 18:05:09, 2015-04-05 18:05:14, 2015-04-13 05:32:51, 2015-04-13 05:32:53, 2015-04-29 05:33:11, 2015-04-29 05:50:37, 2015-11-07 08:24:58, 2015-12-06 05:24:14, 2015-12-06 21:28:40, 2016-02-08 21:27:53, 2016-02-08 21:27:57, 2016-02-08 21:27:59, 2016-02-12 21:02:50, 2016-04-02 00:41:27, 2016-04-16 23:06:50, 2016-04-16 23:10:19, 2016-06-14 21:16:39, 2016-07-03 11:38:34, 2016-07-03 11:38:42, 2016-07-09 05:57:24, 2016-07-09 05:58:39, 2016-07-09 05:58:59, 2016-07-09 05:59:27, 2016-07-09 05:59:30, 2016-07-09 05:59:34, 2016-07-09 05:59:43, 2016-07-09 06:00:02, 2016-07-09 06:02:07, 2016-07-09 06:02:31, 2016-07-09 06:03:36, 2016-07-09 06:03:39, 2016-10-08 09:09:39, 2016-10-08 09:13:12, 2016-10-08 09:36:06, 2016-10-08 12:28:34, 2016-10-08 15:00:34, 2016-10-08 15:01:37, 2016-10-08 15:19:57, 2016-10-08 18:08:33, 2016-10-08 19:03:59, 2016-10-09 11:00:58, 2016-10-09 11:01:22, 2016-10-09 11:01:31, 2016-10-09 11:01:47, 2016-10-09 11:02:06, 2016-10-09 11:02:55, 2016-10-09 11:03:23, 2016-10-09 11:04:06, 2016-10-09 11:04:33, 2016-10-09 11:04:54, 2016-10-09 14:13:23, 2016-10-09 14:13:46, 2016-10-09 14:40:39, 2016-10-09 16:21:28, 2016-10-09 17:01:11, 2016-10-09 20:25:23, 2016-10-09 20:25:37, 2016-10-09 20:26:00, 2016-10-09 21:35:36, 2016-10-11 06:19:02, 2016-10-11 06:19:41, 2016-10-11 06:20:05, 2016-10-11 06:20:53, 2016-10-11 06:21:37, 2016-10-11 13:30:21, 2016-10-11 13:45:33, 2016-10-11 13:45:44, 2016-10-11 13:46:15, 2016-10-11 13:46:38, 2016-10-11 17:49:31, 2016-10-11 21:01:06, 2016-10-11 21:01:43, 2016-10-11 21:02:29, 2016-10-11 21:02:55, 2016-10-11 21:03:37, 2016-10-12 23:11:50, 2016-10-12 23:13:14, 2016-10-12 23:13:34, 2016-10-12 23:13:52, 2016-10-12 23:14:09, 2016-10-12 23:14:33, 2016-10-14 06:47:51, 2016-10-14 06:48:16, 2016-10-14 22:54:56, 2016-10-14 22:55:38, 2016-10-14 22:56:04, 2016-10-14 22:56:54, 2016-10-14 22:57:09, 2016-10-14 22:57:26, 2016-10-14 23:00:50, 2016-10-14 23:01:12, 2016-10-14 23:01:32, 2016-10-15 09:02:28, 2016-10-15 09:03:11, 2016-10-16 03:27:56, 2016-10-16 03:37:25, ...]"
2,3,"[78364, 2961, 17726, 47072, 2177, 72410, 25660, 24192, 46006, 1310, 39013, 469, 82477, 41468, 84182, 7921, 16572, 16573, 41635, 56776, 61307, 80848, 70755, 26129, 2624, 51087, 27227, 25189, 57748, 2946, 12843, 42538, 79275, 3495]","[2013-02-11 03:48:55, 2013-12-25 19:43:10, 2013-12-30 04:33:21, 2013-12-30 04:33:27, 2013-12-30 04:33:33, 2013-12-30 04:33:37, 2013-12-30 04:33:41, 2014-03-04 16:08:19, 2014-05-20 00:43:21, 2014-05-20 00:43:55, 2014-05-20 21:09:02, 2014-06-11 15:46:48, 2014-07-23 18:52:51, 2014-08-27 06:00:40, 2015-10-06 00:50:46, 2015-10-06 00:53:18, 2015-10-06 00:53:20, 2015-10-06 00:53:22, 2015-10-06 00:53:23, 2015-10-06 00:53:25, 2015-10-07 18:51:57, 2016-01-13 02:26:46, 2016-11-01 02:52:41, 2017-01-16 03:14:34, 2017-01-16 18:05:47, 2017-01-16 21:22:31, 2017-01-20 03:00:43, 2017-01-20 03:00:59, 2017-01-20 03:01:07, 2017-01-20 03:01:10, 2017-01-20 03:01:21, 2017-01-23 01:01:25, 2017-07-03 18:40:38, 2017-07-03 19:32:33]"
3,4,"[76189, 76190, 11457, 36906, 74822]","[2013-04-01 23:54:17, 2013-04-01 23:55:34, 2015-04-23 23:59:06, 2016-03-28 20:15:38, 2016-03-28 20:15:50]"
4,7,"[64368, 30375, 47642, 48180, 86871]","[2012-07-09 17:41:56, 2012-07-09 17:41:58, 2012-07-09 17:42:16, 2012-07-09 17:42:18, 2016-03-22 17:10:25]"
5,8,"[53051, 45693, 8483, 22848, 84627, 35607, 46555, 58655, 73033, 75878, 20728, 58654, 16401, 88029, 21663, 37477, 36836, 73710, 84762, 55409, 5703, 46449, 12984, 71644, 59144, 80010, 47916, 70258, 37480, 43524, 25121, 10001, 35360, 31294, 65584, 14587, 31861, 27969, 371, 57498]","[2015-08-13 23:45:26, 2017-05-02 23:33:45, 2017-05-03 21:49:47, 2017-05-04 01:24:34, 2017-05-13 18:18:21, 2017-05-15 12:26:32, 2017-05-17 20:48:37, 2017-05-24 21:07:48, 2017-05-30 20:13:33, 2017-05-31 14:07:52, 2017-06-06 19:54:22, 2017-06-11 00:17:56, 2017-06-19 03:02:22, 2017-07-06 16:36:14, 2017-07-10 17:55:24, 2017-07-26 20:09:41, 2017-07-27 23:25:55, 2017-07-27 23:27:01, 2017-07-27 23:27:52, 2017-08-02 14:50:56, 2017-08-05 03:58:12, 2017-08-05 21:00:29, 2017-08-06 13:44:23, 2017-08-09 17:13:27, 2017-08-12 07:18:05, 2017-08-19 14:28:46, 2017-08-28 15:37:12, 2017-08-29 00:53:26, 2017-08-30 18:15:43, 2017-08-30 18:16:34, 2017-09-06 02:47:05, 2017-09-06 19:04:36, 2017-09-19 20:53:40, 2017-09-25 19:21:08, 2017-09-26 21:59:11, 2017-09-28 23:33:15, 2017-10-08 21:49:35, 2017-10-08 21:50:10, 2017-10-08 21:51:19, 2017-10-14 20:07:30]"
6,9,"[7453, 7452, 2749, 78363, 469, 75042, 47066, 2227, 66277, 21382, 30149]","[2012-08-05 22:15:47, 2012-08-05 22:15:48, 2012-08-05 22:15:55, 2012-08-05 22:43:00, 2015-08-04 19:32:07, 2015-08-17 13:27:27, 2015-08-24 15:47:51, 2016-02-25 15:15:01, 2016-04-21 16:37:57, 2016-07-13 02:22:08, 2016-12-27 15:46:21]"
7,13,"[27184, 21382, 23144, 56143, 35230, 89011, 46079, 11640, 35229]","[2015-08-05 13:08:24, 2016-02-22 04:33:55, 2016-03-28 02:53:07, 2016-05-03 23:56:11, 2016-05-13 21:12:42, 2016-05-17 02:22:45, 2016-12-19 21:11:32, 2016-12-19 21:38:21, 2017-01-25 02:31:36]"
8,14,"[10740, 47650, 1502, 8845, 64592, 1503, 80998, 20192, 52304, 80999, 69407, 69408, 69636, 34345, 52305, 35383, 87949, 59717, 55169, 66054, 19354, 62179, 28150, 88873, 46010, 87919, 5043, 2020, 56510]","[2012-09-14 23:18:16, 2012-09-14 23:18:25, 2012-11-03 20:56:09, 2012-11-03 20:56:18, 2012-11-03 20:56:36, 2012-11-03 20:56:44, 2012-11-03 20:56:57, 2012-11-03 20:57:36, 2012-11-03 21:03:28, 2012-11-03 21:03:38, 2012-11-03 21:03:42, 2012-11-03 21:03:48, 2012-11-03 21:04:06, 2012-11-03 21:04:16, 2012-11-03 21:04:22, 2012-11-03 21:04:34, 2012-11-03 21:04:43, 2012-11-03 21:04:51, 2012-11-03 21:04:59, 2012-11-03 21:05:06, 2012-11-03 21:05:30, 2012-11-03 21:05:45, 2012-11-03 21:05:51, 2012-11-03 21:06:03, 2012-11-03 21:06:31, 2012-11-03 21:06:39, 2012-11-03 21:06:46, 2012-11-03 21:06:52, 2012-11-03 21:07:00]"
9,15,"[13910, 39098, 60104, 56608, 76156, 88900, 10364, 54441, 8724, 76191, 76190, 76189, 49712]","[2008-07-08 17:36:41, 2008-07-08 17:37:04, 2008-07-08 17:37:50, 2008-07-08 17:37:58, 2008-07-25 19:54:39, 2010-04-26 15:25:54, 2012-08-11 03:02:08, 2012-08-11 03:02:08, 2012-08-11 03:02:08, 2012-08-11 03:02:08, 2012-08-11 03:02:08, 2012-08-11 03:02:08, 2013-12-07 19:40:58]"


Unnamed: 0,user_id,item_id,datetime,history,future
0,1,"[2749, 55521, 59877, 20568, 22652, 88449, 85188, 24277, 43962, 22625, 38290, 39322, 1138, 34891, 4357, 82031]","[2012-08-05 16:26:00, 2012-08-05 20:11:48, 2012-08-05 20:11:52, 2012-08-05 20:14:57, 2012-08-05 20:19:32, 2012-08-05 20:19:39, 2012-08-09 10:53:26, 2012-08-15 11:26:40, 2012-10-10 10:59:19, 2015-02-07 12:25:49, 2015-02-15 13:37:12, 2015-12-31 23:38:04, 2016-02-11 17:24:43, 2016-05-26 10:59:59, 2016-05-26 21:27:53, 2016-11-04 13:20:03]","[2749, 55521, 59877, 20568, 22652, 88449, 85188, 24277, 43962, 22625, 38290, 39322]","[1138, 34891, 4357, 82031]"
1,2,"[1027, 41468, 16631, 58923, 53726, 46006, 64726, 73752, 14801, 29249, 37807, 19336, 3538, 23144, 49402, 35229, 42349, 63723, 25723, 25102, 12486, 30798, 27287, 7451, 31264, 47066, 72410, 66174, 49698, 42948, 51386, 45277, 25166, 7312, 35230, 33786, 7453, 4638, 74125, 31011, 27020, 2227, 6966, 78927, 43133, 84695, 48499, 9003, 47405, 21629, 26229, 40175, 46051, 67864, 72738, 26055, 3495, 65107, 86659, 59594, 78876, 19669, 12857, 11919, 43665, 12859, 12860, 27594, 13848, 86762, 86763, 12858, 47692, 56805, 73683, 14521, 75808, 61540, 80678, 33784, 65604, 33787, 4637, 33788, 1731, 71432, 62766, 60146, 24566, 15737, 2844, 37218, 33245, 55364, 14941, 50264, 65442, 88536, 5382, 76189, ...]","[2015-04-05 18:03:44, 2015-04-05 18:04:01, 2015-04-05 18:04:35, 2015-04-05 18:04:39, 2015-04-05 18:05:09, 2015-04-05 18:05:14, 2015-04-13 05:32:51, 2015-04-13 05:32:53, 2015-04-29 05:33:11, 2015-04-29 05:50:37, 2015-11-07 08:24:58, 2015-12-06 05:24:14, 2015-12-06 21:28:40, 2016-02-08 21:27:53, 2016-02-08 21:27:57, 2016-02-08 21:27:59, 2016-02-12 21:02:50, 2016-04-02 00:41:27, 2016-04-16 23:06:50, 2016-04-16 23:10:19, 2016-06-14 21:16:39, 2016-07-03 11:38:34, 2016-07-03 11:38:42, 2016-07-09 05:57:24, 2016-07-09 05:58:39, 2016-07-09 05:58:59, 2016-07-09 05:59:27, 2016-07-09 05:59:30, 2016-07-09 05:59:34, 2016-07-09 05:59:43, 2016-07-09 06:00:02, 2016-07-09 06:02:07, 2016-07-09 06:02:31, 2016-07-09 06:03:36, 2016-07-09 06:03:39, 2016-10-08 09:09:39, 2016-10-08 09:13:12, 2016-10-08 09:36:06, 2016-10-08 12:28:34, 2016-10-08 15:00:34, 2016-10-08 15:01:37, 2016-10-08 15:19:57, 2016-10-08 18:08:33, 2016-10-08 19:03:59, 2016-10-09 11:00:58, 2016-10-09 11:01:22, 2016-10-09 11:01:31, 2016-10-09 11:01:47, 2016-10-09 11:02:06, 2016-10-09 11:02:55, 2016-10-09 11:03:23, 2016-10-09 11:04:06, 2016-10-09 11:04:33, 2016-10-09 11:04:54, 2016-10-09 14:13:23, 2016-10-09 14:13:46, 2016-10-09 14:40:39, 2016-10-09 16:21:28, 2016-10-09 17:01:11, 2016-10-09 20:25:23, 2016-10-09 20:25:37, 2016-10-09 20:26:00, 2016-10-09 21:35:36, 2016-10-11 06:19:02, 2016-10-11 06:19:41, 2016-10-11 06:20:05, 2016-10-11 06:20:53, 2016-10-11 06:21:37, 2016-10-11 13:30:21, 2016-10-11 13:45:33, 2016-10-11 13:45:44, 2016-10-11 13:46:15, 2016-10-11 13:46:38, 2016-10-11 17:49:31, 2016-10-11 21:01:06, 2016-10-11 21:01:43, 2016-10-11 21:02:29, 2016-10-11 21:02:55, 2016-10-11 21:03:37, 2016-10-12 23:11:50, 2016-10-12 23:13:14, 2016-10-12 23:13:34, 2016-10-12 23:13:52, 2016-10-12 23:14:09, 2016-10-12 23:14:33, 2016-10-14 06:47:51, 2016-10-14 06:48:16, 2016-10-14 22:54:56, 2016-10-14 22:55:38, 2016-10-14 22:56:04, 2016-10-14 22:56:54, 2016-10-14 22:57:09, 2016-10-14 22:57:26, 2016-10-14 23:00:50, 2016-10-14 23:01:12, 2016-10-14 23:01:32, 2016-10-15 09:02:28, 2016-10-15 09:03:11, 2016-10-16 03:27:56, 2016-10-16 03:37:25, ...]","[1027, 41468, 16631, 58923, 53726, 46006, 64726, 73752, 14801, 29249, 37807, 19336, 3538, 23144, 49402, 35229, 42349, 63723, 25723, 25102, 12486, 30798, 27287, 7451, 31264, 47066, 72410, 66174, 49698, 42948, 51386, 45277, 25166, 7312, 35230, 33786, 7453, 4638, 74125, 31011, 27020, 2227, 6966, 78927, 43133, 84695, 48499, 9003, 47405, 21629, 26229, 40175, 46051, 67864, 72738, 26055, 3495, 65107, 86659, 59594, 78876, 19669, 12857, 11919, 43665, 12859, 12860, 27594, 13848, 86762, 86763, 12858, 47692, 56805, 73683, 14521, 75808, 61540, 80678, 33784, 65604, 33787, 4637, 33788, 1731, 71432, 62766, 60146, 24566, 15737, 2844, 37218, 33245, 55364, 14941, 50264, 65442, 88536, 5382, 76189, ...]","[1286, 38245, 47059, 77648, 64113, 9313, 37629, 75288, 13801, 7600, 88591, 68731, 24127, 46371, 13870, 1704, 48629, 18740, 75446, 17850, 27227, 12843, 38250, 19835, 64142, 80844, 85224, 28876, 19436, 16190, 53821, 16189, 53820, 50712, 70321, 66616, 73661, 21978, 76188, 5817, 50229, 33366, 81381, 86564, 58039, 42933, 55183, 27079, 1331, 42215, 78975, 461, 65242, 42470, 72184, 8966, 8918, 14448, 81368, 49316, 32565, 43343, 83816, 51677, 71967, 10081, 39584, 12593, 70230, 25238, 62434, 47633, 50243, 50241, 68949, 788, 74500, 80213, 64622, 24387, 3268, 61004, 80050, 45931, 80156, 25863, 70087, 52164, 48580, 48587, 57767, 57771, 57769, 61575, 36372, 57770, 36373, 57768, 64581, 8759, ...]"
2,3,"[78364, 2961, 17726, 47072, 2177, 72410, 25660, 24192, 46006, 1310, 39013, 469, 82477, 41468, 84182, 7921, 16572, 16573, 41635, 56776, 61307, 80848, 70755, 26129, 2624, 51087, 27227, 25189, 57748, 2946, 12843, 42538, 79275, 3495]","[2013-02-11 03:48:55, 2013-12-25 19:43:10, 2013-12-30 04:33:21, 2013-12-30 04:33:27, 2013-12-30 04:33:33, 2013-12-30 04:33:37, 2013-12-30 04:33:41, 2014-03-04 16:08:19, 2014-05-20 00:43:21, 2014-05-20 00:43:55, 2014-05-20 21:09:02, 2014-06-11 15:46:48, 2014-07-23 18:52:51, 2014-08-27 06:00:40, 2015-10-06 00:50:46, 2015-10-06 00:53:18, 2015-10-06 00:53:20, 2015-10-06 00:53:22, 2015-10-06 00:53:23, 2015-10-06 00:53:25, 2015-10-07 18:51:57, 2016-01-13 02:26:46, 2016-11-01 02:52:41, 2017-01-16 03:14:34, 2017-01-16 18:05:47, 2017-01-16 21:22:31, 2017-01-20 03:00:43, 2017-01-20 03:00:59, 2017-01-20 03:01:07, 2017-01-20 03:01:10, 2017-01-20 03:01:21, 2017-01-23 01:01:25, 2017-07-03 18:40:38, 2017-07-03 19:32:33]","[78364, 2961, 17726, 47072, 2177, 72410, 25660, 24192, 46006, 1310, 39013, 469, 82477, 41468, 84182, 7921, 16572, 16573, 41635, 56776, 61307, 80848, 70755, 26129, 2624, 51087, 27227]","[25189, 57748, 2946, 12843, 42538, 79275, 3495]"
3,4,"[76189, 76190, 11457, 36906, 74822]","[2013-04-01 23:54:17, 2013-04-01 23:55:34, 2015-04-23 23:59:06, 2016-03-28 20:15:38, 2016-03-28 20:15:50]","[76189, 76190, 11457, 36906]",[74822]
4,7,"[64368, 30375, 47642, 48180, 86871]","[2012-07-09 17:41:56, 2012-07-09 17:41:58, 2012-07-09 17:42:16, 2012-07-09 17:42:18, 2016-03-22 17:10:25]","[64368, 30375, 47642, 48180]",[86871]
5,8,"[53051, 45693, 8483, 22848, 84627, 35607, 46555, 58655, 73033, 75878, 20728, 58654, 16401, 88029, 21663, 37477, 36836, 73710, 84762, 55409, 5703, 46449, 12984, 71644, 59144, 80010, 47916, 70258, 37480, 43524, 25121, 10001, 35360, 31294, 65584, 14587, 31861, 27969, 371, 57498]","[2015-08-13 23:45:26, 2017-05-02 23:33:45, 2017-05-03 21:49:47, 2017-05-04 01:24:34, 2017-05-13 18:18:21, 2017-05-15 12:26:32, 2017-05-17 20:48:37, 2017-05-24 21:07:48, 2017-05-30 20:13:33, 2017-05-31 14:07:52, 2017-06-06 19:54:22, 2017-06-11 00:17:56, 2017-06-19 03:02:22, 2017-07-06 16:36:14, 2017-07-10 17:55:24, 2017-07-26 20:09:41, 2017-07-27 23:25:55, 2017-07-27 23:27:01, 2017-07-27 23:27:52, 2017-08-02 14:50:56, 2017-08-05 03:58:12, 2017-08-05 21:00:29, 2017-08-06 13:44:23, 2017-08-09 17:13:27, 2017-08-12 07:18:05, 2017-08-19 14:28:46, 2017-08-28 15:37:12, 2017-08-29 00:53:26, 2017-08-30 18:15:43, 2017-08-30 18:16:34, 2017-09-06 02:47:05, 2017-09-06 19:04:36, 2017-09-19 20:53:40, 2017-09-25 19:21:08, 2017-09-26 21:59:11, 2017-09-28 23:33:15, 2017-10-08 21:49:35, 2017-10-08 21:50:10, 2017-10-08 21:51:19, 2017-10-14 20:07:30]","[53051, 45693, 8483, 22848, 84627, 35607, 46555, 58655, 73033, 75878, 20728, 58654, 16401, 88029, 21663, 37477, 36836, 73710, 84762, 55409, 5703, 46449, 12984, 71644, 59144, 80010, 47916, 70258, 37480, 43524, 25121, 10001]","[35360, 31294, 65584, 14587, 31861, 27969, 371, 57498]"
6,9,"[7453, 7452, 2749, 78363, 469, 75042, 47066, 2227, 66277, 21382, 30149]","[2012-08-05 22:15:47, 2012-08-05 22:15:48, 2012-08-05 22:15:55, 2012-08-05 22:43:00, 2015-08-04 19:32:07, 2015-08-17 13:27:27, 2015-08-24 15:47:51, 2016-02-25 15:15:01, 2016-04-21 16:37:57, 2016-07-13 02:22:08, 2016-12-27 15:46:21]","[7453, 7452, 2749, 78363, 469, 75042, 47066, 2227]","[66277, 21382, 30149]"
7,13,"[27184, 21382, 23144, 56143, 35230, 89011, 46079, 11640, 35229]","[2015-08-05 13:08:24, 2016-02-22 04:33:55, 2016-03-28 02:53:07, 2016-05-03 23:56:11, 2016-05-13 21:12:42, 2016-05-17 02:22:45, 2016-12-19 21:11:32, 2016-12-19 21:38:21, 2017-01-25 02:31:36]","[27184, 21382, 23144, 56143, 35230, 89011, 46079]","[11640, 35229]"
8,14,"[10740, 47650, 1502, 8845, 64592, 1503, 80998, 20192, 52304, 80999, 69407, 69408, 69636, 34345, 52305, 35383, 87949, 59717, 55169, 66054, 19354, 62179, 28150, 88873, 46010, 87919, 5043, 2020, 56510]","[2012-09-14 23:18:16, 2012-09-14 23:18:25, 2012-11-03 20:56:09, 2012-11-03 20:56:18, 2012-11-03 20:56:36, 2012-11-03 20:56:44, 2012-11-03 20:56:57, 2012-11-03 20:57:36, 2012-11-03 21:03:28, 2012-11-03 21:03:38, 2012-11-03 21:03:42, 2012-11-03 21:03:48, 2012-11-03 21:04:06, 2012-11-03 21:04:16, 2012-11-03 21:04:22, 2012-11-03 21:04:34, 2012-11-03 21:04:43, 2012-11-03 21:04:51, 2012-11-03 21:04:59, 2012-11-03 21:05:06, 2012-11-03 21:05:30, 2012-11-03 21:05:45, 2012-11-03 21:05:51, 2012-11-03 21:06:03, 2012-11-03 21:06:31, 2012-11-03 21:06:39, 2012-11-03 21:06:46, 2012-11-03 21:06:52, 2012-11-03 21:07:00]","[10740, 47650, 1502, 8845, 64592, 1503, 80998, 20192, 52304, 80999, 69407, 69408, 69636, 34345, 52305, 35383, 87949, 59717, 55169, 66054, 19354, 62179, 28150]","[88873, 46010, 87919, 5043, 2020, 56510]"
9,15,"[13910, 39098, 60104, 56608, 76156, 88900, 10364, 54441, 8724, 76191, 76190, 76189, 49712]","[2008-07-08 17:36:41, 2008-07-08 17:37:04, 2008-07-08 17:37:50, 2008-07-08 17:37:58, 2008-07-25 19:54:39, 2010-04-26 15:25:54, 2012-08-11 03:02:08, 2012-08-11 03:02:08, 2012-08-11 03:02:08, 2012-08-11 03:02:08, 2012-08-11 03:02:08, 2012-08-11 03:02:08, 2013-12-07 19:40:58]","[13910, 39098, 60104, 56608, 76156, 88900, 10364, 54441, 8724, 76191]","[76190, 76189, 49712]"


# 6. Evaluate quantitatively
Options are:
- **Hitrate@k**, i.e. percentage of users where top-$k$ recommendations is relevant 
- **Recall@k**, i.e. percentage of top-$k$ recommendation that are relevant
- **NDCG@k**, i.e. like recall but rank of top-$k$ recommendation is weighted

Compare using relative gain, i.e. recall@10 from 10% tot 20% is a 100% gain (20-10/10 * 100) 

In [37]:
import scipy.sparse

#Create scipy csr matrix
def create_sparse_matrix(sessions_df, column='history', shape=None):
  #flatten
  user_ids = []
  item_ids = []
  for idx, row in sessions_df.iterrows():
    items = row[column]
    user = row['user_id']
    user_ids.extend([user] * len(items))
    item_ids.extend(items)
  #create csr matrix
  values = np.ones(len(user_ids))
  matrix = scipy.sparse.csr_matrix((values, (user_ids, item_ids)), shape=shape, dtype=np.int32)
  return matrix


shape = (interactions_df_processed['user_id'].max() +1,  interactions_df_processed['item_id'].max() +1)
train_x = create_sparse_matrix(sessions_df, column='history', shape=shape)
y_true = create_sparse_matrix(sessions_df, column='future', shape=shape)
print(train_x)

  (1, 2749)	1
  (1, 20568)	1
  (1, 22625)	1
  (1, 22652)	1
  (1, 24277)	1
  (1, 38290)	1
  (1, 39322)	1
  (1, 43962)	1
  (1, 55521)	1
  (1, 59877)	1
  (1, 85188)	1
  (1, 88449)	1
  (2, 267)	1
  (2, 328)	1
  (2, 469)	1
  (2, 511)	1
  (2, 960)	1
  (2, 1027)	1
  (2, 1420)	1
  (2, 1731)	1
  (2, 1789)	1
  (2, 1792)	1
  (2, 2176)	1
  (2, 2177)	1
  (2, 2227)	1
  :	:
  (18245, 71524)	1
  (18245, 72832)	1
  (18245, 73166)	1
  (18245, 73298)	1
  (18245, 73683)	1
  (18245, 74024)	1
  (18245, 74125)	1
  (18245, 74609)	1
  (18245, 75565)	1
  (18245, 75837)	1
  (18245, 76189)	1
  (18245, 77069)	1
  (18245, 77451)	1
  (18245, 78741)	1
  (18245, 79059)	1
  (18245, 79533)	1
  (18245, 82420)	1
  (18245, 84656)	1
  (18245, 84878)	1
  (18245, 85011)	1
  (18245, 85188)	1
  (18245, 86364)	1
  (18245, 86762)	1
  (18245, 86763)	1
  (18245, 87367)	1


In [44]:
#popularity recommender
class Popularity():
    def __init__(self, K=10):
        self.K = K

    def fit(self, X):
        items = list(X.nonzero()[1])
        sorted_scores = Counter(items).most_common()
        self.sorted_scores_ = [
            (item, score / sorted_scores[0][1]) for item, score in sorted_scores
        ]

    def predict(self, X):
        items, values = zip(*self.sorted_scores_[: self.K])

        users = set(X.nonzero()[0])

        U, I, V = [], [], []

        for user in users:
            U.extend([user] * self.K)
            I.extend(items)
            V.extend(values)

        score_matrix = scipy.sparse.csr_matrix((V, (U, I)), shape=X.shape)
        return score_matrix

K = 20
pop = Popularity(K=K)
pop.fit(train_x)
y_pred = pop.predict(train_x)
print(pred_y)

  (1, 2749)	1.0
  (1, 3495)	0.5123400365630713
  (1, 7453)	0.5287934186471663
  (1, 8570)	0.5269652650822669
  (1, 12857)	0.40036563071297987
  (1, 31264)	0.6773308957952467
  (1, 33784)	0.4218464351005484
  (1, 41468)	0.4076782449725777
  (1, 75565)	0.4428702010968921
  (1, 85188)	0.6672760511882998
  (2, 2749)	1.0
  (2, 3495)	0.5123400365630713
  (2, 7453)	0.5287934186471663
  (2, 8570)	0.5269652650822669
  (2, 12857)	0.40036563071297987
  (2, 31264)	0.6773308957952467
  (2, 33784)	0.4218464351005484
  (2, 41468)	0.4076782449725777
  (2, 75565)	0.4428702010968921
  (2, 85188)	0.6672760511882998
  (3, 2749)	1.0
  (3, 3495)	0.5123400365630713
  (3, 7453)	0.5287934186471663
  (3, 8570)	0.5269652650822669
  (3, 12857)	0.40036563071297987
  :	:
  (18243, 31264)	0.6773308957952467
  (18243, 33784)	0.4218464351005484
  (18243, 41468)	0.4076782449725777
  (18243, 75565)	0.4428702010968921
  (18243, 85188)	0.6672760511882998
  (18244, 2749)	1.0
  (18244, 3495)	0.5123400365630713
  (18244, 745

In [45]:
#Evaluate recall@k
#Do elementwise multiplication of top K predicts and true interactions
def sparse_divide_nonzero(a: scipy.sparse.csr_matrix, b: scipy.sparse.csr_matrix) -> scipy.sparse.csr_matrix:
    return a.multiply(sparse_inverse_nonzero(b))

def sparse_inverse_nonzero(a: scipy.sparse.csr_matrix) -> scipy.sparse.csr_matrix:
    inv_a = a.copy()
    inv_a.data = 1 / inv_a.data
    return inv_a

scores = scipy.sparse.lil_matrix(y_pred.shape)
scores[y_pred.multiply(y_true).astype(np.bool)] = 1
scores = scores.tocsr()
scores = sparse_divide_nonzero(scores, scipy.sparse.csr_matrix(y_true.sum(axis=1))).sum(axis=1)
print("recall @ {}: {:.4f}".format(K, scores.mean()))

recall @ 20: 0.0369


# 7. Evaluate qualitatively
Options are:
- **By example**, i.e. show history and recommendations for user, including item metadata and (if possible) an explanation (i.e., because you watched $i_x$ we recommend $i_y$)
- **Plot**, i.e. show plot that summarises recommendations, such as classic plots (i.e. distribution of popular items recommendations, recall versus user session length etc.) 
- **Advanced plots**, i.e. network visualisation or **sankey-diagram** ( https://plotly.com/python/sankey-diagram/)
- **Transparent surrogate model**, i.e. summarise recommender with a decision tree-model fitted on top of the recommendations, i.e. $\%x$ of recommendations are items viewed more than 10 times, $\%y$ of recommendations are viewed more than 10 times and in category $c_1$, etc.

In [None]:
#Display popular items