In [1]:
# Import libraries
import pandas as pd
import plotly.express as px
from pathlib import Path
import numpy as np
import re
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix

In [2]:
# Load CSV
books_path = Path('Resources/books_cleaned.csv')
ratings_path = Path('Resources/ratings.csv')
users_path = Path('Resources/users_cleaned.csv')
books_df = pd.read_csv(books_path, index_col=0)
ratings_df = pd.read_csv(ratings_path, sep=';', on_bad_lines='warn', encoding='latin-1', index_col=False)
users_df = pd.read_csv(users_path, index_col=0)

In [3]:
ratings_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [4]:
# Check head
books_df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [5]:
books_df.shape

(271360, 8)

In [6]:
ratings_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [7]:
type(ratings_df.iloc[0, 0])

numpy.int64

In [8]:
users_df.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [9]:
type(users_df.iloc[0, 0])

numpy.int64

In [10]:
type(users_df.iloc[0, 1])

str

In [11]:
users_df.iloc[1, 1]

'stockton, california, usa'

In [12]:
type(users_df.index[0])

numpy.int64

In [13]:
 #users_df.dropna(inplace=True)

In [14]:
users_df.shape

(278858, 3)

In [15]:
ratings_df['Book-Rating'].max()

10

In [16]:
ratings_df['Book-Rating'].min()

0

In [17]:
ratings_df.set_index('User-ID', inplace=True)
ratings_df.head()

Unnamed: 0_level_0,ISBN,Book-Rating
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1
276725,034545104X,0
276726,0155061224,5
276727,0446520802,0
276729,052165615X,3
276729,0521795028,6


In [18]:
users_df.set_index('User-ID', inplace=True)
users_df.head()

Unnamed: 0_level_0,Location,Age
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"nyc, new york, usa",
2,"stockton, california, usa",18.0
3,"moscow, yukon territory, russia",
4,"porto, v.n.gaia, portugal",17.0
5,"farnborough, hants, united kingdom",


In [19]:
n_books = len(books_df)
n_ratings = len(ratings_df)
n_users = len(users_df)

In [25]:
print(f"Number of ratings: {n_ratings}")
print(f"Number of unique movieId's: {books_df}")
print(f"Number of unique users: {n_users}")
print(f"Average ratings per user: {round(n_ratings/n_users, 2)}")
print(f"Average ratings per book: {round(n_ratings/n_books, 2)}")

Number of ratings: 1149780
Number of unique movieId's:               ISBN                                         Book-Title  \
0       0195153448                                Classical Mythology   
1       0002005018                                       Clara Callan   
2       0060973129                               Decision in Normandy   
3       0374157065  Flu: The Story of the Great Influenza Pandemic...   
4       0393045218                             The Mummies of Urumchi   
...            ...                                                ...   
271355  0440400988                         There's a Bat in Bunk Five   
271356  0525447644                            From One to One Hundred   
271357  006008667X  Lily Dale : The True Story of the Town that Ta...   
271358  0192126040                        Republic (World's Classics)   
271359  0767409752  A Guided Tour of Rene Descartes' Meditations o...   

                 Book-Author  Year-Of-Publication  \
0         Mark 