# Content-Based Recommendation System

In [1]:
import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm 

In [2]:
# constants
# Location of the input CSV, given relative to the notebook's working directory.
PATH = '../data/data.csv'

## Import Data

In [3]:
# Read the CSV at PATH into a DataFrame; the trailing expression displays (rows, columns).
df = pd.read_csv(PATH)
df.shape

(300000, 9)

In [4]:
# Preview the first five rows of the loaded data.
df.head()

Unnamed: 0,book_id,author_id,book_genre,reader_id,num_pages,book_rating,publisher_id,publish_year,book_price
0,45,446,2,2272,553,4,10,2004,23
1,2413,23,7,3515,318,2,38,2015,15
2,2095,431,8,18547,208,3,48,2000,23
3,2431,211,4,19675,340,10,34,2000,117
4,2871,78,3,23453,610,2,24,2012,111


## Book Recommendation

In [5]:
def normalize(data):
    '''
    This function will scale the input data to be between 0 and 1.

    If the smallest value is negative, every value is first shifted up by
    abs(min) so the smallest value becomes 0. The (possibly shifted) values
    are then divided by their maximum. Note this is max-scaling, not min-max
    scaling: input that is already non-negative is not shifted, so its
    minimum maps to min/max rather than to 0.

    params:
        data (List) : The list (or array) of values you want to normalize

    returns:
        A list with the input data normalized between 0 and 1. An empty
        input gives an empty list. Input whose maximum is 0 after shifting
        (all zeros, or a constant negative value) gives all zeros rather
        than dividing by zero.
    '''
    # len() works for both lists and numpy arrays (truthiness of an array is ambiguous)
    if len(data) == 0:
        return []

    min_val = min(data)
    if min_val < 0:
        shift = abs(min_val)
        data = [x + shift for x in data]

    max_val = max(data)
    if max_val == 0:
        # every value is 0 here, so there is no scale to divide by
        return [0.0 for _ in data]
    return [x / max_val for x in data]

In [6]:
# scale the page-count, rating and price columns into new "<column>_norm" columns
for col in ('num_pages', 'book_rating', 'book_price'):
    df[col + '_norm'] = normalize(df[col].values)

In [7]:
def ohe(df, enc_col):
    '''
    This function will one hot encode the specified column and add the
    resulting indicator columns onto the input dataframe.

    The new columns are named after the raw category values of enc_col (no
    prefix is added), so encoding two columns that share values produces
    duplicate column names.

    params:
        df (DataFrame) : The dataframe you wish for the results to be appended to
        enc_col (String) : The column you want to OHE

    returns:
        A new dataframe holding the original columns followed by the OHE
        columns. The input dataframe is not modified and its index is kept.
    '''
    # get_dummies keeps df's index, so concat pairs every row with its own
    # indicators even when df does not use a default 0..n-1 index.
    ohe_df = pd.get_dummies(df[enc_col])
    return pd.concat([df, ohe_df], axis = 1)

In [8]:
# one hot encode the publish year first, then the genre
for enc_col in ('publish_year', 'book_genre'):
    df = ohe(df = df, enc_col = enc_col)

In [9]:
# remove the raw columns that now have normalized / one hot encoded versions,
# then key the rows by book_id
cols = ['publish_year', 'book_genre', 'num_pages', 'book_rating', 'book_price']
df = df.drop(columns = cols).set_index('book_id')

In [10]:
def cosine_sim(v1,v2):
    '''
    This function will calculate the cosine similarity between two vectors

    params:
        v1 (array-like) : The first vector
        v2 (array-like) : The second vector, same length as v1

    returns:
        dot(v1, v2) / (norm(v1) * norm(v2)). If either vector has zero
        length the similarity is undefined, so 0.0 is returned instead of
        the NaN (and runtime warning) that a 0/0 division would give.
    '''
    denom = norm(v1) * norm(v2)
    if denom == 0:
        return 0.0
    return dot(v1, v2) / denom

In [11]:
def recommend(df, book_id, n_rec):
    """
    Rank the rows of df by cosine similarity to the row labelled book_id.

    Every column of df except 'sim' is used as a feature, so any identifier
    columns still present in df take part in the similarity as well.

    params:
        df (DataFrame) : Feature dataframe indexed by book id; all feature
            columns must be numeric
        book_id : Index label of the book to find recommendations for. If the
            label occurs more than once, the first matching row is used
        n_rec (int) : Number of recommendations to return

    returns:
        The n_rec rows of df with the highest similarity, including the 'sim'
        column. The query row itself is part of the ranking.

    raises:
        KeyError : If book_id is not in the index of df

    side effects:
        Adds (or overwrites) a 'sim' column on the df that was passed in.
    """
    # Leave out a 'sim' column written by an earlier call; otherwise it would be
    # treated as a feature and repeated calls would give different scores.
    features = df.drop(columns=['sim'], errors='ignore')
    matrix = features.to_numpy(dtype=float)

    # .loc with a list always returns a frame, so a duplicated book_id still
    # yields a single 1-D query vector (the first match).
    query = features.loc[[book_id]].to_numpy(dtype=float)[0]

    dots = matrix.dot(query)
    denom = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    # Rows with a zero norm have no defined cosine similarity; score them 0.
    df['sim'] = np.divide(dots, denom, out=np.zeros_like(dots), where=denom != 0)
    return df.nlargest(columns='sim', n=n_rec)

In [12]:
# run on a sample for quick results; the fixed random_state keeps the sample
# (and therefore the displayed recommendations) the same after a kernel restart
t = df.sample(n=1000, random_state=42).copy()
recommend(t, t.index[0], 5)

Unnamed: 0_level_0,author_id,reader_id,publisher_id,num_pages_norm,book_rating_norm,book_price_norm,2000,2001,2002,2003,...,2,3,4,5,6,7,8,9,10,sim
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1783,146,697,12,0.437143,0.7,0.48,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1.0
2407,162,756,11,0.511429,0.2,0.51,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0.999982
1329,268,1256,27,0.391429,0.7,0.57,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0.999982
1979,400,1831,25,0.375714,1.0,0.82,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0.999954
2047,215,983,12,0.565714,0.2,0.77,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0.999947
