In [1]:
import os
import sys
import django
from django.db import connection
from tqdm import tqdm
import logging
import pandas as pd
from datetime import datetime
import re

In [2]:
sys.path.insert(0, "/Users/victornguyen/Sites/07.book_management")
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "book_management.settings")
django.setup()
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
logger = logging.getLogger('User similarity calculator')

# Import Models
from title.models import Title
from main_site.models import BookSimilarity

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
features = ['author', 'author', 'faculty', 'faculty', 'publisher']

In [4]:
def clean_data(x):
    if isinstance(x, str):
        x = str.lower(x.replace(" ", ""))
        x = x.replace("unknown", " ")
        return x.replace(",", " ")
    else:
        return ''

In [14]:
def create_bag_of_words(x):
    name_temp = re.sub(r"[^a-zA-Z]+", ' ', x['name'])
    name_temp = name_temp.replace(" st","")
    name_temp = name_temp.replace(" nd","")
    name_temp = name_temp.replace(" rd","")
    name_temp = name_temp.replace(" th","")
    name_temp = name_temp.replace(" ed","")
    temp = ''
    temp += str.lower(name_temp)
    temp += str.lower(name_temp)
    temp += str.lower(name_temp)
    for feature in features:
        temp += x[feature + '_clean'] + ' '
    return temp

In [6]:
def load_data():
    db_book_list = Title.objects.all().order_by('id')
    db_book_list = [Title.book_info_as_dict(book) for book in db_book_list]
    db_book_list = pd.DataFrame.from_dict(db_book_list, orient='columns')
    db_book_list['type'] = 'database'
    return db_book_list

In [15]:
logger.info("Calculate item similarity")
logger.info("Load all item")
book_list_df = load_data()
book_list_df.head()

2019-07-19 02:12:22,420 : INFO : Calculate item similarity
2019-07-19 02:12:22,422 : INFO : Load all item


Unnamed: 0,author,faculty,id,isbn,location,name,publisher,year,type
0,Project management Institute,ISE,1,,,A guide to the project management body of know...,"Project Management Institute, Inc.",2008,database
1,"Igor A. Karnovsky, Olga Lebed",CE,2,,,Advanced Methods of Structural Analysis,Springer,2010,database
2,Alan L. Harvey,BT,3,,,Advances in Drug Discovery Techniques,John Wiley & Sons,1998,database
3,Frank K. Reilly,"BA, FN",4,,,Analysis of investment & management of portfol...,South-Western Cengage Learning,2012,database
4,"John S. Lucas, Paul C. Southgate",BT,5,,,Aquaculture: Farming Aquatic Animals and Plant...,John Wiley & Sons,2011,database


In [16]:
logger.info("Save rating into database")
logger.info("Clean data")
# Apply clean_data function to your features.
for feature in features:
    book_list_df[feature + '_clean'] = book_list_df[feature].apply(clean_data)
book_list_df.head()

2019-07-19 02:12:24,176 : INFO : Save rating into database
2019-07-19 02:12:24,180 : INFO : Clean data


Unnamed: 0,author,faculty,id,isbn,location,name,publisher,year,type,author_clean,faculty_clean,publisher_clean
0,Project management Institute,ISE,1,,,A guide to the project management body of know...,"Project Management Institute, Inc.",2008,database,projectmanagementinstitute,ise,projectmanagementinstitute inc.
1,"Igor A. Karnovsky, Olga Lebed",CE,2,,,Advanced Methods of Structural Analysis,Springer,2010,database,igora.karnovsky olgalebed,ce,springer
2,Alan L. Harvey,BT,3,,,Advances in Drug Discovery Techniques,John Wiley & Sons,1998,database,alanl.harvey,bt,johnwiley&sons
3,Frank K. Reilly,"BA, FN",4,,,Analysis of investment & management of portfol...,South-Western Cengage Learning,2012,database,frankk.reilly,ba fn,south-westerncengagelearning
4,"John S. Lucas, Paul C. Southgate",BT,5,,,Aquaculture: Farming Aquatic Animals and Plant...,John Wiley & Sons,2011,database,johns.lucas paulc.southgate,bt,johnwiley&sons


In [17]:
# Create meta bag of words
book_list_df['bag_of_words'] = book_list_df.apply(create_bag_of_words, axis=1)
for index, row in book_list_df.iterrows():
    print(row['bag_of_words'],'\n')
book_list_df.head()

a guide toe project management body of knowledge a guide toe project management body of knowledge a guide toe project management body of knowledge projectmanagementinstitute projectmanagementinstitute ise ise projectmanagementinstitute inc.  

advanced methods of structural analysisadvanced methods of structural analysisadvanced methods of structural analysisigora.karnovsky olgalebed igora.karnovsky olgalebed ce ce springer  

advances in drug discovery techniquesadvances in drug discovery techniquesadvances in drug discovery techniquesalanl.harvey alanl.harvey bt bt johnwiley&sons  

analysis of investment management of portfolios analysis of investment management of portfolios analysis of investment management of portfolios frankk.reilly frankk.reilly ba fn ba fn south-westerncengagelearning  

aquaculture farming aquatic animals and plants aquaculture farming aquatic animals and plants aquaculture farming aquatic animals and plants johns.lucas paulc.southgate johns.lucas paulc.south

Unnamed: 0,author,faculty,id,isbn,location,name,publisher,year,type,author_clean,faculty_clean,publisher_clean,bag_of_words
0,Project management Institute,ISE,1,,,A guide to the project management body of know...,"Project Management Institute, Inc.",2008,database,projectmanagementinstitute,ise,projectmanagementinstitute inc.,a guide toe project management body of knowled...
1,"Igor A. Karnovsky, Olga Lebed",CE,2,,,Advanced Methods of Structural Analysis,Springer,2010,database,igora.karnovsky olgalebed,ce,springer,advanced methods of structural analysisadvance...
2,Alan L. Harvey,BT,3,,,Advances in Drug Discovery Techniques,John Wiley & Sons,1998,database,alanl.harvey,bt,johnwiley&sons,advances in drug discovery techniquesadvances ...
3,Frank K. Reilly,"BA, FN",4,,,Analysis of investment & management of portfol...,South-Western Cengage Learning,2012,database,frankk.reilly,ba fn,south-westerncengagelearning,analysis of investment management of portfolio...
4,"John S. Lucas, Paul C. Southgate",BT,5,,,Aquaculture: Farming Aquatic Animals and Plant...,John Wiley & Sons,2011,database,johns.lucas paulc.southgate,bt,johnwiley&sons,aquaculture farming aquatic animals and plants...


In [None]:
# Convert word to vector and calculate similarity
# instantiating and generating the count matrix
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(book_list_df['bag_of_words'])
# generating the cosine similarity matrix base on cosine_sim
cosine_sim_matrix = cosine_similarity(count_matrix, count_matrix)