In [24]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
x = ["hello hello my name is Diana", "hello my name is Zarko"]

In [None]:
vec = CountVectorizer() # CountVectorizer() is used to convert a collection of text documents to a matrix of token counts
x_cv = vec.fit_transform(x)

In [27]:
vec.get_feature_names_out()

array(['diana', 'hello', 'is', 'my', 'name', 'zarko'], dtype=object)

In [28]:
x_cv.toarray()

array([[1, 2, 1, 1, 1, 0],
       [0, 1, 1, 1, 1, 1]])

In [29]:
pd.DataFrame(x_cv.toarray(), columns=vec.get_feature_names_out())

Unnamed: 0,diana,hello,is,my,name,zarko
0,1,2,1,1,1,0
1,0,1,1,1,1,1


In [30]:
vec = TfidfVectorizer()
x_tfidf = vec.fit_transform(x)

pd.DataFrame(data = x_tfidf.toarray(),
             columns = vec.get_feature_names_out())



Unnamed: 0,diana,hello,is,my,name,zarko
0,0.469132,0.667582,0.333791,0.333791,0.333791,0.0
1,0.0,0.40909,0.40909,0.40909,0.40909,0.574962


In [31]:
cosine_similarity(x_tfidf)

array([[1.        , 0.68275315],
       [0.68275315, 1.        ]])

In [32]:
df = pd.read_csv('/Users/zeal.v/Desktop/Computer-Vision_Project_Masterschool/DATA/Books.csv',
                 on_bad_lines='skip',
                 sep=';',)
df.head()   

Unnamed: 0,ISBN,Title,Author,Year,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company


In [42]:
user_ratings = pd.read_csv('/Users/zeal.v/Desktop/Computer-Vision_Project_Masterschool/DATA/users-ratings.csv')
user_ratings.head()

Unnamed: 0,User-ID,Age,ISBN,Rating
0,243,,60915544,10
1,243,,60977493,7
2,243,,156006529,0
3,243,,316096199,0
4,243,,316601950,9


In [34]:
df = df[df['ISBN'].isin(user_ratings['ISBN'])]

In [35]:
df.dropna(subset=['Title', 'Author'], inplace=True)

In [36]:
df.drop_duplicates(subset=['Title', 'Author'], inplace=True)

In [37]:
df['Title'] = df['Title'].apply(lambda x: x.lower())
df['Author'] = df['Author'].apply(lambda x: x.lower())

In [38]:
df['text'] = df['Title'] + ' ' + df['Author']

In [39]:
df.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher,text
18,440234743,the testament,john grisham,1999,Dell,the testament john grisham
19,452264464,beloved (plume contemporary fiction),toni morrison,1994,Plume,beloved (plume contemporary fiction) toni morr...
26,971880107,wild animus,rich shapero,2004,Too Far,wild animus rich shapero
27,345402871,airframe,michael crichton,1997,Ballantine Books,airframe michael crichton
28,345417623,timeline,michael crichton,2000,Ballantine Books,timeline michael crichton


In [40]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['text'])

In [41]:
tfidf_matrix.shape

(1798, 3143)

In [43]:
cosine_sim = cosine_similarity(tfidf_matrix)

In [44]:
def get_recommendations(isbn, cosine_sim = cosine_sim, top_n = 10):
  idx = df[df['ISBN']==isbn].index
  sim_scores = cosine_sim[idx][0]

  sim_scores = sorted(list(enumerate(sim_scores)), key = lambda x: x[1], reverse=True) #tuples (index, score)

  sim_scores = sim_scores[1:top_n+1]
  book_indicies = [i[0] for i in sim_scores]

  return df.iloc[book_indicies]

In [45]:
df[df['ISBN']=='0440234743']

Unnamed: 0,ISBN,Title,Author,Year,Publisher,text
18,440234743,the testament,john grisham,1999,Dell,the testament john grisham


In [48]:
def get_recommendations(isbn, cosine_sim=cosine_sim, top_n=10):
    idx = df[df['ISBN'] == isbn].index[0]
    sim_scores = cosine_sim[idx]

    sim_scores = sorted(list(enumerate(sim_scores)), key=lambda x: x[1], reverse=True)
    sim_scores = [score for score in sim_scores if df.iloc[score[0]]['ISBN'] != isbn]

    sim_scores = sim_scores[:top_n]
    book_indices = [i[0] for i in sim_scores]

    return df.iloc[book_indices]

get_recommendations('0440234743')

Unnamed: 0,ISBN,Title,Author,Year,Publisher,text
73,140067477,the tao of pooh,benjamin hoff,1983,Penguin Books,the tao of pooh benjamin hoff
19,452264464,beloved (plume contemporary fiction),toni morrison,1994,Plume,beloved (plume contemporary fiction) toni morr...
26,971880107,wild animus,rich shapero,2004,Too Far,wild animus rich shapero
27,345402871,airframe,michael crichton,1997,Ballantine Books,airframe michael crichton
28,345417623,timeline,michael crichton,2000,Ballantine Books,timeline michael crichton
37,446310786,to kill a mockingbird,harper lee,1988,Little Brown & Company,to kill a mockingbird harper lee
38,449005615,seabiscuit: an american legend,laura hillenbrand,2002,Ballantine Books,seabiscuit: an american legend laura hillenbrand
39,60168013,pigs in heaven,barbara kingsolver,1993,Harpercollins,pigs in heaven barbara kingsolver
45,671888587,i'll be seeing you,mary higgins clark,1994,Pocket,i'll be seeing you mary higgins clark
46,553582747,from the corner of his eye,dean koontz,2001,Bantam Books,from the corner of his eye dean koontz
