In [116]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer
from pymongo import MongoClient
import pickle

client = MongoClient('mongodb://localhost:27017/')
db = client['movie-web']
col_movies = db['movies']
cur_movies = col_movies.find()
col_actormovies = db['actormovies']
cur_actormovies = col_actormovies.find()
col_actors = db['actors']
cur_actors = col_actors.find()
list_movies = []
list_actormovies = []
list_actors = []
for i in cur_movies:
    list_movies.append(i)
for i in cur_actormovies:
    list_actormovies.append(i)
for i in cur_actors:
    list_actors.append(i)
movies = pd.DataFrame(list_movies)
actormovies = pd.DataFrame(list_actormovies)
actors = pd.DataFrame(list_actors)
movies = movies.drop(['img', 'imgTitle', 'createdAt', 'updatedAt', 'trailer', 'year', 'limit', 'isSeries', '__v', 'numRate', 'rate', 'duration', 'countView', 'epNum'], axis=1)

df = pd.merge(actormovies[['movie', 'actor', 'character']],actors[['_id','name']],left_on='actor', right_on='_id', how='left').drop(columns = ['_id'])
df2 = pd.merge(movies, df, left_on='_id', right_on='movie', how='left').drop(columns = ['movie', 'actor'])
df2 = df2.fillna('')
df2 = df2.groupby('_id').agg({'title':'first', 
                              'desc':'first',
                              'genre':'first',
                             'character': ' '.join, 
                             'name': ' '.join,  }).reset_index()

df2['tags'] = df2['title'] + ' ' + df2['desc'] + ' ' + df2['genre'] + ' ' + df2['character'] + ' ' + df2['name']

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df2['tags'])

with open('vectorizer.pk', 'wb') as fin:
    pickle.dump(tfidf, fin)
    pickle.dump(tfidf_matrix, fin)
    pickle.dump(df2['_id'], fin)

In [117]:
from sklearn.metrics.pairwise import linear_kernel
import pickle
from sklearn.decomposition import TruncatedSVD
from scipy import sparse

with open('vectorizer.pk', 'rb') as fin:
    vectorizer = pickle.load(fin)
    matrix_tfidf = pickle.load(fin)
    df = pickle.load(fin)
df

0    633dc3b4f611a788116b43d8
1    633dc52cf611a788116b43dc
2    633dd3e4f611a788116b43e7
3    6354e362c834f95698ef0221
4    6354ebb6c834f95698ef02ca
5    6354ec56c834f95698ef02cd
6    637a48a12d686b1096842328
Name: _id, dtype: object

In [5]:
# def get_recommendations(title):

#     df3 = pd.concat([df2['tags'], pd.Series([title])], ignore_index = True)
    
#     tfidf = TfidfVectorizer(stop_words='english')

#     tfidf_matrix = tfidf.fit_transform(df3)
    
#     cosine_sim = linear_kernel(tfidf_matrix[df3.index.max()], tfidf_matrix)
    
#     sim_scores = list(enumerate(cosine_sim[0]))
#     sim_scores = sim_scores[:-1]
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

#     sim_scores = sim_scores[0:10]

#     movie_indices = [i[0] for i in sim_scores]

#     return df2['title'].iloc[movie_indices].astype('string')

# list(get_recommendations('hoạt hình'))
from sklearn.metrics.pairwise import linear_kernel
import pickle
from sklearn.decomposition import TruncatedSVD
from scipy import sparse

with open('vectorizer.pk', 'rb') as fin:
    vectorizer = pickle.load(fin)
    matrix_tfidf = pickle.load(fin)
    df = pickle.load(fin)

def get_recommendations(title):
    m_new = sparse.vstack((matrix_tfidf, vectorizer.transform([title])))
    svd = TruncatedSVD(n_components=10)
    svd.fit(m_new)
    svd_tfidf_vector = svd.transform(m_new)
    svd_query = np.reshape(svd_tfidf_vector[-1],(1,svd_tfidf_vector[-1].size))
    cosine_sim = linear_kernel(svd_query, svd_tfidf_vector)
    sim_scores = list(enumerate(cosine_sim[0]))
    sim_scores = sim_scores[:-1]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[0:10]
    movie_indices = [i[0] for i in sim_scores]
    return df2['title'].iloc[movie_indices].astype('string')
list(get_recommendations('Benedict'))

['Doctor Strange',
 'Doctor Strange in Multiverse of Madness',
 'Death on the Nile',
 'Nope',
 'Stranger Things 4',
 'Inside Out',
 'Coco']

In [119]:
userratings = db['userratings']
cursor = userratings.find()
list_cur = []
for i in cursor:
    list_cur.append(i)
ratings = pd.DataFrame(list_cur)
ratings = ratings.drop(['_id', '__v', 'createdAt', 'updatedAt'], axis=1)
ratings

Unnamed: 0,movie,user,rating
0,633dc3b4f611a788116b43d8,633e8ba2bc3a40391db45a5c,5
1,633dc3b4f611a788116b43d8,633e900bbc3a40391db45a7f,5
2,633dc52cf611a788116b43dc,633b30e4edfee7840d1006c6,4
3,633dc52cf611a788116b43dc,633e8ba2bc3a40391db45a5c,5
4,633dd3e4f611a788116b43e7,633e8ba2bc3a40391db45a5c,2
5,633dd3e4f611a788116b43e7,633b30e4edfee7840d1006c6,2
6,633dc52cf611a788116b43dc,633e900bbc3a40391db45a7f,3
7,6354ebb6c834f95698ef02ca,633e8ba2bc3a40391db45a5c,2
8,6354ec56c834f95698ef02cd,633e8ba2bc3a40391db45a5c,5


In [120]:
def get_items_rated_by_user(user_id):
    y = ratings.iloc[:,1].astype('string')
    ids = np.where(y == user_id)[0]
    item_ids = ratings.iloc[ids, 0].astype('string')
    scores = ratings.iloc[ids, 2]
    return (item_ids, scores)
get_items_rated_by_user('633e8ba2bc3a40391db45a5c')

(0    633dc3b4f611a788116b43d8
 3    633dc52cf611a788116b43dc
 4    633dd3e4f611a788116b43e7
 7    6354ebb6c834f95698ef02ca
 8    6354ec56c834f95698ef02cd
 Name: movie, dtype: string,
 0    5
 3    5
 4    2
 7    2
 8    5
 Name: rating, dtype: int64)

In [122]:
from sklearn.linear_model import Ridge
from sklearn import linear_model
svd = TruncatedSVD(n_components=10)
svd.fit(matrix_tfidf)
svd_tfidf_vector = svd.transform(matrix_tfidf)
d = svd_tfidf_vector.shape[1] 
users = ratings['user'].astype('string').unique()
W = np.zeros((d, users.size))
b = np.zeros((1, users.size))
# df = pd.DataFrame(df)
for idx, n in enumerate(users):    
    ids, scores = get_items_rated_by_user(n)
    clf = Ridge(alpha = 1, fit_intercept  = True)
    ids_items = df[df.astype('string').isin(ids)].index.tolist()
    Xhat = svd_tfidf_vector[ids_items, :]
    clf.fit(Xhat, scores) 
    W[:, idx] = clf.coef_
    b[0, idx] = clf.intercept_
print(b)
Yhat = svd_tfidf_vector.dot(W) + b
# print(Yhat)
# Yhat = svd_tfidf_vector.dot(W) + b
Yhat
# with open('model_content_based.pk', 'wb') as fin:
#     pickle.dump(Yhat, fin)

[[3.8106122 4.        3.       ]]


array([[4.4053061 , 4.5       , 3.        ],
       [4.4053061 , 3.5       , 3.5       ],
       [2.9053061 , 4.        , 2.5       ],
       [3.90426896, 4.07874366, 3.        ],
       [2.96929883, 4.        , 3.        ],
       [4.31478287, 4.        , 3.        ],
       [3.17010554, 4.        , 2.64624857]])

In [19]:
items = ratings['movie'].astype('string').unique()
def recommend(user_id, top):
    item = {'userId': None, 'movieId': None, 'score': None}
    list_items = []
    def take_score(elem):
        return elem['score']
    items_rated_by_user, score = get_items_rated_by_user(user_id)
    for idx, n in enumerate(df.astype('string')): 
        if n not in items_rated_by_user.tolist():
            item['userId'] = user_id
            item['movieId'] = n
            item['score'] = Yhat[idx, np.where(users == user_id)[0][0]]
            list_items.append(item.copy())  
    sorted_items = sorted(list_items, key=take_score, reverse=True)
    sorted_items = sorted_items[:top]
    return sorted_items
df3 = pd.DataFrame(columns=['userId', 'movieId', 'score'])
for u in users:
    df3 = pd.concat([df3, pd.DataFrame(recommend(u, 10))])
df3

Unnamed: 0,userId,movieId,score
0,633e8ba2bc3a40391db45a5c,6354e362c834f95698ef0221,3.826332
1,633e8ba2bc3a40391db45a5c,637a48a12d686b1096842328,3.643331
0,633e900bbc3a40391db45a7f,6354e362c834f95698ef0221,4.019686
1,633e900bbc3a40391db45a7f,633dd3e4f611a788116b43e7,4.0
2,633e900bbc3a40391db45a7f,6354ebb6c834f95698ef02ca,4.0
3,633e900bbc3a40391db45a7f,6354ec56c834f95698ef02cd,4.0
4,633e900bbc3a40391db45a7f,637a48a12d686b1096842328,4.0
0,633b30e4edfee7840d1006c6,633dc3b4f611a788116b43d8,3.0
1,633b30e4edfee7840d1006c6,6354e362c834f95698ef0221,3.0
2,633b30e4edfee7840d1006c6,6354ebb6c834f95698ef02ca,3.0


In [15]:
items = ratings['movie'].astype('string').unique()
a = np.zeros((df.size,))
recommended_items = []
items_rated_by_user, score = get_items_rated_by_user('633e900bbc3a40391db45a7f')
item = {'userId': None, 'movieId': None, 'score': None}
list_items = []
def take_score(elem):
    return elem['score']
for idx, n in enumerate(df.astype('string')): 
    if n not in items_rated_by_user.tolist():
        item['userId'] = '633e900bbc3a40391db45a7f'
        item['movieId'] = n
        item['score'] = Yhat[idx, np.where(users == '633e900bbc3a40391db45a7f')[0][0]]
        a[idx] = Yhat[idx, np.where(users == '633e900bbc3a40391db45a7f')[0][0]]
        list_items.append(item.copy())  
sorted_items = sorted(list_items, key=take_score, reverse=True)
sorted_items = sorted_items[:10]
recommended_items = np.argsort(a)[-10:][::-1]
print(recommended_items)
df2['title'].iloc[recommended_items]
print(sorted_items)
# df3 = pd.DataFrame(columns=['userId', 'movieId', 'similar'])
# users = df_std.columns.tolist()
# for u in users:
#     df = pd.concat([df, pd.DataFrame(recommend_top(u, 10))])

[3 6 5 4 2 1 0]
[{'userId': '633e900bbc3a40391db45a7f', 'movieId': '6354e362c834f95698ef0221', 'score': 4.019685915439231}, {'userId': '633e900bbc3a40391db45a7f', 'movieId': '633dd3e4f611a788116b43e7', 'score': 4.0}, {'userId': '633e900bbc3a40391db45a7f', 'movieId': '6354ebb6c834f95698ef02ca', 'score': 4.0}, {'userId': '633e900bbc3a40391db45a7f', 'movieId': '6354ec56c834f95698ef02cd', 'score': 4.0}, {'userId': '633e900bbc3a40391db45a7f', 'movieId': '637a48a12d686b1096842328', 'score': 4.0}]


In [23]:
import sys
import numpy as np
import pandas as pd

recommend = pd.read_csv('recommendCB.csv')
ids = np.where(recommend.iloc[:, 0] == '633e900bbc3a40391db45a7f')[0]
movies = recommend.iloc[ids, 1].tolist()
# sys.stdout.write(str(movies))
str(movies)

"['6354e362c834f95698ef0221', '633dd3e4f611a788116b43e7', '6354ebb6c834f95698ef02ca', '6354ec56c834f95698ef02cd', '637a48a12d686b1096842328']"

In [139]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer
from pymongo import MongoClient
from underthesea import word_tokenize
import pickle
from bson import ObjectId
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import Ridge
from scipy import sparse
from sklearn.metrics.pairwise import linear_kernel


class CB:
    def __init__(self):
        client = MongoClient('mongodb://localhost:27017/')
        db = client['movie-web']
        self.col_m = db.movies
        self.col_am = db.actormovies
        self.col_a = db.actors
        self.col_ur = db.userratings
        
    def get_items_rated_by_user(self, u):
        y = self.ratings.iloc[:,1]
        ids = np.where(y == u)[0]
        item_ids = self.ratings.iloc[ids, 0]
        scores = self.ratings.iloc[ids, 2]
        return (item_ids, scores)
    
    def fit(self):
        cur_m = self.col_m.find({}, {"title": 1, "desc": 1, "genre": 1})
        cur_am = self.col_am.find({}, {"movie": 1, "actor": 1, "character": 1, "_id": 0})
        cur_a = self.col_a.find({}, {"name": 1})
        cur_ur = self.col_ur.find({}, {"movie": 1, "user": 1, "rating": 1, "_id": 0})
        list_ur = []
        list_m = []
        list_am = []
        list_a = []
        for i in cur_ur:
            list_ur.append(i)
        self.ratings = pd.DataFrame(list_ur)
        
        for i in cur_m:
            list_m.append(i)
        for i in cur_am:
            list_am.append(i)
        for i in cur_a:
            list_a.append(i)
        m = pd.DataFrame(list_m)
        am= pd.DataFrame(list_am)
        a = pd.DataFrame(list_a)
        
        df = pd.merge(am, a, left_on='actor', right_on='_id', how='left').drop(columns = ['_id', 'actor'])
        df2 = pd.merge(m, df, left_on='_id', right_on='movie', how='left').drop(columns = ['movie'])
        df2 = df2.fillna('')
        df2 = df2.groupby('_id').agg({'title':'first', 
                              'desc':'first',
                              'genre':'first',
                              'character': ' '.join, 
                              'name': ' '.join,  }).reset_index()

        df2['tags'] = (df2['title'] + ' ' + df2['desc'] + ' ' + df2['genre'] + ' ' + df2['character'] 
                       + ' ' + df2['name'])
        
        tfidf = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = tfidf.fit_transform(df2['tags'])
#         df2['tags'] = df2.apply(lambda row: word_tokenize(row['tags'], format="text"), axis=1)
#         stop_words
        svd = TruncatedSVD(n_components=10)
        vectors = svd.fit_transform(tfidf_matrix)
        d = vectors.shape[1] 
        self.users = self.ratings['user'].unique()
        self.movies = df2['_id']
        W = np.zeros((d, self.users.size))
        b = np.zeros((1, self.users.size))
        
        for n, u in enumerate(self.users):    
            ids, scores = self.get_items_rated_by_user(u)
            clf = Ridge(alpha = 1, fit_intercept  = True)
            idx = df2[df2['_id'].isin(ids)].index.tolist()
            ids_items = df2[df.isin(ids)].index.tolist()
            Xhat = vectors[idx, :]
            clf.fit(Xhat, scores) 
            W[:, n] = clf.coef_
            b[0, n] = clf.intercept_

        Yhat = vectors.dot(W) + b
        return Yhat

    def recommend_top(self, u, top):
        if u in self.users:
            item = {'movieId': None, 'score': None}
            list_items = []
            def take_score(elem):
                return elem['score']
            ids, score = self.get_items_rated_by_user(u)
            for n, m in enumerate(self.movies): 
                if m not in ids.tolist():
                    item['movieId'] = m
                    item['score'] = Yhat[n, np.where(self.users == u)[0][0]]
                    list_items.append(item.copy())  
            sorted_items = sorted(list_items, key=take_score, reverse=True)
            sorted_items = sorted_items[:top]
            result = []
            for i in sorted_items:
                result.append(str(i['movieId']))
            return result 
        return []
    
    def search(self, key):
        tfidf = TfidfVectorizer(stop_words='english')
        m_new = sparse.vstack((self.tfidf_matrix, tfidf.fit_transform([key])))
        svd = TruncatedSVD(n_components=10)
        svd_tfidf_vector = svd.fit_transform(m_new)
        svd_query = np.reshape(svd_tfidf_vector[-1],(1,svd_tfidf_vector[-1].size))
        cosine_sim = linear_kernel(svd_query, svd_tfidf_vector)
        sim_scores = list(enumerate(cosine_sim[0]))
        sim_scores = sim_scores[:-1]
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[0:10]
        movie_indices = [i[0] for i in sim_scores]
        return self.movies.iloc[movie_indices].astype('string')

In [140]:
cb = CB()
cb.fit()
cb.search('Doctor Strange')


NotFittedError: The TF-IDF vectorizer is not fitted