In [1]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer
from pymongo import MongoClient
import pickle

# --- Load the raw collections from the local MongoDB instance ---------------
client = MongoClient('mongodb://localhost:27017/')
db = client['movie-web']

col_movies = db['movies']
cur_movies = col_movies.find()
col_actormovies = db['actormovies']
cur_actormovies = col_actormovies.find()
col_actors = db['actors']
cur_actors = col_actors.find()

# Exhaust each cursor into a list of documents (movies, then links, then actors).
list_movies = list(cur_movies)
list_actormovies = list(cur_actormovies)
list_actors = list(cur_actors)

movies = pd.DataFrame(list_movies)
actormovies = pd.DataFrame(list_actormovies)
actors = pd.DataFrame(list_actors)

# Columns of `movies` that are dropped before the text profile is built.
movies = movies.drop(columns=[
    'img', 'imgTitle', 'createdAt', 'updatedAt', 'trailer', 'year', 'limit',
    'isSeries', '__v', 'numRate', 'rate', 'duration', 'countView', 'epNum',
])

# --- One row per (movie, actor) link, carrying the actor's name -------------
df = (
    actormovies[['movie', 'actor', 'character']]
    .merge(actors[['_id', 'name']], left_on='actor', right_on='_id', how='left')
    .drop(columns=['_id'])
)

# --- Collapse to one row per movie ------------------------------------------
# Left join keeps movies without any cast row; their missing character/name
# values become '' so that ' '.join can concatenate them.
df2 = (
    movies
    .merge(df, left_on='_id', right_on='movie', how='left')
    .drop(columns=['movie', 'actor'])
    .fillna('')
    .groupby('_id')
    .agg({
        'title': 'first',
        'desc': 'first',
        'genre': 'first',
        'character': ' '.join,
        'name': ' '.join,
    })
    .reset_index()
)

# Single free-text field per movie that the vectorizer is fitted on.
df2['tags'] = (
    df2['title'] + ' '
    + df2['desc'] + ' '
    + df2['genre'] + ' '
    + df2['character'] + ' '
    + df2['name']
)

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df2['tags'])

# Three objects are pickled back-to-back into the same file, in this order:
# the fitted vectorizer, the TF-IDF matrix, and the movie ids.
with open('vectorizer.pk', 'wb') as fin:
    for artefact in (tfidf, tfidf_matrix, df2['_id']):
        pickle.dump(artefact, fin)

In [2]:
# def get_recommendations(title):

#     df3 = pd.concat([df2['tags'], pd.Series([title])], ignore_index = True)
    
#     tfidf = TfidfVectorizer(stop_words='english')

#     tfidf_matrix = tfidf.fit_transform(df3)
    
#     cosine_sim = linear_kernel(tfidf_matrix[df3.index.max()], tfidf_matrix)
    
#     sim_scores = list(enumerate(cosine_sim[0]))
#     sim_scores = sim_scores[:-1]
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

#     sim_scores = sim_scores[0:10]

#     movie_indices = [i[0] for i in sim_scores]

#     return df2['title'].iloc[movie_indices].astype('string')

# list(get_recommendations('hoạt hình'))
from sklearn.metrics.pairwise import linear_kernel
import pickle
from sklearn.decomposition import TruncatedSVD
from scipy import sparse

# 'vectorizer.pk' holds three consecutive pickles; they are read back in the
# order they are loaded here: vectorizer, TF-IDF matrix, then `df`.
# NOTE: pickle.load can execute arbitrary code — only open files you trust.
with open('vectorizer.pk', 'rb') as fin:
    vectorizer, matrix_tfidf, df = (pickle.load(fin) for _ in range(3))

def get_recommendations(title, top_n=10, n_components=10, random_state=0):
    """Return the titles of the movies most similar to a free-text query.

    The query is vectorised with the fitted ``vectorizer``, stacked under the
    catalogue matrix ``matrix_tfidf``, and the combined matrix is reduced with
    truncated SVD. Movies are then ranked by cosine similarity between their
    reduced vector and the reduced query vector.

    Parameters
    ----------
    title : str
        Free-text query.
    top_n : int, default 10
        Maximum number of titles to return.
    n_components : int, default 10
        Number of SVD components requested from ``TruncatedSVD``.
    random_state : int or None, default 0
        Seed passed to ``TruncatedSVD`` so repeated calls give the same ranking.

    Returns
    -------
    pandas.Series
        Entries of ``df2['title']`` for the best matches, best first, cast to
        ``string`` dtype. Empty when no query term is in the fitted vocabulary.

    Notes
    -----
    Titles are looked up positionally in the notebook-level ``df2``; this
    assumes its rows are in the same order as the rows of ``matrix_tfidf``
    (TODO confirm when the pickle was produced by a different session).
    """
    query_tfidf = vectorizer.transform([title])
    if query_tfidf.nnz == 0:
        # Nothing in the query is known to the vectorizer: every similarity
        # would be zero and any "top N" would be arbitrary.
        return df2['title'].iloc[[]].astype('string')

    m_new = sparse.vstack((matrix_tfidf, query_tfidf))
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    reduced = svd.fit_transform(m_new)

    # Last row is the query itself; it must not be recommended back.
    movie_vectors, query_vector = reduced[:-1], reduced[-1]

    # Rows of the SVD projection are not unit length, so a plain dot product
    # is not a cosine similarity; divide by the norms explicitly.
    dots = movie_vectors @ query_vector
    norms = np.linalg.norm(movie_vectors, axis=1) * np.linalg.norm(query_vector)
    cosine_sim = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    # Stable sort keeps catalogue order among ties, like sorted(..., reverse=True).
    movie_indices = np.argsort(-cosine_sim, kind='stable')[:top_n]
    return df2['title'].iloc[movie_indices].astype('string')
# Example call: list() turns the returned Series of titles into a plain list.
list(get_recommendations('Phù thủy Stranger'))

['Doctor Strange in Multiverse of Madness',
 'Stranger Things 4',
 'Inside Out',
 'Death on the Nile',
 'Coco',
 'Nope']

In [3]:
# Read every document of the 'userratings' collection into a DataFrame and
# drop the '_id', '__v', 'createdAt' and 'updatedAt' columns.
userratings = db['userratings']
cursor = userratings.find()
list_cur = list(cursor)
ratings = pd.DataFrame(list_cur).drop(columns=['_id', '__v', 'createdAt', 'updatedAt'])
ratings

Unnamed: 0,movie,user,rating
0,633dc3b4f611a788116b43d8,633e8ba2bc3a40391db45a5c,5
1,633dc3b4f611a788116b43d8,633e900bbc3a40391db45a7f,5
2,633dc52cf611a788116b43dc,633b30e4edfee7840d1006c6,4
3,633dc52cf611a788116b43dc,633e8ba2bc3a40391db45a5c,5
4,633dd3e4f611a788116b43e7,633e8ba2bc3a40391db45a5c,2
5,633dd3e4f611a788116b43e7,633b30e4edfee7840d1006c6,2
6,633dc52cf611a788116b43dc,633e900bbc3a40391db45a7f,3
7,6354ebb6c834f95698ef02ca,633e8ba2bc3a40391db45a5c,2
8,6354ec56c834f95698ef02cd,633e8ba2bc3a40391db45a5c,5


In [6]:
def get_items_rated_by_user(user_id):
    """Return the movies a user has rated together with the ratings.

    Reads the notebook-level ``ratings`` frame and selects its rows whose
    ``user`` value, compared as a string, equals ``user_id``. Columns are
    addressed by name so the result does not depend on column order.

    Parameters
    ----------
    user_id : str
        User identifier in its string form.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``(item_ids, scores)``: the ``movie`` column cast to ``string`` dtype
        and the ``rating`` column, both restricted to the user's rows and
        keeping the original row labels of ``ratings``. Both are empty when
        the user has no ratings.
    """
    # A missing user value compares as NA; treat it as "not this user".
    is_user = (ratings['user'].astype('string') == user_id).fillna(False).astype(bool)
    user_rows = ratings.loc[is_user]
    item_ids = user_rows['movie'].astype('string')
    scores = user_rows['rating']
    return (item_ids, scores)
# Example call: displays the (item_ids, scores) tuple for one user id.
get_items_rated_by_user('633e8ba2bc3a40391db45a5c')

[0 3 4 7 8]


(0    633dc3b4f611a788116b43d8
 3    633dc52cf611a788116b43dc
 4    633dd3e4f611a788116b43e7
 7    6354ebb6c834f95698ef02ca
 8    6354ec56c834f95698ef02cd
 Name: movie, dtype: string,
 0    5
 3    5
 4    2
 7    2
 8    5
 Name: rating, dtype: int64)

In [10]:
# NOTE(review): this cell is not runnable as written. Every line after the
# first is indented as if it sat inside a method, the code reads `self.*`
# attributes, and `TfidfTransformer` is not imported in any visible cell —
# presumably pasted from a class-based implementation; confirm before use.
#
# What the statements express: TF-IDF-transform `self.X_train` into a dense
# array, then for each user index n fit a Ridge regression on the rows picked
# by `ids` against that user's `scores`, storing the coefficients in column n
# of W and the intercept in b[0, n]; finally compute `tfidf.dot(W) + b` into
# `self.Yhat`.
#
# NOTE(review): `get_items_rated_by_user(self.Y, n)` passes two arguments,
# whereas the function defined in this notebook accepts only `user_id`.
transformer = TfidfTransformer(smooth_idf=True, norm ='l2')
        tfidf = transformer.fit_transform(self.X_train.tolist()).toarray()
        d = tfidf.shape[1] # data dimension
        W = np.zeros((d, self.n_users))
        b = np.zeros((1, self.n_users))
        for n in range(self.n_users):    
            ids, scores = get_items_rated_by_user(self.Y, n)
            clf = Ridge(alpha= self.lamda, fit_intercept  = True)
            Xhat = tfidf[ids, :]

            clf.fit(Xhat, scores) 
            W[:, n] = clf.coef_
            b[0, n] = clf.intercept_
        self.Yhat = tfidf.dot(W) + b

Int64Index([1, 6], dtype='int64')

In [7]:
from sklearn.linear_model import Ridge
from sklearn import linear_model
# Reduce the TF-IDF item profiles with truncated SVD; each row of
# `svd_tfidf_vector` is the feature vector of the movie at the same row
# position in `matrix_tfidf`.
svd = TruncatedSVD(n_components=10)
svd.fit(matrix_tfidf)
svd_tfidf_vector = svd.transform(matrix_tfidf)
d = svd_tfidf_vector.shape[1]

# One column of W / b per distinct user, in the order of `users`.
users = ratings['user'].astype('string').unique()
W = np.zeros((d, users.size))
b = np.zeros((1, users.size))

df = pd.DataFrame(df)
# Movie id (as a string) -> row position in `svd_tfidf_vector`.
# Assumes `df['_id']` is in the same row order as `matrix_tfidf` — TODO confirm.
row_of_movie = {str(movie_id): row for row, movie_id in enumerate(df['_id'])}

for col, n in enumerate(users):
    ids, scores = get_items_rated_by_user(n)
    print(ids)

    # Keep only ratings whose movie is present in the catalogue, paired with
    # the matching feature row, so X and y stay aligned.
    pairs = [(row_of_movie[m], s) for m, s in zip(ids, scores) if m in row_of_movie]
    if not pairs:
        # Nothing to fit for this user; their column of W / b stays zero.
        continue
    rows, targets = zip(*pairs)

    clf = Ridge(alpha = 7, fit_intercept  = True)
    Xhat = svd_tfidf_vector[list(rows), :]
    clf.fit(Xhat, np.asarray(targets))
    W[:, col] = clf.coef_
    b[0, col] = clf.intercept_
    

[0 3 4 7 8]
0    633dc3b4f611a788116b43d8
3    633dc52cf611a788116b43dc
4    633dd3e4f611a788116b43e7
7    6354ebb6c834f95698ef02ca
8    6354ec56c834f95698ef02cd
Name: movie, dtype: string
[1 6]
1    633dc3b4f611a788116b43d8
6    633dc52cf611a788116b43dc
Name: movie, dtype: string
[2 5]
2    633dc52cf611a788116b43dc
5    633dd3e4f611a788116b43e7
Name: movie, dtype: string


In [None]:
# Display the frame (the dangling `df.` attribute access was a SyntaxError).
df

Unnamed: 0,_id
0,633dc3b4f611a788116b43d8
1,633dc52cf611a788116b43dc
2,633dd3e4f611a788116b43e7
3,6354e362c834f95698ef0221
4,6354ebb6c834f95698ef02ca
5,6354ec56c834f95698ef02cd
