In [None]:
#Tensorflow library. Used to implement machine learning models
import tensorflow.compat.v1 as tf
# Disable the default activate eager execution in TF v1.0
tf.disable_eager_execution()
#Numpy contains helpful functions for efficient mathematical calculations
import numpy as np
#Dataframe manipulation library
import pandas as pd
#Graph plotting library
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# MovieLens movie catalogue: '::'-delimited, latin-1 encoded, no header row
movies_df = pd.read_csv(
    './ml-1m/movies.dat',
    sep='::',
    header=None,
    engine='python',
    encoding='latin-1',
)
# Name the columns explicitly because the file carries no header
movies_df.columns = ['movieId', 'title', 'genres']
movies_df.head()

In [None]:
# MovieLens ratings: '::'-delimited, latin-1 encoded, no header row
ratings_df = pd.read_csv(
    './ml-1m/ratings.dat',
    sep='::',
    header=None,
    engine='python',
    encoding='latin-1',
)
# Name the columns explicitly because the file carries no header
ratings_df.columns = ['userId', 'movieId', 'rating', 'timestamp']
ratings_df.head()

In [None]:
# Wide user x movie matrix of ratings; unrated pairs are NaN
user_rating_df = ratings_df.pivot(
    index='userId',
    columns='movieId',
    values='rating',
)
user_rating_df.head()

In [None]:
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from gensim.models import KeyedVectors
import re
import gensim

In [None]:
# Join the per-movie description and meta-data files with the MovieLens catalogue
description_df = pd.read_csv('description.csv')
meta_df = pd.read_csv('meta-data.csv')
description_df = pd.merge(description_df, meta_df, on="movieId", how="inner")
description_df = pd.merge(description_df, movies_df, on='movieId', how='inner')
# 'content' = title without its "(digits)" suffix + description + meta-data.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the year in the title.
description_df['content'] = (
    description_df['title'].str.replace(r'\(\d+\)', '', regex=True)
    + ' ' + description_df['description']
    + ' ' + description_df['meta-data']
)
description_df = description_df.drop(['description', 'meta-data'], axis=1)
# Rows with any missing field (including a NaN 'content') are discarded
description_df = description_df.dropna()
description_df.head()

In [None]:
def _removeNonAscii(s):
    return "".join(i.encode('ascii', 'ignore').decode() for i in s )

def make_lower_case(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_stop_words(text):
    """Return ``text`` with NLTK English stop words removed.

    The text is split on whitespace, tokens found in the stop-word list are
    dropped, and the remaining tokens are re-joined with single spaces.
    """
    english_stops = set(stopwords.words("english"))
    kept_tokens = [token for token in text.split() if token not in english_stops]
    return " ".join(kept_tokens)

def remove_html(text):
    """Return ``text`` with every ``<...>`` span (shortest match) deleted."""
    return re.sub('<.*?>', r'', text)

def remove_punctuation(text):
    """Return the ``\\w+`` tokens of ``text`` joined by single spaces.

    Anything that is not a word character acts as a separator and is dropped.
    """
    word_tokens = RegexpTokenizer(r'\w+').tokenize(text)
    return " ".join(word_tokens)

In [None]:
# Text-cleaning pipeline for the combined 'content' column; result lands in 'cleaned'.
description_df['cleaned'] = description_df['content'].apply(_removeNonAscii)

# HTML tags must be stripped while '<' and '>' are still present: remove_punctuation
# deletes those delimiters, after which the tag pattern can no longer match and the
# tag names would stay in the text as ordinary words.
description_df['cleaned'] = description_df.cleaned.apply(func=remove_html)
description_df['cleaned'] = description_df.cleaned.apply(func = make_lower_case)
description_df['cleaned'] = description_df.cleaned.apply(func = remove_stop_words)
description_df['cleaned'] = description_df.cleaned.apply(func=remove_punctuation)
# The raw text is no longer needed once 'cleaned' exists
description_df = description_df.drop(['content'], axis=1)


In [None]:
# One whitespace-tokenised word list per cleaned movie description
corpus = [cleaned_text.split() for cleaned_text in description_df['cleaned']]

In [None]:
from gensim.models import KeyedVectors

In [None]:
# Load binary Word2Vec model
# pretrained_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)


In [None]:
from gensim.models import Word2Vec

# pretrained_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)


# Untrained skip-gram (sg=1) Word2Vec model; words seen fewer than 3 times are ignored
model = Word2Vec(
    sg=1,
    vector_size=750,
    min_count=3,
    workers=4,
)

# First pass over the corpus: collect the vocabulary
model.build_vocab(corpus)

# Initialising from pre-trained vectors is currently disabled:
# model.wv.vectors = pretrained_model.vectors

# Second stage: fit the embeddings on the same corpus for 50 epochs
model.train(
    corpus,
    total_examples=model.corpus_count,
    epochs=50,
)


In [None]:
def dcg_at_k(r, k):
    """Discounted cumulative gain of the first ``k`` relevance scores.

    Uses the exponential-gain form: ``sum((2**rel_i - 1) / log2(i + 1))`` for
    1-based rank ``i``.

    Parameters
    ----------
    r : sequence of numbers
        Relevance scores in ranked order (first element = top-ranked item).
    k : int
        Number of leading results to take into account.

    Returns
    -------
    float
        The DCG value, or ``0.0`` when no scores remain after truncation.
    """
    # np.asarray(..., dtype=float) replaces np.asfarray, which NumPy 2.0 removed
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        # Ranks 1..n are discounted by log2(2)..log2(n + 1)
        return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.0

def ndcg_at_k(r, k):
    """Normalised DCG of the first ``k`` relevance scores in ``r``.

    The normaliser is the DCG@k of ``r`` sorted in descending order. Returns
    ``0.0`` when that ideal DCG is zero (or ``r`` is empty).
    """
    ideal_order = sorted(r, reverse=True)
    ideal_dcg = dcg_at_k(ideal_order, k)
    return dcg_at_k(r, k) / ideal_dcg if ideal_dcg else 0.0

In [None]:
def precision_recall_at_k(recommended_items, relevant_items, k):
    """Return precision@k for a ranked recommendation list.

    ``recommended_items`` is sliced positionally with ``.iloc[:k]``; the count
    of distinct items in that slice that also occur in ``relevant_items`` is
    divided by ``k``. Despite the name, only precision is returned.
    """
    top_k = recommended_items.iloc[:k]
    hits = set(top_k) & set(relevant_items)
    return len(hits) / float(k)

In [None]:
# Mean rating per movie as a Series named 'avg_rating', indexed by movieId
avg_ratings = (
    ratings_df
    .groupby('movieId')['rating']
    .mean()
    .rename('avg_rating')
)

In [None]:
# Attach each movie's mean rating. Any 'avg_rating' column left over from an earlier
# execution of this cell is dropped first; otherwise a re-run would produce
# 'avg_rating_x' / 'avg_rating_y' and code that reads 'avg_rating' would fail.
movies_df = pd.merge(
    movies_df.drop(columns='avg_rating', errors='ignore'),
    avg_ratings,
    on='movieId',
    how='inner',
)

In [None]:
# Movies whose mean rating is at least 3 are treated as relevant
relevant_item = movies_df.loc[movies_df['avg_rating'] >= 3]

In [None]:
# MovieLens user table: '::'-delimited, latin-1 encoded, no header row
users = pd.read_csv(
    './ml-1m/users.dat',
    sep='::',
    header=None,
    engine='python',
    encoding='latin-1',
)
users.columns = ['userId', 'gender', 'age', 'occupation', 'zipcode']
# The ten users with the fewest ratings, in ascending order of rating count
count_by_user = (
    ratings_df
    .groupby('userId')
    .size()
    .sort_values(ascending=True)
    .head(10)
)
count_by_user


In [None]:
# All user ids from the users table, as a NumPy array
user_ids = users.loc[:, 'userId'].values

# Alternative (disabled): restrict to the least-active users
# user_ids = count_by_user.index.values
user_ids

In [None]:
def recommendation(mock_user_id, trXTest, k=20, pool_size=1000):
    """Score all movies for one user and evaluate four ranking strategies.

    The user's row of ``trXTest`` is passed through the RBM (weights and biases
    taken from the notebook globals ``prv_w``, ``prv_hb``, ``prv_vb``) to obtain
    a ``recommendationScore`` per movie. A Word2Vec ``Similarity Score`` is then
    computed between each movie's cleaned description and the concatenated
    cleaned descriptions of up to 20 of the user's highest-rated movies
    (rating >= 3). Four rankings are evaluated with precision@k and NDCG@k
    (NDCG uses the movies' ``avg_rating`` as the gain):

    * RBM->W2V: top ``pool_size`` by RBM score, re-ordered by similarity
    * W2V->RBM: top ``pool_size`` by similarity, re-ordered by RBM score
    * RBM score only
    * similarity only

    Relies on these names from the enclosing notebook: ``tf``, ``sess``, ``v0``,
    ``W``, ``hb``, ``vb``, ``prv_w``, ``prv_hb``, ``prv_vb``, ``movies_df``,
    ``user_rating_df``, ``ratings_df``, ``description_df``, ``relevant_item``,
    ``model``, ``precision_recall_at_k`` and ``ndcg_at_k``.

    Parameters
    ----------
    mock_user_id : int
        User id to evaluate; row ``mock_user_id - 1`` of ``trXTest`` is used.
    trXTest : numpy.ndarray
        Normalised user x movie rating matrix fed to the RBM.
    k : int, default 20
        Cut-off for precision@k and NDCG@k.
    pool_size : int, default 1000
        Size of the first-stage candidate pool for the two hybrid rankings.

    Returns
    -------
    tuple
        ``(precisionRBMW2V, ndcgRBMW2V, precisionW2VRBM, ndcgW2VRBM,
        precisionRBM, ndcgRBM, precisionW2V, ndcgW2V)``. Every entry is ``0.0``
        when the user has no profile text to compare against.
    """
    #Selecting the input user
    # NOTE(review): assumes user ids are 1-based and match the row order of trXTest -- confirm
    inputUser = trXTest[mock_user_id-1].reshape(1, -1)

    #Feeding in the user and reconstructing the input
    # NOTE(review): these two ops are added to the default graph on every call, so the
    # graph grows with the number of users evaluated.
    hh0 = tf.nn.sigmoid(tf.matmul(v0, W) + hb)
    vv1 = tf.nn.sigmoid(tf.matmul(hh0, tf.transpose(W)) + vb)
    feed = sess.run(hh0, feed_dict={ v0: inputUser, W: prv_w, hb: prv_hb})
    rec = sess.run(vv1, feed_dict={ hh0: feed, W: prv_w, vb: prv_vb})

    # Attach the reconstructed scores to the movies that are columns of the rating matrix
    scored_movies_df_mock = movies_df[movies_df['movieId'].isin(user_rating_df.columns)]
    scored_movies_df_mock = scored_movies_df_mock.assign(recommendationScore = rec[0])
    movies_df_mock = ratings_df[ratings_df['userId'] == mock_user_id]

    #Merging movies_df with ratings_df by movieId
    merged_df_mock = scored_movies_df_mock.merge(movies_df_mock, on='movieId', how='outer')

    # Rows without any NaN are movies that are both scored and rated by this user
    merged_df_mock_has_watched = merged_df_mock.dropna()

    # User profile: up to 20 of the user's highest-rated movies with rating >= 3
    user_profile = merged_df_mock_has_watched[merged_df_mock_has_watched['rating'] >= 3]
    user_profile = user_profile.sort_values(by='rating', ascending=False).head(20)
    user_profile = user_profile.drop(['title','recommendationScore', 'userId', 'rating','timestamp','avg_rating'],axis=1)
    user_profile.columns = ['movieId','genres']
    user_profile_desc = pd.merge(user_profile, description_df, on='movieId')

    user_profile_genres = user_profile['genres'].unique()

    # All tokens of the profile movies' cleaned descriptions, in one list
    user_pref_tokens = " ".join(user_profile_desc['cleaned']).split()

    # Defaults returned when the user has no profile text; without them the return
    # statement would reference names that were never assigned.
    precisionRBMW2V = ndcgRBMW2V = 0.0
    precisionW2VRBM = ndcgW2VRBM = 0.0
    precisionRBM = ndcgRBM = 0.0
    precisionW2V = ndcgW2V = 0.0

    if len(user_pref_tokens) > 0:
        # Re-states the expected merge schema; fails if the column count differs
        merged_df_mock.columns = ['movieId','title','genres','avg_rating','recommendationScore','userId','rating','timestamp']
        merged_df_mock = merged_df_mock.drop(['genres','rating','userId','timestamp'], axis=1)
        merged_df_mock_desc = pd.merge(merged_df_mock,description_df,on='movieId',how='inner')

        # Similarity between the user's profile tokens and each movie's description tokens
        merged_df_mock_desc['Similarity Score'] = merged_df_mock_desc.cleaned.apply(
            lambda x: model.wv.n_similarity(user_pref_tokens, x.split()))

        # Relevant set, part 1: globally relevant movies that this user has rated
        relevance = pd.merge(relevant_item, movies_df_mock, on="movieId", how="inner")
        relevance = relevance.drop(['title','genres', 'userId','rating','timestamp'],axis=1)

        # Relevant set, part 2: movies whose genre string equals one from the user profile
        filtered_df = movies_df[movies_df['genres'].isin(user_profile_genres)]
        filtered_genres_df = filtered_df.drop(['title', 'genres'],axis=1)

        relevance = pd.concat([relevance,filtered_genres_df ])

        # Single-signal rankings
        recommendationRBM = merged_df_mock_desc.sort_values(by='recommendationScore',ascending=False)
        recommendationW2V = merged_df_mock_desc.sort_values(by='Similarity Score',ascending=False)

        # Hybrid rankings: shortlist by one signal, then order the shortlist by the other
        recommendation_finalRBMW2V = recommendationRBM.head(pool_size)
        recommendation_finalRBMW2V = recommendation_finalRBMW2V.sort_values(by='Similarity Score',ascending=False)

        recommendation_finalW2VRbm = recommendationW2V.head(pool_size)
        recommendation_finalW2VRbm = recommendation_finalW2VRbm.sort_values(by='recommendationScore',ascending=False)

        precisionRBMW2V = precision_recall_at_k(recommendation_finalRBMW2V['movieId'],relevance['movieId'], k)
        precisionW2VRBM = precision_recall_at_k(recommendation_finalW2VRbm['movieId'],relevance['movieId'], k)
        precisionRBM = precision_recall_at_k(recommendationRBM['movieId'],relevance['movieId'], k)
        precisionW2V = precision_recall_at_k(recommendationW2V['movieId'],relevance['movieId'], k)

        # NDCG uses each movie's average rating, taken in ranked order, as the gain
        ndcgRBMW2V = ndcg_at_k(recommendation_finalRBMW2V['avg_rating'], k)
        ndcgW2VRBM = ndcg_at_k(recommendation_finalW2VRbm['avg_rating'], k)
        ndcgRBM = ndcg_at_k(recommendationRBM['avg_rating'], k)
        ndcgW2V = ndcg_at_k(recommendationW2V['avg_rating'], k)

    return precisionRBMW2V,ndcgRBMW2V, precisionW2VRBM, ndcgW2VRBM, precisionRBM,ndcgRBM,precisionW2V,ndcgW2V

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
from sklearn.model_selection import KFold, train_test_split

# 5-fold shuffled splitter with a fixed seed
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Single 80/20 hold-out split of the ratings, also seeded with 42
train_set, test_set = train_test_split(
    ratings_df,
    test_size=0.2,
    random_state=42,
)

In [None]:
import os
from getpass import getpass

import mysql.connector

# Connection settings are read from the environment so that no credential is stored
# in the notebook. Host, user and database fall back to the local defaults; the
# password is taken from MYSQL_PASSWORD or, if that is unset, prompted for.
conn = mysql.connector.connect(
    host=os.environ.get('MYSQL_HOST', 'localhost'),
    user=os.environ.get('MYSQL_USER', 'root'),
    password=os.environ.get('MYSQL_PASSWORD') or getpass('MySQL password: '),
    database=os.environ.get('MYSQL_DATABASE', 'tsp'),
)

cursor = conn.cursor()

In [None]:
# Keep only the leading slice of user ids for evaluation
MAX_EVAL_USERS = 3200
user_ids = user_ids[:MAX_EVAL_USERS]
user_ids

In [None]:
# Per-user metric accumulators: p* = precision, n* = NDCG;
# RW / WR = the two hybrid rankings, W = Word2Vec only, R = RBM only.
pRW, nRW, pWR, nWR, pW, nW, pR, nR = ([] for _ in range(8))
# Rebuild the user x movie matrix from the training split only
user_rating_df = train_set.pivot(
    index='userId',
    columns='movieId',
    values='rating',
)

# Missing ratings become 0, then everything is divided by 5.0
norm_user_rating_df = (user_rating_df.fillna(0) / 5.0)
# Plain NumPy matrix used as RBM training input
trX = norm_user_rating_df.values

# --- RBM graph definition (TF1 graph mode) ---
# The parameters W, vb and hb are placeholders, not tf variables: their current
# values are supplied through feed_dict on every run.
hiddenUnits = 400
# One visible unit per movie column of the training rating matrix
visibleUnits =  len(user_rating_df.columns)
vb = tf.placeholder("float", [visibleUnits]) #Number of unique movies
hb = tf.placeholder("float", [hiddenUnits]) #Number of features we're going to learn
W = tf.placeholder("float", [visibleUnits, hiddenUnits])

#Phase 1: Input Processing
# v0: a batch of visible vectors, one row per user
v0 = tf.placeholder("float", [None, visibleUnits])
# _h0: hidden activation probabilities given v0
_h0 = tf.nn.sigmoid(tf.matmul(v0, W) + hb)
# h0: binary sample -- 1.0 where _h0 exceeds a uniform random draw, otherwise 0.0
h0 = tf.nn.relu(tf.sign(_h0 - tf.random_uniform(tf.shape(_h0))))
#Phase 2: Reconstruction
# _v1: visible activation probabilities given the sampled hidden state
_v1 = tf.nn.sigmoid(tf.matmul(h0, tf.transpose(W)) + vb)  # reconstruction probabilities
# v1: binary sample of the reconstruction, drawn the same way as h0
v1 = tf.nn.relu(tf.sign(_v1 - tf.random_uniform(tf.shape(_v1))))
# h1: hidden probabilities given the sampled reconstruction (not sampled further)
h1 = tf.nn.sigmoid(tf.matmul(v1, W) + hb)  # hidden probabilities for the negative phase


#Learning rate
alpha = 1.0
#Create the gradients
# Positive phase uses (v0, h0); negative phase uses (v1, h1)
w_pos_grad = tf.matmul(tf.transpose(v0), h0)
w_neg_grad = tf.matmul(tf.transpose(v1), h1)
#Calculate the Contrastive Divergence to maximize
# Divided by the number of rows in the batch
CD = (w_pos_grad - w_neg_grad) / tf.to_float(tf.shape(v0)[0])
#Create methods to update the weights and biases
# Each update_* tensor evaluates to the new parameter value; nothing is assigned in-graph
update_w = W + alpha * CD
update_vb = vb + alpha * tf.reduce_mean(v0 - v1, 0)
update_hb = hb + alpha * tf.reduce_mean(h0 - h1, 0)

# Reconstruction error: mean of the squared difference between the input and the
# sampled reconstruction (a mean, despite the "sum" in the name)
err = v0 - v1
err_sum = tf.reduce_mean(err * err)

# Zero-initialised float32 parameter buffers. "cur_*" receives the result of the
# latest update; "prv_*" holds the values fed into the next update.
# Weights: visibleUnits x hiddenUnits
cur_w, prv_w = (np.zeros([visibleUnits, hiddenUnits], np.float32) for _ in range(2))
# Visible unit biases
cur_vb, prv_vb = (np.zeros([visibleUnits], np.float32) for _ in range(2))
# Hidden unit biases
cur_hb, prv_hb = (np.zeros([hiddenUnits], np.float32) for _ in range(2))

# Open the session used for training and scoring, then run the global initializer
sess = tf.Session()
sess.run(tf.global_variables_initializer())


# --- RBM training: mini-batch contrastive divergence ---
epochs = 20
batchsize = 100
# Reconstruction error on the full training matrix after each epoch
errors = []
for i in range(epochs):
    # Step through every mini-batch, including a final partial one
    for start in range(0, len(trX), batchsize):
        batch = trX[start:start + batchsize]
        # Evaluate the three updates in a single run so that they are all computed
        # from the same random samples of h0 and v1
        cur_w, cur_vb, cur_hb = sess.run(
            [update_w, update_vb, update_hb],
            feed_dict={v0: batch, W: prv_w, vb: prv_vb, hb: prv_hb})
        # The freshly computed parameters become the inputs of the next step
        prv_w = cur_w
        prv_vb = cur_vb
        prv_hb = cur_hb
    errors.append(sess.run(err_sum, feed_dict={v0: trX, W: cur_w, vb: cur_vb, hb: cur_hb}))
    print (errors[-1])


# Build the evaluation inputs (the ratings were already loaded and split in earlier cells)



# User x movie matrix of the held-out ratings.
# NOTE(review): user_rating_test_df is not read again in the visible code.
user_rating_test_df = test_set.pivot(index='userId', columns='movieId', values='rating')

# NOTE(review): despite the "test" names, both lines below are derived from the
# TRAINING matrices (user_rating_df / norm_user_rating_df), so trXTest holds the
# same values as trX. Confirm this is intended; the test pivot generally has a
# different set of movie columns than the RBM's visible layer.
norm_user_rating_test_df = user_rating_df.fillna(0) / 5.0
trXTest = norm_user_rating_df.values
# NOTE(review): `i` is not used by the evaluation loop in the visible code
i = 1
# Parameterised INSERT for one row of per-user metrics (values are bound by the driver)
sql = "INSERT INTO metrics_4 (userid, precisionRW,ndcgRW,precisionWR,ndcgWR,precisionR,ndcgR,precisionW,ndcgW) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
# Accumulators in the same order as the tuple returned by recommendation()
metric_buckets = (pRW, nRW, pWR, nWR, pR, nR, pW, nW)

for user in user_ids:
    (precisionRBMW2V, ndcgRBMW2V,
     precisionW2VRBM, ndcgW2VRBM,
     precisionRBM, ndcgRBM,
     precisionW2V, ndcgW2V) = recommendation(user, trXTest)
    user_metrics = (precisionRBMW2V, ndcgRBMW2V,
                    precisionW2VRBM, ndcgW2VRBM,
                    precisionRBM, ndcgRBM,
                    precisionW2V, ndcgW2V)

    # Record this user's metrics in the running lists
    for bucket, metric in zip(metric_buckets, user_metrics):
        bucket.append(metric)
    print(user)

    # Persist the row immediately; one commit per user
    values = (int(user),) + tuple(float(metric) for metric in user_metrics)
    cursor.execute(sql, values)
    conn.commit()

    # Running means over all users processed so far
    for label, precisions, ndcgs in (("RW ", pRW, nRW), ("WR ", pWR, nWR), ("W ", pW, nW), ("R ", pR, nR)):
        print(label, np.mean(precisions), np.mean(ndcgs))
    print("----")

