In [3]:
path  = "C:/Users/yis82/OneDrive/Desktop/Lunch Lab/data/data/demo-data"

In [5]:
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

# Import data from the clean file
df = pd.read_csv(path+"/metadata_clean.csv")
orig_df = pd.read_csv(path+"/movies_metadata.csv",low_memory=False)
df["overview"], df["id"] = orig_df["overview"], orig_df["id"]
df=df[:4000]

In [6]:
from openai import OpenAI
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os

load_dotenv()
# load_dotenv() is a function that loads variables from a .env file into environment variables in a Python script.
# We store OPENAI_API_KEY=  xxx in .env file
openai_api_key = os.getenv("OpenAI_API_KEY")

client = OpenAI(api_key = openai_api_key)

MODEL_NAME = "text-embedding-ada-002"


def get_embedding(text, model=MODEL_NAME):
    if not isinstance(text, str):
        text = str(text)

    text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding


def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def search_docs(df, user_query, threshold=0.8):
    embedding = get_embedding(user_query, model=MODEL_NAME)
    df["similarities"] = df.embedding.apply(lambda x: cosine_similarity(x, embedding))

    # Filter results based on the threshold
    filtered_results = df[df["similarities"] > threshold]

    return filtered_results


df["embedding"] = df["overview"].apply(lambda x: get_embedding(x, model=MODEL_NAME))

title = "Toy Story"
description = df.loc[df["title"] == title, "overview"].iloc[0]
result = search_docs(df, description, threshold=0.8)

# remove the search item
print(result[result["title"] != title]["title"])

1                                               Jumanji
59                           The Indian in the Cupboard
124     The Neverending Story III: Escape from Fantasia
143                         The Amazing Panda Adventure
359                                       The Lion King
387                  The Secret Adventures of Tom Thumb
455                               Getting Even with Dad
485                                              Malice
545                                           Threesome
589                                           Pinocchio
804                         The Adventures of Pinocchio
858                                               Bogus
994                Winnie the Pooh and the Blustery Day
995                                The Three Caballeros
1071                              Rebel Without a Cause
1154                            The Empire Strikes Back
1155                                 The Princess Bride
1208                      The Day the Earth Stoo

In [7]:
#Define a TF-IDF Vectorizer Object. Remove all english stopwords
tfidf = TfidfVectorizer(stop_words='english')
#Replace NaN with an empty string
df['overview'] = df['overview'].fillna('')
#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
tfidf_matrix = tfidf.fit_transform(df['overview'])
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices):
# Obtain the index of the movie that matches the title
  idx = indices[title]
  # Get the pairwsie similarity scores of all movies with that movie
  # And convert it into a list of tuples as described above
  sim_scores = list(enumerate(cosine_sim[idx]))
  # Sort the movies based on the cosine similarity scores
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
  # Get the scores of the 10 most similar movies. Ignore the first movie.
  sim_scores = sim_scores[1:11]
  # Get the movie indices
  movie_indices = [i[0] for i in sim_scores]
  # Return the top 10 most similar movies
  return df['title'].iloc[movie_indices]

#Get recommendations
content_recommender('The Shawshank Redemption')

1231       Cool Hand Luke
499             No Escape
1564    Alive and Kicking
3527      American Gigolo
2838         Penitentiary
2689    A Christmas Story
1686    A Further Gesture
2984     Fatal Attraction
454           The Getaway
2765      Double Jeopardy
Name: title, dtype: object

In [12]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import cross_validate
import os

# path to dataset file
file_path = os.path.expanduser(path + "/ml-1m/ratings.dat")

# As we're loading a custom dataset, we need to define a reader. In the
# movielens-100k dataset, each line has the following format:
# 'user item rating timestamp', separated by '::' characters.
columns = ["user_id", "item_id", "rating", "timestamp"]
reader = Reader(line_format="user item rating timestamp", sep="::")

data = Dataset.load_from_file(file_path, reader=reader)
trainset = data.build_full_trainset()

sim_options = {"name": "cosine", "user_based": True}
knn_model = KNNBasic(sim_options=sim_options)

knn_model.fit(trainset)

user_id = str(196)  # Replace with the desired user ID

# Get items that the user has not rated
items_to_predict = [
    (user_id, iid, 4.0)
    for iid in trainset.all_items()
    if iid not in trainset.ur[trainset.to_inner_uid(user_id)]
]

# Get top N recommendations for the user
top_n = knn_model.test(items_to_predict)[0:11]

# Display the top N recommendations
for uid, iid, true_r, est, _ in top_n:
    print(f"User {uid} -> Item {iid} (Predicted rating: {est:.2f})")

Computing the cosine similarity matrix...
Done computing similarity matrix.
User 196 -> Item 0 (Predicted rating: 3.58)
User 196 -> Item 1 (Predicted rating: 3.58)
User 196 -> Item 2 (Predicted rating: 3.58)
User 196 -> Item 3 (Predicted rating: 3.58)
User 196 -> Item 4 (Predicted rating: 3.58)
User 196 -> Item 5 (Predicted rating: 3.58)
User 196 -> Item 6 (Predicted rating: 3.58)
User 196 -> Item 7 (Predicted rating: 3.58)
User 196 -> Item 8 (Predicted rating: 3.58)
User 196 -> Item 9 (Predicted rating: 3.58)
User 196 -> Item 10 (Predicted rating: 3.58)


In [14]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse

# path to dataset file
file_path = os.path.expanduser(path+ "/ml-1m/ratings.dat")

# As we're loading a custom dataset, we need to define a reader. In the
# movielens-100k dataset, each line has the following format:
# 'user item rating timestamp', separated by '::' characters.
columns = ["user_id", "item_id", "rating", "timestamp"]
reader = Reader(line_format="user item rating timestamp", sep="::")

data = Dataset.load_from_file(file_path, reader=reader)
full_data = data.build_full_trainset()

train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

# Use SVD for item-based collaborative filtering
svd_model = SVD()  # Set user_based to False for item-based collaborative filtering

# Train the model on the training set
svd_model.fit(train_set)

# Make predictions on the test set
predictions = svd_model.test(test_set)

# Evaluate the model using RMSE
accuracy = rmse(predictions)
print(f"RMSE on the test set: {accuracy:.4f}")

from collections import defaultdict


def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n


top_n = get_top_n(predictions, n=10)
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

RMSE: 0.8746
RMSE on the test set: 0.8746
1841 ['318', '47', '1247', '1584', '2501', '508', '356', '1784', '34', '296']
3715 ['3793', '2716', '780', '1274', '1073', '3114', '1', '588', '610', '2987']
2002 ['3307', '903', '2203', '1278', '930', '1267', '1269', '951', '955', '3801']
3332 ['2692', '223', '2890', '1198', '2706', '3160', '2693', '1961', '1127', '3114']
3576 ['1278', '553', '2000', '3801', '356', '920', '943', '1610', '593', '1247']
2092 ['1193', '922', '1199', '1203', '1234', '1296', '670', '908', '668', '1224']
5283 ['2997', '1148', '1210', '1921', '29', '3033', '3039', '3481', '3397', '1036']
4610 ['745', '1136', '2918', '3000', '1278', '1223', '3039', '1580', '2344', '3745']
398 ['608', '111', '50', '2064', '3730', '293', '1214', '3317', '2391', '924']
4533 ['1089', '2599', '508', '2997', '3178', '3408', '3534', '1097', '357']
76 ['1148', '50', '2959', '2762', '2289', '2336', '1', '2542', '2692', '3897']
921 ['2571', '593', '1240', '1302', '1291', '1231', '678', '2858', 

In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Load the MovieLens dataset (download it from https://grouplens.org/datasets/movielens/)
file_path =path+ "/ml-1m/ratings.dat"
columns = ["user_id", "item_id", "rating", "timestamp"]
df = pd.read_csv(file_path, sep="\t", names=columns)

# Create user-item interaction matrix
user_item_matrix = (
    df.pivot(index="user_id", columns="item_id", values="rating").fillna(0).values
)

# Split the data into training and testing sets
train_data, test_data = train_test_split(
    user_item_matrix, test_size=0.2, random_state=42
)

# Build the autoencoder model
num_users, num_items = user_item_matrix.shape
latent_dim = 50

input_layer = Input(shape=(num_items,))
encoded = Dense(latent_dim, activation="relu")(input_layer)
decoded = Dense(num_items, activation="sigmoid")(encoded)

autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer="adam", loss="mean_squared_error")

# Train the autoencoder
autoencoder.fit(
    train_data,
    train_data,
    epochs=10,
    batch_size=64,
    shuffle=True,
    validation_data=(test_data, test_data),
)

# Extract user and item representations from the encoder part of the autoencoder
encoder = Model(inputs=input_layer, outputs=encoded)
user_embeddings = encoder.predict(user_item_matrix)

# Example: Recommend items for a specific user
user_id = 1  # Replace with the desired user ID
user_representation = user_embeddings[user_id - 1]

# Calculate the predicted ratings for all items
predicted_ratings = np.dot(user_embeddings, user_representation)

# Display top N recommendations
top_n = np.argsort(predicted_ratings)[::-1][:10]
print(f"Top recommendations for User {user_id}: {top_n + 1}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Top recommendations for User 1: [1000209  333412  333410  333409  333408  333407  333406  333405  333404
  333403]


In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Load the MovieLens dataset (download it from https://grouplens.org/datasets/movielens/)
file_path =path+ "/ml-1m/ratings.dat"
columns = ["user_id", "item_id", "rating", "timestamp"]
df = pd.read_csv(file_path, sep="\t", names=columns)

# Create user-item interaction matrix
user_item_matrix = (
    df.pivot(index="user_id", columns="item_id", values="rating").fillna(0).values
)

# Binarize the ratings (0 if not rated, 1 if rated)
user_item_matrix_binary = (user_item_matrix > 0).astype(float)

# Split the data into training and testing sets
train_data, test_data = train_test_split(
    user_item_matrix_binary, test_size=0.2, random_state=42
)

# RBM parameters
num_visible = num_items = user_item_matrix_binary.shape[1]
num_hidden = 50
batch_size = 64
epochs = 10

# Build the RBM model
visible_layer = Input(shape=(num_visible,))
hidden_layer = Dense(num_hidden, activation="sigmoid")(visible_layer)
visible_layer_reconstruction = Dense(num_visible, activation="sigmoid")(hidden_layer)

rbm = Model(inputs=visible_layer, outputs=visible_layer_reconstruction)
rbm.compile(optimizer=Adam(learning_rate=0.001), loss="mean_squared_error")

# Train the RBM
rbm.fit(
    train_data,
    train_data,
    epochs=epochs,
    batch_size=batch_size,
    shuffle=True,
    validation_data=(test_data, test_data),
)

# Extract user and item representations from the hidden layer
user_embeddings = rbm.layers[1].get_weights()[0].T
item_embeddings = rbm.layers[1].get_weights()[0]

# Example: Recommend items for a specific user
user_id = 1  # Replace with the desired user ID
user_representation = user_embeddings[user_id - 1]

# Calculate the predicted ratings for all items
predicted_ratings = np.dot(user_embeddings, user_representation)

# Display top N recommendations
top_n = np.argsort(predicted_ratings)[::-1][:10]
print(f"Top recommendations for User {user_id}: {top_n + 1}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Top recommendations for User 1: [ 1 19 44 11 22  7 42 32 26 50]
