In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

from keras import layers
from tensorflow_similarity.layers import MetricEmbedding
from tensorflow_similarity.losses import MultiSimilarityLoss
from tensorflow_similarity.models import SimilarityModel
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Read the movies table from the local CSV file and display it.
movie_df = pd.read_csv(filepath_or_buffer="data/movie.csv")
movie_df

In [None]:
# Read the ratings table and discard its "timestamp" column in one chained step.
rating_df = pd.read_csv("data/rating.csv").drop(columns="timestamp")
rating_df

In [None]:
# Basic dataset cardinalities.
# dropna=False keeps the count identical to len(Series.unique()), which treats
# a missing value as one distinct entry.
num_of_movie = movie_df["title"].nunique(dropna=False)
num_of_user = rating_df["userId"].nunique(dropna=False)
# NOTE: despite its name, this is the number of distinct movieId values that
# appear in rating_df (i.e. distinct rated movies), not the number of rating rows.
num_of_rating = rating_df["movieId"].nunique(dropna=False)

# Labels describe what each number actually measures.
print(f"Num of movies: {num_of_movie}")
print(f"Num of users who gave ratings: {num_of_user}")
print(f"Num of rated movies: {num_of_rating}")

In [None]:
# Print a concise summary of movie_df (columns, dtypes, non-null counts).
movie_df.info()

In [None]:
# Print a concise summary of rating_df (columns, dtypes, non-null counts).
rating_df.info()

In [None]:
# Occurrences of each title, most frequent first.
count = movie_df["title"].value_counts()
movie_list = count.keys()

# Turn the counts into a two-column table with a fresh 0..n-1 index.
movie_count = count.rename_axis("Movie-Title").reset_index(name="Count")
movie_count

In [None]:
# Occurrences of each rating value, most frequent first.
count = rating_df["rating"].value_counts()
rating_list = count.keys()

# Turn the counts into a two-column table with a fresh 0..n-1 index.
rating_count = count.rename_axis("Ratings").reset_index(name="Count")
rating_count

In [None]:
# Bar chart: one bar per rating value, bar height = number of occurrences.
sns.barplot(
    x="Ratings",
    y="Count",
    data=rating_count,
)
plt.show()

In [None]:
# Missing values per column in movie_df.
movie_df.isna().sum()

In [None]:
# Missing values per column in rating_df.
rating_df.isna().sum()

In [None]:
# Number of fully duplicated rows in movie_df (first occurrence not counted).
movie_df.duplicated(keep="first").sum()

In [None]:
# Number of fully duplicated rows in rating_df (first occurrence not counted).
rating_df.duplicated(keep="first").sum()

In [None]:
# Keep only movies whose genres field is not the "(no genres listed)" placeholder.
movie_df = movie_df.loc[movie_df["genres"].ne("(no genres listed)")]
movie_df

In [None]:
# Keep only ratings whose movieId is still present in movie_df.
rating_df = rating_df.loc[rating_df["movieId"].isin(movie_df["movieId"])]
rating_df

In [None]:
# Attach each rating to its movie row, then drop the title column.
# The join key is spelled out: without `on=`, merge joins on every column name
# the two frames happen to share, which silently changes if either schema changes.
new_df = (
    movie_df
    .merge(rating_df, on="movieId", how="inner")
    .drop(columns="title")
)
new_df

In [None]:
# Number of fully duplicated rows in the merged frame (first occurrence not counted).
new_df.duplicated(keep="first").sum()

In [None]:
# Missing values per column in the merged frame.
new_df.isna().sum()

In [None]:
# Fit a TF-IDF vectorizer on the genre strings to learn the token vocabulary.
tfid = TfidfVectorizer()
tfid.fit(new_df["genres"])

# Show the learned vocabulary. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() (available since 1.0) replaces it.
tfid.get_feature_names_out()

In [None]:
# Encode every genre string as a TF-IDF row vector.
# `tfid` was already fitted on this same column in the previous cell, so
# transform() yields the same matrix as fit_transform() without repeating the fit.
tfidf_matrix = tfid.transform(new_df["genres"])

# Width of each TF-IDF vector, i.e. the number of vocabulary terms.
matrix_dim = tfidf_matrix.shape[1]
matrix_dim

In [None]:
# Preview only the first few rows as a dense array. Densifying the whole sparse
# matrix allocates rows x columns values just for display, and todense()
# returns the discouraged np.matrix type; toarray() returns a plain ndarray.
tfidf_matrix[:5].toarray()