In [9]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import gensim.downloader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm

In [2]:
# Read only the tmdbId column of links.csv, discard rows without an id, and
# let pandas pick its nullable dtypes for what remains.
df1 = (
    pd.read_csv('links.csv', usecols=['tmdbId'])
    .dropna()
    .convert_dtypes()
)

In [3]:
# Add empty `title` and `overview` columns (pd.NA = nothing stored yet).
# A single assign() creates both; the former second call only repeated
# the `overview` assignment.
df1 = df1.assign(title=pd.NA, overview=pd.NA)

In [None]:
# Fetch title and overview for every tmdbId in df1 from the TMDB movie endpoint.
# Rows whose request fails or whose payload lacks either field keep pd.NA and
# are removed by the final dropna().

# The headers never change between requests, so build them once.
headers = {
    "accept": "application/json",
    "Authorization": "INSERT YOUR KEY"
}

# Ids that could not be fetched; reported after the loop instead of being
# swallowed silently.
failed_ids = []

# total= lets tqdm draw a real progress bar (iterrows() has no len()).
for index, row in tqdm(df1.iterrows(), total=len(df1)):
    url = "https://api.themoviedb.org/3/movie/" + str(row['tmdbId'])

    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        result = response.json()
        title = result['original_title']
        overview = result['overview']
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Network/HTTP error, undecodable body, or unexpected payload shape:
        # skip this movie but remember it.
        failed_ids.append(row['tmdbId'])
        continue

    df1.loc[index, 'title'] = title
    df1.loc[index, 'overview'] = overview

if failed_ids:
    print(f"Skipped {len(failed_ids)} of {len(df1)} movies (request failed or fields missing)")

df1 = df1.dropna()

In [None]:
stop_words = set(stopwords.words('english'))

# lemmatize() is an instance method: calling it on the class itself raises
# TypeError, so create one lemmatizer and share it across all clean() calls.
lemmatizer = WordNetLemmatizer()

# Compiled once; matches tokens made up solely of ASCII letters and digits.
_ALNUM_TOKEN = re.compile(r'^[a-zA-Z0-9]+$')


def clean(text):
    """Normalise a piece of free text for the similarity pipeline.

    Steps, in order: strip HTML markup, lowercase, tokenize with NLTK, drop
    English stopwords, lemmatize each token, and keep only tokens consisting
    entirely of ASCII letters/digits.

    Parameters
    ----------
    text : str
        Raw text, possibly containing HTML.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (may be empty).
    """
    # Remove HTML tags and convert to lowercase
    clean_text = BeautifulSoup(text, "html.parser").get_text().lower()

    # Tokenize the text and remove stopwords
    words = nltk.word_tokenize(clean_text)
    words = [word for word in words if word not in stop_words]

    # Lemmatize words
    words = [lemmatizer.lemmatize(word) for word in words]

    # Remove punctuation and non-alphanumeric tokens
    words = [word for word in words if _ALNUM_TOKEN.match(word)]

    # Join the words back into a cleaned text
    return ' '.join(words)

In [None]:
# Run clean() over every overview and store the result as a new column.
cleaned_overviews = df1['overview'].apply(clean)
df1['clean_overview'] = cleaned_overviews

In [None]:
# The raw overview column is dropped here; clean_overview stays.
df1 = df1.drop(columns=['overview'])

In [None]:
# Write the frame to movie_data.csv without the index column.
df1.to_csv(path_or_buf="movie_data.csv", index=False)

In [11]:
# Reload the saved data and drop incomplete rows (an empty clean_overview is
# read back as NaN). Resetting the index afterwards keeps row labels equal to
# row positions, which later code relies on when it uses movie_df.index values
# as row numbers of the similarity matrix.
movie_df = pd.read_csv("movie_data.csv").dropna().reset_index(drop=True)

In [None]:
# Whitespace-split each cleaned overview into its list of tokens.
corpus = [overview.split() for overview in tqdm(movie_df['clean_overview'])]

In [None]:
# Load the 'word2vec-google-news-300' model through gensim's downloader.
pretrained_model = gensim.downloader.load(name='word2vec-google-news-300')

In [14]:
# Fit a word-level TF-IDF model (unigrams and bigrams, English stop words
# removed, terms must occur in at least two documents) on the cleaned
# overviews, then keep both the document-term matrix and the feature names.
tfidf = TfidfVectorizer(
    stop_words='english',
    analyzer='word',
    min_df=2,
    ngram_range=(1, 2),
)
tfidf_vectors = tfidf.fit_transform(movie_df['clean_overview'])
tfidf_feature = tfidf.get_feature_names_out()

In [15]:
# Dense copy of the TF-IDF matrix with one column per feature name, so a
# weight can be looked up by (row number, term).
vectors_df = pd.DataFrame(data=tfidf_vectors.toarray(), columns=tfidf_feature)

In [None]:
# Build one 300-dimensional vector per document: the TF-IDF-weighted mean of
# the word2vec vectors of its tokens. Tokens missing from either the TF-IDF
# vocabulary or the embedding model are ignored; a document with no usable
# token keeps the all-zero vector.

# Set membership is O(1); `word in tfidf_feature` on the ndarray would scan
# the whole vocabulary for every token.
tfidf_vocab = set(tfidf_feature)

doc_vectors = []
# Wrapping `corpus` (not the enumerate object) lets tqdm know the total.
for index, desc in enumerate(tqdm(corpus)):
    weighted_word_vector = np.zeros(300)
    weighted_sum = 0
    for word in desc:
        if word in tfidf_vocab and word in pretrained_model:
            weight = vectors_df.loc[index, word]
            weighted_word_vector += pretrained_model[word] * weight
            weighted_sum += weight
    # Normalise exactly once, after all tokens are accumulated. Dividing
    # inside the token loop rescales the partial sum repeatedly and does not
    # yield a weighted average.
    if weighted_sum != 0:
        weighted_word_vector /= weighted_sum
    doc_vectors.append(weighted_word_vector)

In [None]:
# Pairwise cosine similarity between all document vectors; with a single
# argument scikit-learn compares the collection against itself.
cosine_similarities = cosine_similarity(doc_vectors)

In [None]:
def recommendation(title, top_n=5):
    """Return the titles of the movies most similar to ``title``.

    Similarity is read from the module-level ``cosine_similarities`` matrix,
    whose row/column ``i`` corresponds to the ``i``-th row of ``movie_df``.

    Parameters
    ----------
    title : str
        Title to look up in ``movie_df['title']``. If several rows share the
        title, the first one is used.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of up to ``top_n`` other movies, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``movie_df``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Map each title to its row *position* (not its index label) so the lookup
    # stays aligned with the similarity matrix even when movie_df's index has
    # gaps. setdefault keeps the first occurrence of a repeated title.
    positions = {}
    for pos, name in enumerate(movie_df['title']):
        positions.setdefault(name, pos)

    if title not in positions:
        raise KeyError(title)
    idx = positions[title]

    # Stable descending sort: ties keep their original row order.
    sim_scores = sorted(
        enumerate(cosine_similarities[idx]), key=lambda x: x[1], reverse=True
    )

    # Exclude the query movie explicitly; it is not guaranteed to rank first
    # (ties, or an all-zero vector whose self-similarity is 0).
    movie_indices = [i for i, _ in sim_scores if i != idx][:top_n]
    return movie_df.iloc[movie_indices]["title"]

In [None]:
# Example query: the cell displays the recommendations for this title.
recommendation(title="Mission: Impossible")