In [100]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import string
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

def tokenize_input(user_input):
    user_input = RegexpTokenizer(r'[a-zA-z]+').tokenize(user_input)
    user_input = [t.lower() for t in user_input]
    user_input = [t for t in user_input if not t in list(string.punctuation)]
    user_input = [t for t in user_input if not t in stopwords.words("english")]
    user_input = [nltk.stem.WordNetLemmatizer().lemmatize(word) for word in user_input]
    return user_input

movies_df = pd.read_csv('../data/test.csv')

# Create a bag-of-words representation for each movie description
vectorizer = CountVectorizer()
movie_bow = vectorizer.fit_transform(movies_df['Description Tokenized'])

# Get user input description
user_input = "In this thrilling fantasy film, a young wizard competes in a dangerous tournament between rival schools of magic. Along with his friends, he navigates challenging tasks and uncovers a dark conspiracy that threatens the entire wizarding world. With impressive special effects and a talented cast, this captivating adventure is full of action, humor, and heart."

tokenized_user_input = tokenize_input(user_input)

# Create a bag-of-words representation for the user input description
user_input_bow = vectorizer.transform(tokenized_user_input)

# Calculate similarity between user input description and each movie description
similarity_scores = cosine_similarity(user_input_bow, movie_bow)


# Add a new column to movies_df with the similarity scores
movies_df = movies_df.assign(similarity=similarity_scores[0])

# Sort movies based on similarity score
ranked_movies = movies_df.sort_values('similarity', ascending=False)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/xaviersantos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/xaviersantos/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/xaviersantos/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xaviersantos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [101]:
ranked_movies

Unnamed: 0,Name,Date Published,Description,Rating,Rating Count,Content Rating,Action,Adult,Adventure,Animation,...,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,Description Tokenized,similarity
2722,Kings of the Road: The Story of the Portland B...,14/10/2010,The thrilling story of the original Western Ho...,,,Not Rated,0,0,0,0,...,0,0,0,1,0,0,0,0,"['thrilling', 'story', 'original', 'western', ...",0.218218
6011,Wyatt Earp,28/10/1994,"From Wichita to Dodge City, to the O.K. Corral...",6.7,52537.0,M/12,1,0,1,0,...,0,0,0,0,0,0,0,0,"['wichita', 'dodge', 'city', 'k', 'corral', 't...",0.204124
2868,Grand Theft Auto V,17/09/2013,Three very different criminals team up for a s...,9.5,64276.0,M,1,0,0,0,...,0,0,0,0,0,0,0,0,"['three', 'different', 'criminal', 'team', 'se...",0.200000
3094,Zan xian sheng yu zhao qian Hua,08/12/1982,Beloved for its combination of thrilling comba...,7.2,1434.0,R,1,0,0,0,...,0,0,0,0,0,0,0,0,"['beloved', 'combination', 'thrilling', 'comba...",0.196116
7823,Adventures of Sherlock Holmes,07/10/1905,The millionaire&apos;s child is kidnapped. She...,4.6,51.0,,0,0,0,0,...,0,0,1,0,0,0,0,0,"['millionaire', 'apos', 'child', 'kidnapped', ...",0.152499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2736,Obsession,27/09/2019,A farmhand begins an affair with his elderly b...,5.1,804.0,TV-MA,0,0,0,0,...,0,0,0,0,0,1,0,0,"['farmhand', 'begin', 'affair', 'elderly', 'bo...",0.000000
2735,"No Deposit, No Return",,An entrepreneur lets one of his friends talk h...,5.7,24.0,,1,0,0,0,...,0,0,0,0,0,0,0,0,"['entrepreneur', 'let', 'one', 'friend', 'talk...",0.000000
2734,Nightmare in Williamson County,,"Williamson County, Tennessee (home of such leg...",,,,0,0,0,0,...,0,0,0,0,0,0,0,0,"['williamson', 'county', 'tennessee', 'home', ...",0.000000
2733,Nickelodeon,21/12/1976,Buck and lawyer Leo accidentally get into movi...,6.2,2518.0,M/6,0,0,0,0,...,0,0,0,0,0,0,0,0,"['buck', 'lawyer', 'leo', 'accidentally', 'get...",0.000000


In [102]:
# TF-IDF similarity
vectorizer = TfidfVectorizer()
tfidf_bow = vectorizer.fit_transform(movies_df['Description Tokenized'])
user_input_tfidf_bow = vectorizer.transform(tokenized_user_input)
tfidf_scores = cosine_similarity(user_input_tfidf_bow, tfidf_bow)

# Sort movies based on similarity score
ranked_movies = movies_df.sort_values('similarity', ascending=False)


In [103]:
ranked_movies

Unnamed: 0,Name,Date Published,Description,Rating,Rating Count,Content Rating,Action,Adult,Adventure,Animation,...,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,Description Tokenized,similarity
2722,Kings of the Road: The Story of the Portland B...,14/10/2010,The thrilling story of the original Western Ho...,,,Not Rated,0,0,0,0,...,0,0,0,1,0,0,0,0,"['thrilling', 'story', 'original', 'western', ...",0.218218
6011,Wyatt Earp,28/10/1994,"From Wichita to Dodge City, to the O.K. Corral...",6.7,52537.0,M/12,1,0,1,0,...,0,0,0,0,0,0,0,0,"['wichita', 'dodge', 'city', 'k', 'corral', 't...",0.204124
2868,Grand Theft Auto V,17/09/2013,Three very different criminals team up for a s...,9.5,64276.0,M,1,0,0,0,...,0,0,0,0,0,0,0,0,"['three', 'different', 'criminal', 'team', 'se...",0.200000
3094,Zan xian sheng yu zhao qian Hua,08/12/1982,Beloved for its combination of thrilling comba...,7.2,1434.0,R,1,0,0,0,...,0,0,0,0,0,0,0,0,"['beloved', 'combination', 'thrilling', 'comba...",0.196116
7823,Adventures of Sherlock Holmes,07/10/1905,The millionaire&apos;s child is kidnapped. She...,4.6,51.0,,0,0,0,0,...,0,0,1,0,0,0,0,0,"['millionaire', 'apos', 'child', 'kidnapped', ...",0.152499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2736,Obsession,27/09/2019,A farmhand begins an affair with his elderly b...,5.1,804.0,TV-MA,0,0,0,0,...,0,0,0,0,0,1,0,0,"['farmhand', 'begin', 'affair', 'elderly', 'bo...",0.000000
2735,"No Deposit, No Return",,An entrepreneur lets one of his friends talk h...,5.7,24.0,,1,0,0,0,...,0,0,0,0,0,0,0,0,"['entrepreneur', 'let', 'one', 'friend', 'talk...",0.000000
2734,Nightmare in Williamson County,,"Williamson County, Tennessee (home of such leg...",,,,0,0,0,0,...,0,0,0,0,0,0,0,0,"['williamson', 'county', 'tennessee', 'home', ...",0.000000
2733,Nickelodeon,21/12/1976,Buck and lawyer Leo accidentally get into movi...,6.2,2518.0,M/6,0,0,0,0,...,0,0,0,0,0,0,0,0,"['buck', 'lawyer', 'leo', 'accidentally', 'get...",0.000000


In [104]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rake_nltk import Rake

# define the RAKE object
r = Rake()

# define the vectorizer
vectorizer = TfidfVectorizer()

# fit the vectorizer to your movie descriptions
tfidf_matrix = vectorizer.fit_transform(movies_df['Description Tokenized'])

# define a function to get the most similar movies
def get_similar_movies(user_input, n=10):
    keywords = r.extract_keywords_from_text(user_input)
    user_input_keywords = r.get_ranked_phrases()
    user_input_tfidf = vectorizer.transform(user_input_keywords)
    return cosine_similarity(user_input_tfidf, tfidf_matrix)
    

user_input = "In this thrilling fantasy film, a young wizard competes in a dangerous tournament between rival schools of magic. Along with his friends, he navigates challenging tasks and uncovers a dark conspiracy that threatens the entire wizarding world. With impressive special effects and a talented cast, this captivating adventure is full of action, humor, and heart."

cosine_similarities = get_similar_movies(user_input)


# Add a new column to movies_df with the similarity scores
movies_df = movies_df.assign(similarity=cosine_similarities[0])

# Sort movies based on similarity score
movies_df.sort_values('similarity', ascending=False)

Unnamed: 0,Name,Date Published,Description,Rating,Rating Count,Content Rating,Action,Adult,Adventure,Animation,...,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,Description Tokenized,similarity
4536,My Magical Demon Lover,14/05/2018,Tristan dreamed of being a wizard his entire l...,,,,0,0,1,0,...,0,0,0,0,0,0,0,0,"['tristan', 'dreamed', 'wizard', 'entire', 'li...",0.295848
502,The Sword in the Stone,05/11/1969,A poor boy named Arthur learns the power of lo...,7.1,100774.0,M/6,0,0,1,1,...,0,0,0,0,0,0,0,0,"['poor', 'boy', 'named', 'arthur', 'learns', '...",0.266616
2112,Harry Potter and the Goblet of Fire,24/11/2005,Harry Potter finds himself competing in a haza...,7.7,638825.0,M/12,0,0,1,0,...,0,0,0,0,0,0,0,0,"['harry', 'potter', 'find', 'competing', 'haza...",0.259601
6603,Harry Potter and the Prisoner of Azkaban,29/07/2004,"Harry Potter, Ron and Hermione return to Hogwa...",7.9,647147.0,M/6,0,0,1,0,...,0,0,0,0,0,0,0,0,"['harry', 'potter', 'ron', 'hermione', 'return...",0.250263
2261,Wicked: Part One,25/12/2024,The story of how a green-skinned woman framed ...,,,,0,0,0,0,...,0,0,0,0,0,0,0,0,"['story', 'green', 'skinned', 'woman', 'framed...",0.206121
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2928,Empire of the Ants,09/07/1982,Con artist Marilyn Fryser tries to sell bogus ...,4.2,5059.0,PG,0,0,1,0,...,0,1,0,0,0,0,0,0,"['con', 'artist', 'marilyn', 'fryser', 'try', ...",0.000000
2926,Greased Lightning,29/07/1928,"Diana Standish (Betty Caldwell), an eastern gi...",,,Passed,0,0,0,0,...,0,0,0,0,0,0,0,1,"['diana', 'standish', 'betty', 'caldwell', 'ea...",0.000000
2925,2001: A Space Odyssey,01/10/1968,After uncovering a mysterious artifact buried ...,8.3,677182.0,M/12,0,0,1,0,...,0,1,0,0,0,0,0,0,"['uncovering', 'mysterious', 'artifact', 'buri...",0.000000
2924,For the Love of Benji,10/06/1977,"Benji sniffs out a bogus CIA agent in Athens, ...",5.9,1178.0,G,0,0,1,0,...,0,0,0,0,0,0,0,0,"['benji', 'sniff', 'bogus', 'cia', 'agent', 'a...",0.000000
