In [2]:
import pandas as pd
import numpy as np


In [3]:
movies= pd.read_csv("C:\\Users\\ramva\\Downloads\\ml-latest-small\\ml-latest-small\\movies.csv")
ratings=pd.read_csv("C:\\Users\\ramva\\Downloads\\ml-latest-small\\ml-latest-small\\ratings.csv")
links = pd.read_csv("C:\\Users\\ramva\\Downloads\\ml-latest-small\\ml-latest-small\\links.csv")
tags = pd.read_csv("C:\\Users\\ramva\\Downloads\\ml-latest-small\\ml-latest-small\\tags.csv")

In [4]:
duplicate_countm = movies.duplicated().sum()
print("Number of duplicate rows:", duplicate_countm)

# Drop duplicates
movies.drop_duplicates(inplace=True)

Number of duplicate rows: 0


In [5]:
duplicate_countl = links.duplicated(subset=['movieId']).sum()
print("Number of duplicate rows based on 'movieId':", duplicate_countl)

# Drop duplicates based on 'movieid' column
links.drop_duplicates(subset=['movieId'], inplace=True)

Number of duplicate rows based on 'movieId': 0


In [6]:
missingr=ratings.isnull().sum()
print("Number of duplicate rows:", missingr)
# Handle missing values by dropping rows with missing data
ratings.dropna(inplace=True)

Number of duplicate rows: userId       0
movieId      0
rating       0
timestamp    0
dtype: int64


In [7]:
tags=tags.isnull().sum()
print("Number of duplicate rows:", tags)
# Handle missing values by dropping rows with missing data
tags.dropna(inplace=True)

Number of duplicate rows: userId       0
movieId      0
tag          0
timestamp    0
dtype: int64


In [8]:
missingl=links.isnull().sum()
print("Number of duplicate rows:", missingl)
# Handle missing values by dropping rows with missing data
links.dropna(inplace=True)

Number of duplicate rows: movieId    0
imdbId     0
tmdbId     8
dtype: int64


In [9]:
missingm=movies.isnull().sum()
print("Number of duplicate rows:", missingm)
# Handle missing values by dropping rows with missing data
movies.dropna(inplace=True)

Number of duplicate rows: movieId    0
title      0
genres     0
dtype: int64


In [10]:
ratings = ratings[(ratings['rating'] >= 1) & (ratings['rating'] <= 5)]

In [11]:
valid_movieId= set(movies['movieId'])
ratings=ratings[ratings['movieId'].isin(valid_movieId)]

In [12]:
print("Summary after cleaning:")
print(ratings.describe())

Summary after cleaning:
             userId        movieId        rating     timestamp
count  99466.000000   99466.000000  99466.000000  9.946600e+04
mean     326.011743   19313.638178      3.542899  1.204280e+09
std      182.584785   35418.702903      0.987945  2.164819e+08
min        1.000000       1.000000      1.000000  8.281246e+08
25%      177.000000    1198.000000      3.000000  1.009694e+09
50%      325.000000    2985.000000      3.500000  1.182908e+09
75%      477.000000    7844.750000      4.000000  1.435891e+09
max      610.000000  193609.000000      5.000000  1.537799e+09


In [13]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [14]:
def preprocess_text(text):
    text = text.lower()
   
    text = re.sub(r'[^\w\s]', '', text)

    tokens = word_tokenize(text)
    
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return ' '.join(tokens)

In [15]:
movies['title'] = movies['title'].apply(preprocess_text)
movies.to_csv("C:\\Users\\ramva\\Downloads\\ml-latest-small\\ml-latest-small\\movies.csv", index=False)
print("Preprocessing of movie title is done")

Preprocessing of movie title is done


In [16]:
from numpy.linalg import norm

In [17]:
#Extract unique movieId and userId from data
unique_movies = movies['movieId'].unique()
unique_users = ratings['userId'].unique()


In [18]:
user_movie_ratings = np.zeros((len(unique_movies), len(unique_users)))
movie_id_to_idx = {movie_id: idx for idx, movie_id in enumerate(unique_movies)}

In [19]:
for row_index, row in ratings.iterrows():
    user_id = int(row['userId'])
    movie_id = row['movieId']
    if movie_id in movie_id_to_idx:
        movie_idx = movie_id_to_idx[movie_id]
        user_movie_ratings[movie_idx, user_id - 1] = row['rating']

In [20]:
def cosine_similarity(a, b):
    norm_a = norm(a)
    norm_b = norm(b)
    
    if norm_a == 0 or norm_b == 0:
        return 0.0
    
    return np.dot(a, b) / (norm_a * norm_b)

In [21]:
cosine_similarity_matrix = np.zeros((len(unique_movies), len(unique_movies)))

for i in range(len(unique_movies)):
    for j in range(i, len(unique_movies)):
        movie_i_ratings = user_movie_ratings[i]
        movie_j_ratings = user_movie_ratings[j]
        similarity = cosine_similarity(movie_i_ratings, movie_j_ratings)
        cosine_similarity_matrix[i, j] = similarity
        cosine_similarity_matrix[j, i] = similarity

In [22]:
def get_similar_movies(movie_name, n_recommendations):
    movie_indices = movies[movies['title'] == movie_name].index
    if not movie_indices.empty:
        movie_idx = movie_indices[0]
        similarities = cosine_similarity_matrix[movie_idx]
        similar_movie_indices = np.argsort(similarities)[::-1][1:n_recommendations+1]

        recommended_movies = [movies['title'][idx] for idx in similar_movie_indices]
        return recommended_movies
    else:
        return []

In [23]:
user_movie_name = input("Enter a movie name:")
n = int(input("Enter desired no.of recommendations:"))
list = get_similar_movies(user_movie_name,n)
print("Recommended movies are:")
for a in list:
    print(a)

Enter a movie name:iron man 2008
Enter desired no.of recommendations:5
Recommended movies are:
dark knight 2008
walle 2008
avenger 2012
iron man 2 2010
avatar 2009
