In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
df = pd.read_csv('/content/IMDb Movies India.csv', encoding='latin-1')

In [3]:
df.columns

Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3'],
      dtype='object')

In [4]:
# Remove leading/trailing spaces, extra spaces in between, and make title case
df['Name'] = df['Name'].str.strip()                  # remove leading/trailing spaces
df['Name'] = df['Name'].str.replace(r'\s+', ' ', regex=True)  # replace multiple spaces with one
df['Name'] = df['Name'].str.title()                  # make each word capitalized


In [5]:
import re

def clean_movie_name(name):
    name = re.sub(r'\([^)]*\)', '', name)  # remove text in parentheses (e.g., year)
    name = re.sub(r'[^A-Za-z0-9\s]', '', name)  # remove special chars
    return name.strip()

df['Name'] = df['Name'].apply(clean_movie_name)


In [6]:
print(df['Name'].head())


0                  
1            Gadhvi
2        Homecoming
3            Yaaram
4    And Once Again
Name: Name, dtype: object


In [7]:
df.isnull().mean()

Unnamed: 0,0
Name,0.0
Year,0.034045
Duration,0.533174
Genre,0.121027
Rating,0.489393
Votes,0.489329
Director,0.033851
Actor 1,0.104262
Actor 2,0.153717
Actor 3,0.202721


In [8]:
# Fill text columns with empty strings
text_cols = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
df[text_cols] = df[text_cols].fillna("")

# Clean 'Year' column
df['Year'] = df['Year'].str.replace(r'[()]', '', regex=True)

# Clean 'Duration' column
df['Duration'] = df['Duration'].str.replace(' min', '', regex=False)

# Convert numeric columns to numeric type
num_cols = ['Year', 'Duration', 'Rating', 'Votes']
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Fill numeric columns with median
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack

# 1️⃣ Combine text columns into one string per movie
df['combined_text'] = df['Genre'] + " " + df['Director'] + " " + df['Actor 1'] + " " + df['Actor 2'] + " " + df['Actor 3']

# 2️⃣ Vectorize text features
cv = CountVectorizer()
text_features = cv.fit_transform(df['combined_text'])

# 3️⃣ Scale numeric features
scaler = MinMaxScaler()
numeric_features = scaler.fit_transform(df[['Year', 'Duration', 'Rating', 'Votes']])

# 4️⃣ Combine both feature sets
final_features = hstack([text_features, numeric_features])


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(final_features)


In [11]:
def recommend(movie_name):
    # Find movie index
    if movie_name not in df['Name'].values:
        print(f"Movie '{movie_name}' not found in the database.")
        return

    idx = df[df['Name'] == movie_name].index[0]

    # Get similarity scores
    scores = list(enumerate(similarity[idx]))

    # Sort by similarity
    scores = sorted(scores, key=lambda x: x[1], reverse=True)

    # Print top 5 recommendations
    print(f"Recommendations for '{movie_name}':")
    for i in scores[1:6]:
        print(df.iloc[i[0]]['Name'], "-", round(i[1], 2))

In [12]:
recommend('Yaaram')

Recommendations for 'Yaaram':
Around The World - 0.54
Bobby - 0.52
Biwi No 1 - 0.51
Dil Hi To Hai - 0.51
Me And Mrs Khanna - 0.51


In [15]:
import pickle

# Store both df and similarity in one object
data = {
    'movies_df': df,
    'similarity': similarity
}

# Save as one pickle file
with open('indian_movies.pkl', 'wb') as f:
    pickle.dump(data, f)


In [17]:
recommend('Homecoming')

Recommendations for 'Homecoming':
Dhanna - 0.47
Atin Ella And Charadhaya - 0.46
Iftikhar - 0.45
Reincarnation - 0.45
Organic Motion Pictures - 0.45
