In [1]:
from google.colab import files
# Open Colab's file-upload prompt. The file must be named exactly
# 'TeluguMovies_dataset.csv', because the loading cell reads it by that
# hard-coded name from the working directory.
# NOTE(review): `uploaded` is not read again in the visible cells; presumably
# it holds the uploaded file contents keyed by file name — confirm before relying on it.
uploaded = files.upload()  # Choose TeluguMovies_dataset.csv



Saving TeluguMovies_dataset.csv to TeluguMovies_dataset.csv


In [2]:
import pandas as pd

# Read the CSV that was uploaded in the previous cell (path is relative to the
# current working directory).
df = pd.read_csv('TeluguMovies_dataset.csv')

# Show first few rows.
# The preview shows two leftover index columns ('Unnamed: 0.1', 'Unnamed: 0')
# next to the movie fields; the visible cells only use 'Movie', 'Genre',
# 'Overview' and 'Year'.
df.head()


Unnamed: 0.1,Unnamed: 0,Movie,Year,Certificate,Genre,Overview,Runtime,Rating,No.of.Ratings
0,0,Bahubali: The Beginning,2015.0,UA,"Action, Drama","In ancient India, an adventurous and darin...",159,8.1,99114
1,1,Baahubali 2: The Conclusion,2017.0,UA,"Action, Drama","When Shiva, the son of Bahubali, learns ab...",167,8.2,71458
2,2,1 - Nenokkadine,2014.0,UA,"Action, Thriller",A rock star must overcome his psychologica...,170,8.1,42372
3,3,Dhoom:3,2013.0,UA,"Action, Thriller","When Sahir, a circus entertainer trained i...",172,5.4,42112
4,4,Ra.One,2011.0,U,"Action, Adventure, Sci-Fi",When the titular antagonist of an action g...,156,4.6,37211


In [3]:
# Blank out NaN in both text columns in one step, so joining them below
# always yields a string rather than NaN.
df[['Overview', 'Genre']] = df[['Overview', 'Genre']].fillna('')

# 'content' is the text the recommender compares: the plot overview followed
# by the genre list, separated by a single space.
df['content'] = df['Overview'].str.cat(df['Genre'], sep=' ')


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer; stop_words='english' selects scikit-learn's
# built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the content column: learns the vocabulary from
# df['content'] and returns one row per movie, one column per term.
tfidf_matrix = tfidf.fit_transform(df['content'])

# Optional: check the shape -> (number of movies, number of vocabulary terms)
print("TF-IDF shape:", tfidf_matrix.shape)


TF-IDF shape: (1400, 5210)


In [5]:
from sklearn.metrics.pairwise import cosine_similarity

# Movie-by-movie similarity matrix: entry [i, j] is the cosine similarity
# between the TF-IDF vectors of rows i and j. With a single argument the
# matrix is compared against itself.
cosine_sim = cosine_similarity(tfidf_matrix)


In [6]:
# Reset to a clean 0..n-1 index so that row labels equal row positions:
# recommend() uses the values stored in `indices` both as row numbers into
# cosine_sim and as positions for df.iloc.
df = df.reset_index(drop=True)

# Map lower-cased movie title -> row position (case-insensitive lookup).
indices = pd.Series(df.index, index=df['Movie'].str.lower())

# Keep only the first row for each title.
# Series.drop_duplicates() cannot be used for this: it de-duplicates the
# *values* (the row positions, which are already unique) and leaves repeated
# titles in the index, so indices[title] would return a Series instead of a
# single position and break the similarity lookup.
indices = indices[~indices.index.duplicated(keep='first')]


In [7]:
def recommend(title, cosine_sim=cosine_sim, top_n=5):
    """Return the movies whose content is most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up; matching is case-insensitive.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix whose row/column order matches ``df``.
        Defaults to the notebook-level ``cosine_sim``.
    top_n : int, optional
        Maximum number of recommendations to return (default 5). Values
        below zero are treated as zero.

    Returns
    -------
    pandas.DataFrame or str
        The 'Movie', 'Genre' and 'Year' columns of the most similar movies,
        ordered from most to least similar, or the message
        "❌ Movie not found in dataset." when the title is unknown.
    """
    title = title.lower()

    if title not in indices:
        return "❌ Movie not found in dataset."

    idx = indices[title]
    # A title that occurs more than once in the lookup yields a Series of
    # positions; use the first occurrence so idx is always a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Similarity of the requested movie to every movie in the dataset
    scores = cosine_sim[idx]

    # Exclude the requested movie by position instead of assuming it sorts
    # first: another movie with identical content (or an all-zero similarity
    # row for a movie with no usable text) can sort ahead of it, in which case
    # dropping the first hit would discard a real match and return the movie
    # itself. sorted() is stable, so ties keep dataset order.
    candidates = (i for i in range(len(scores)) if i != idx)
    ranked = sorted(candidates, key=lambda i: scores[i], reverse=True)
    movie_indices = ranked[:max(top_n, 0)]

    # Return similar movies with Genre and Year
    return df[['Movie', 'Genre', 'Year']].iloc[movie_indices]


In [8]:
# Example query; recommend() lower-cases the title, so the lookup key is "u turn".
recommend("U Turn")


Unnamed: 0,Movie,Genre,Year
801,Srimannarayana,Drama,2012.0
230,Billa,"Action, Thriller",2009.0
1007,Rowdy Inspector,"Action, Drama",1992.0
240,Anukokunda Oka Roju,"Mystery, Thriller",2005.0
1366,Neeku Naaku Dash Dash,Thriller,2012.0
