In [None]:
import os
import pandas as pd
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split

# Step 1: Path to your dataset

file_path = r"C:/Movie Rec System/ml-100k/u.data"  


# Step 2: Load dataset into Surprise

reader = Reader(line_format="user item rating timestamp", sep="\t")
data = Dataset.load_from_file(file_path, reader=reader)


# Step 3: Train/Test split

trainset, testset = train_test_split(data, test_size=0.2)


# Step 4: Build & train model

model = SVD()
model.fit(trainset)


# Step 5: Evaluate model

predictions = model.test(testset)
rmse = accuracy.rmse(predictions)

print("\n Model training complete!")
print(f"Test RMSE: {rmse:.4f}")


# Step 6: Make Recommendations for one user

def recommend_for_user(user_id, model, data, n=5):
    # Get all items
    trainset = model.trainset
    all_items = trainset.all_items()
    all_item_ids = [trainset.to_raw_iid(i) for i in all_items]

    # Predict ratings for every item not seen by user
    user_seen = set([j for (j, _) in trainset.ur[trainset.to_inner_uid(str(user_id))]])
    predictions = [
        (iid, model.predict(str(user_id), iid).est)
        for iid in all_item_ids if trainset.to_inner_iid(iid) not in user_seen
    ]

    # Sort by predicted rating
    top_n = sorted(predictions, key=lambda x: x[1], reverse=True)[:n]
    return top_n

# Example: Recommend for user 196
user_id = 196
recommendations = recommend_for_user(user_id, model, data, n=5)

print(f"\n🎬 Top 5 Recommendations for User {user_id}:")
for movie_id, score in recommendations:
    print(f"Movie ID {movie_id} | Predicted Rating: {score:.2f}")


RMSE: 0.9383

✅ Model training complete!
Test RMSE: 0.9383

🎬 Top 5 Recommendations for User 196:
Movie ID 169 | Predicted Rating: 4.51
Movie ID 408 | Predicted Rating: 4.49
Movie ID 511 | Predicted Rating: 4.48
Movie ID 197 | Predicted Rating: 4.47
Movie ID 513 | Predicted Rating: 4.44


In [2]:
import pandas as pd

# path to your dataset
file_path = r"C:/Movie Rec System/ml-100k/u.data"

# Load with pandas (just to check structure)
columns = ["user_id", "movie_id", "rating", "timestamp"]
df = pd.read_csv(file_path, sep="\t", names=columns)

print(df.head())
print(df.shape)


   user_id  movie_id  rating  timestamp
0      196       242       3  881250949
1      186       302       3  891717742
2       22       377       1  878887116
3      244        51       2  880606923
4      166       346       1  886397596
(100000, 4)


In [3]:
from surprise import Dataset, Reader

# Path to your dataset
file_path = r"C:/Movie Rec System/ml-100k/u.data"

# Define the format of the file (order of columns, separator, rating scale)
reader = Reader(line_format="user item rating timestamp", sep="\t", rating_scale=(1, 5))

# Load the dataset
data = Dataset.load_from_file(file_path, reader=reader)

print("MovieLens data loaded into Surprise!")


MovieLens data loaded into Surprise!


In [4]:
from surprise.model_selection import train_test_split

# Split into train (80%) and test (20%)
trainset, testset = train_test_split(data, test_size=0.2)

print("Trainset and testset ready!")


Trainset and testset ready!


In [None]:
from surprise import KNNBasic
from surprise import accuracy

# Using KNN collaborative filtering
algo = KNNBasic()

# Train the model on the trainset
algo.fit(trainset)

# Test the model on the testset
predictions = algo.test(testset)

# Check performance
rmse = accuracy.rmse(predictions)
print("RMSE:", rmse)


Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9773
RMSE: 0.9773056480824995


In [None]:
# Pick a user
user_id = str(1)  

# Get all movie IDs
all_movie_ids = df["movie_id"].unique()

# Get movies the user has already rated
rated_movies = df[df["user_id"] == int(user_id)]["movie_id"].tolist()

# Recommend only the ones not rated
unrated_movies = [m for m in all_movie_ids if m not in rated_movies]

# Predict ratings for these unrated movies
predictions = [algo.predict(user_id, m) for m in unrated_movies]

# Sort by predicted rating
predictions.sort(key=lambda x: x.est, reverse=True)

# Top 5 recommendations
top5 = predictions[:5]

print("Top 5 recommendations for user", user_id)
for p in top5:
    print(f"MovieID: {p.iid}, Predicted Rating: {p.est:.2f}")


Top 5 recommendations for user 1
MovieID: 302, Predicted Rating: 3.53
MovieID: 377, Predicted Rating: 3.53
MovieID: 346, Predicted Rating: 3.53
MovieID: 474, Predicted Rating: 3.53
MovieID: 465, Predicted Rating: 3.53


In [7]:
import pandas as pd

# Load movie metadata (u.item has | separator)
movies = pd.read_csv(
    r"C:\Movie Rec System/ml-100k/u.item",
    sep="|",
    header=None,
    encoding="latin-1",
    usecols=[0, 1],  # movie_id, title
    names=["movie_id", "title"]
)

# Convert to dictionary {movie_id: title}
movie_dict = dict(zip(movies["movie_id"], movies["title"]))


In [8]:
print("Top 5 recommendations for user", user_id)
for p in top5:
    movie_name = movie_dict.get(int(p.iid), "Unknown Movie")
    print(f"{movie_name} (Predicted Rating: {p.est:.2f})")


Top 5 recommendations for user 1
L.A. Confidential (1997) (Predicted Rating: 3.53)
Heavyweights (1994) (Predicted Rating: 3.53)
Jackie Brown (1997) (Predicted Rating: 3.53)
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963) (Predicted Rating: 3.53)
Jungle Book, The (1994) (Predicted Rating: 3.53)


In [9]:
# u.item has many columns, genres are in last 19 (0/1 for each genre)
genre_cols = [
    "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy", 
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", 
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]

movies_full = pd.read_csv(
    "C:\Movie Rec System/ml-100k/u.item",
    sep="|",
    header=None,
    encoding="latin-1"
)

# Extract only movie_id, title + genres
movies_with_genres = movies_full.iloc[:, [0,1] + list(range(5, 24))]
movies_with_genres.columns = ["movie_id", "title"] + genre_cols


  "C:\Movie Rec System/ml-100k/u.item",


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Just take the genre vectors
genre_features = movies_with_genres[genre_cols].values

# Compute cosine similarity
cosine_sim = cosine_similarity(genre_features, genre_features)


In [11]:
def recommend_by_genre(movie_title, top_n=5):
    # Find the index of the movie
    idx = movies_with_genres[movies_with_genres["title"] == movie_title].index[0]
    
    # Get similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort by similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get top_n similar movies (skip the first one because it's the same movie)
    sim_scores = sim_scores[1:top_n+1]
    
    movie_indices = [i[0] for i in sim_scores]
    return movies_with_genres.iloc[movie_indices][["title"]]


In [None]:
import numpy as np

def hybrid_recommend(user_id, movie_title, top_n=5, alpha=0.7):
    """
    Hybrid recommender that combines collaborative filtering and content-based filtering.
    
    user_id: the ID of the user we’re recommending for
    movie_title: a movie the user already liked (for content similarity)
    top_n: number of recommendations
    alpha: weight for collaborative filtering (0.0–1.0)
           (alpha closer to 1 → CF dominates, closer to 0 → CBF dominates)
    """
    
    #  Content-based part 
    idx = movies_with_genres[movies_with_genres["title"] == movie_title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:50]   # take top 50 similar movies
    movie_indices = [i[0] for i in sim_scores]
    cbf_movies = movies_with_genres.iloc[movie_indices][["movie_id", "title"]]
    cbf_scores = np.array([s[1] for s in sim_scores])
    
    #  Collaborative part 
    cf_scores = []
    for mid in cbf_movies["movie_id"]:
        try:
            pred = algo.predict(user_id, mid).est
        except:
            pred = 0
        cf_scores.append(pred)
    cf_scores = np.array(cf_scores)
    
    #  Combine 
    final_scores = alpha * cf_scores + (1 - alpha) * cbf_scores
    
    # Get top_n results
    cbf_movies["score"] = final_scores
    cbf_movies = cbf_movies.sort_values("score", ascending=False)
    
    return cbf_movies.head(top_n)[["title", "score"]]
