In [1]:
import pandas as pd

# Read the three MovieLens 100k tables used by the rest of the notebook.
# None of the files is read with a header row, so column names are supplied here.

# Ratings: tab-separated, one row per (user, movie) rating event.
rating_columns = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv('ml-100k/u.data', names=rating_columns, sep='\t')

# Users: pipe-separated demographic attributes keyed by user_id.
user_columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', names=user_columns, sep='|')

# Movies: pipe-separated, decoded as latin-1. Five descriptive columns are
# followed by the genre indicator columns (including an 'unknown' bucket).
item_meta_columns = ['item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL']
item_flag_columns = [
    'unknown',
    'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
items = pd.read_csv(
    'ml-100k/u.item',
    names=item_meta_columns + item_flag_columns,
    sep='|',
    encoding='latin-1',
)

# Sanity check: print the first rows of each table (for movies, only the id,
# title and a handful of genre flags) to confirm the columns lined up.
for preview in (ratings, users, items[['item_id','title','Action','Comedy','Romance']]):
    print(preview.head())


   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596
   user_id  age gender  occupation zip_code
0        1   24      M  technician    85711
1        2   53      F       other    94043
2        3   23      M      writer    32067
3        4   24      M  technician    43537
4        5   33      F       other    15213
   item_id              title  Action  Comedy  Romance
0        1   Toy Story (1995)       0       1        0
1        2   GoldenEye (1995)       1       0        0
2        3  Four Rooms (1995)       0       0        0
3        4  Get Shorty (1995)       1       1        0
4        5     Copycat (1995)       0       0        0


In [2]:
from sklearn.metrics.pairwise import cosine_similarity

# User x movie rating matrix: one row per user_id, one column per item_id.
# Movies a user has not rated come out of the pivot as NaN and are filled
# with 0, so "unrated" is treated as a zero component in the cosine similarity.
# Built in a single chain so the name is bound exactly once (re-running the
# cell always yields the same object lineage).
user_movie_matrix = (
    ratings
    .pivot_table(index='user_id', columns='item_id', values='rating')
    .fillna(0)
)

# Pairwise cosine similarity between user rating vectors. This is a plain
# ndarray whose row/column i corresponds to user_movie_matrix.index[i];
# it is kept as an array (not a DataFrame) because later cells index it
# positionally.
user_similarity = cosine_similarity(user_movie_matrix)

# Show similarity between the first 5 users. The labels are taken from the
# matrix index rather than hard-coded, so the table cannot be mislabelled if
# the first five user ids are not 1..5.
preview_ids = user_movie_matrix.index[:5].tolist()
pd.DataFrame(user_similarity[:5, :5], index=preview_ids, columns=preview_ids)


Unnamed: 0,1,2,3,4,5
1,1.0,0.166931,0.04746,0.064358,0.378475
2,0.166931,1.0,0.110591,0.178121,0.072979
3,0.04746,0.110591,1.0,0.344151,0.021245
4,0.064358,0.178121,0.344151,1.0,0.031804
5,0.378475,0.072979,0.021245,0.031804,1.0


In [3]:
# Genre indicator columns of the movie table that make up a taste profile
# (the 'unknown' bucket is deliberately left out).
genre_cols = [
    'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Attach each rated movie's genre flags to its rating row
# (default inner join on item_id).
ratings_with_genres = pd.merge(ratings, items[['item_id', *genre_cols]], on='item_id')

# A user's profile is the mean of the genre flags over the movies they liked
# (rating of 4 or more), i.e. the share of their liked movies carrying each
# genre. Users with no rating >= 4 do not get a row in user_profiles.
liked_mask = ratings_with_genres['rating'].ge(4)
user_profiles = (
    ratings_with_genres.loc[liked_mask]
    .groupby('user_id')[genre_cols]
    .mean()
)
user_profiles.head()


Unnamed: 0_level_0,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0.239264,0.104294,0.030675,0.030675,0.300613,0.092025,0.030675,0.478528,0.006135,0.006135,0.042945,0.03681,0.018405,0.184049,0.196319,0.184049,0.092025,0.018405
2,0.175,0.075,0.025,0.05,0.3,0.15,0.0,0.575,0.0,0.05,0.0,0.0,0.025,0.325,0.05,0.15,0.05,0.0
3,0.2,0.133333,0.0,0.0,0.2,0.266667,0.066667,0.533333,0.0,0.0,0.0,0.0,0.266667,0.133333,0.133333,0.333333,0.133333,0.0
4,0.263158,0.105263,0.0,0.0,0.210526,0.210526,0.052632,0.263158,0.0,0.0,0.052632,0.052632,0.157895,0.105263,0.210526,0.368421,0.105263,0.0
5,0.396552,0.241379,0.137931,0.086207,0.568966,0.12069,0.0,0.12069,0.017241,0.017241,0.137931,0.068966,0.017241,0.103448,0.327586,0.086207,0.12069,0.017241


In [8]:
user_id = 5  # Try any user

# --- Collaborative filtering: find the most similar users --------------------
# Rows of user_similarity follow the row order of user_movie_matrix, so the
# user's row is looked up through that index instead of assuming that ids are
# exactly 1..N (the old `user_id - 1` / `+ 1` arithmetic).
all_user_ids = user_movie_matrix.index.to_numpy()
user_row = user_movie_matrix.index.get_loc(user_id)  # KeyError if the user has no ratings
similar_users = user_similarity[user_row]

# Rank everyone from most to least similar, then drop the user themself by
# position. Slicing with [-6:-1] relied on the self-similarity sorting strictly
# last, which breaks on ties (another user with an identical rating vector) or
# floating-point round-off.
ranked_rows = similar_users.argsort()[::-1]
ranked_rows = ranked_rows[ranked_rows != user_row]
top_similar = all_user_ids[ranked_rows[:5]]  # Top 5, exclude self

print("Top 5 similar users:", top_similar)

# --- Content-based: score movies against the user's genre profile ------------
# user_profiles only has rows for users with at least one rating >= 4, so this
# lookup raises KeyError for a user without any liked movie.
user_profile = user_profiles.loc[user_id]
item_profiles = items[genre_cols]

# Score = dot product of a movie's genre flags with the user's genre shares.
# NOTE(review): this is an unnormalised score, so movies tagged with many
# genres tend to rank higher, and movies the user has already rated are not
# filtered out. Both are left as-is here so the recorded results stay valid.
scores = item_profiles.dot(user_profile)
top_items = scores.sort_values(ascending=False).head(5).index

print("Top content-based recommendations:", items.loc[top_items, 'title'].values)


Top 5 similar users: [307 648 407 497 660]
Top content-based recommendations: ['Army of Darkness (1993)' 'Men in Black (1997)' 'Mars Attacks! (1996)'
 'Tank Girl (1995)' 'Muppet Treasure Island (1996)']
