# Movies Recommendations #


In [9]:
import pandas as pd


In [10]:
with open('ratings.dat', 'r', encoding = 'latin1') as f:
    for i in range(5):
      line = f.readline() # 2562::Bandits (1997)::Drama
      print(repr(line))
# Proper columns:
  # movies.dat: Movie ID, Title, Genre
  # ratings.dat: UserID, Movie ID, Rating, Timestamp
  # users.dat: UserID, Gender, Age,

'1::1193::5::978300760\n'
'1::661::3::978302109\n'
'1::914::3::978301968\n'
'1::3408::4::978300275\n'
'1::2355::5::978824291\n'


In [11]:
# Data Preprocessing: read data, add columns, print
f_movies = pd.read_csv('movies.dat',sep='::',encoding = 'latin1', engine = 'python', header = None, names = ['Movie ID', 'Title', 'Genre'])
f_ratings = pd.read_csv('ratings.dat',sep='::',encoding = 'latin1',engine = 'python', header = None, names = ['User ID','Movie ID', 'Rating', 'Timestamp'])
f_users = pd.read_csv('users.dat',sep='::',encoding = 'latin1')
print(f_movies.head(), f_movies.shape) # (3883, 3)
print(f_ratings.head(), f_ratings.shape) # (1000209, 4)
print(f_users.head(), f_users.shape)

   Movie ID                               Title                         Genre
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy (3883, 3)
   User ID  Movie ID  Rating  Timestamp
0        1      1193       5  978300760
1        1       661       3  978302109
2        1       914       3  978301968
3        1      3408       4  978300275
4        1      2355       5  978824291 (1000209, 4)
   1  F  1.1  10  48067
0  2  M   56  16  70072
1  3  M   25  15  55117
2  4  M   45   7  02460
3  5  M   25  20  55455
4  6  F   50   9  55117 (6039, 5)


  f_users = pd.read_csv('users.dat',sep='::',encoding = 'latin1')


##Part A: Top 10 Most Popular Movies
I define popularity using IMDB-style **weighted rating**, which balances a movie's own average rating with the gloval mean rating, while regularizing the influence of movies that receive few ratings.\
Parameters Used:


*   R = per-movie mean rating
*   v = number of ratings per movie
*   C = global mean rating over all ratings

*   m = minimum number of ratings thresholds = 25 percentile of v

Intuition:
*  When v is large, the weighted score relies more heavily on the movie’s own rating R
*  When v is small, the score is pulled toward the global mean C, reducing the effect of noisy or unreliable averages.

This approach fairly compares both popular and less-watched films by accounting for sample size.

**Result: using the metrics, the Top 10 most popular movies are**

309                    Shawshank Redemption, The (1994)   
1839  Seven Samurai (The Magnificent Seven) (Shichin...   
802                               Godfather, The (1972)   
49                           Usual Suspects, The (1995)   
513                             Schindler's List (1993)   
708                               Close Shave, A (1995)   
1066                         Wrong Trousers, The (1993)   
1108                     Raiders of the Lost Ark (1981)   
843                                  Rear Window (1954)   
253           Star Wars: Episode IV - A New Hope (1977)


In [12]:
R = f_ratings.groupby('Movie ID')['Rating'].mean()
v = f_ratings.groupby('Movie ID')['User ID'].count()
movie_stats = pd.DataFrame({'avg_rating': R, 'avg_count': v})
# print(movie_stats.head())
movie_stats.describe()
# average rating: 3.24, avg_count: 270

m = movie_stats['avg_count'].quantile(0.25) # 33
C = f_ratings['Rating'].mean() #3.5

# weighted popularity score
movie_stats['weighted_score'] = (v/(v+m))*R + (m/(v+m))*C
movie_stats['weighted_score'].describe()
# weighted score mean: 3.4

# Top 10
movie_stats.reset_index(inplace=True)
# movie_stats.sort_values('weighted_score', ascending = False).head(10)
result = pd.merge(movie_stats, f_movies, on = 'Movie ID').sort_values('weighted_score', ascending = False).head(10)
print(result)

      Movie ID  avg_rating  avg_count  weighted_score  \
309        318    4.554558       2227        4.540350   
1839      2019    4.560510        628        4.511636   
802        858    4.524966       2223        4.511167   
49          50    4.517106       1783        4.500106   
513        527    4.510417       2304        4.497301   
708        745    4.520548        657        4.475640   
1066      1148    4.507937        882        4.474526   
1108      1198    4.477725       2514        4.466114   
843        904    4.476190       1050        4.448930   
253        260    4.453694       2991        4.444177   

                                                  Title  \
309                    Shawshank Redemption, The (1994)   
1839  Seven Samurai (The Magnificent Seven) (Shichin...   
802                               Godfather, The (1972)   
49                           Usual Suspects, The (1995)   
513                             Schindler's List (1993)   
708               

##Part B: Make a Recommendation Based on IBCF

Although user.dat contains 6039 users, only 1114 users appear in the ratings dataset. The rating matrix is 1114 x 3426 including only users with actual rating activity.

In [13]:
# Matrix creation
R = pd.pivot_table(f_ratings,index = 'User ID',
                                columns = 'Movie ID', values = 'Rating')
# Centering each row
user_mean = R.mean(axis=1) # exclude NA value
R_centered = R.sub(user_mean, axis=0)
R_centered.head()


Movie ID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
User ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.811321,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,-1.146465,,,,,...,,,,,,,,,,


In [15]:
from numpy import dot
import numpy as np
# Consine similarity

# Create consine similarity function
def item_cosine(i, j, R_centered, min_common = 3):
    col_i = R_centered[i]
    col_j = R_centered[j]
    mask = col_i.notna() & col_j.notna()
    vec_i = col_i[mask]
    vec_j = col_j[mask]
    if len(vec_j) < min_common:
      return np.nan
    dot_product = vec_i.dot(vec_j)
    norm_i = np.sqrt(sum(vec_i**2))
    norm_j = np.sqrt(sum(vec_j**2))
    if norm_i == 0 or norm_j == 0:
      return np.nan
    cosine_sim = dot_product/(norm_i * norm_j)
    result = (1+cosine_sim)/2
    return result


movie_ids = R_centered.columns
# S = pd.DataFrame(np.nan, index=movie_ids, columns = movie_ids)
# for i in movie_ids:
#   for j in movie_ids:
#     if i <= j:
#       continue
#     sim = item_cosine(i, j, R_centered)
#     if not np.isnan(sim):
#       S.loc[i,j] = sim
#       S.loc[j,i] = sim
# S.head()
# movie_users = {mid: set(R_centered[mid].dropna().index) for mid in movie_ids}
# S = pd.DataFrame(np.nan, index=movie_ids, columns=movie_ids)


# for idx_i, i in enumerate(movie_ids):
#     for j in movie_ids[idx_i+1:]:

#         # 快速 filter：共同用户少于 3 → 直接跳过
#         if len(movie_users[i] & movie_users[j]) < 3:
#             continue

#         sim = item_cosine(i, j, R_centered)

#         S.loc[i, j] = sim
#         S.loc[j, i] = sim
import numpy as np


X = R_centered.to_numpy()       # shape: (num_users, num_movies)

mask = ~np.isnan(X)             # True = 有评分


X_filled = np.nan_to_num(X, nan=0.0)


norms = np.linalg.norm(X_filled, axis=0)   # shape: (num_movies,)


dot_matrix = X_filled.T @ X_filled         # shape: (num_movies, num_movies)


den = np.outer(norms, norms)               # same shape as dot_matrix


cosine = dot_matrix / den
cosine[den == 0] = np.nan                  # 避免除以 0

common_counts = mask.T @ mask              # int 矩阵


cosine[common_counts < 3] = np.nan


sim01 = (1 + cosine) / 2


S = pd.DataFrame(sim01, index=movie_ids, columns=movie_ids)



In [16]:
print(S.shape)

(3706, 3706)


In [31]:
def keep_top_k(row, k=30):
  #sort the non-NA similarity measures and keep the top 30, setting the rest to NA
  topk = row.nlargest(k)
  new_row = row.where(row.isin(topk), np.nan)
  return new_row

S_new = S.apply(keep_top_k, axis=1)
S_new.head()

# f_movies: ['Movie ID', 'Title', 'Genre']
target_titles = [
    "Toy Story (1995)",
    "GoldenEye (1995)",
    "Liar Liar (1997)",
    "Lost World: Jurassic Park, The (1997)",
    "Sixth Sense, The (1999)",
]

sub = f_movies[f_movies['Title'].isin(target_titles)][['Movie ID', 'Title']]
ids = sub['Movie ID'].tolist()
S_sub = S_new.loc[ids, ids]
id_to_title = dict(zip(sub['Movie ID'], sub['Title']))
S_sub_named = S_sub.rename(index=id_to_title, columns=id_to_title)
print(S_sub_named)






Movie ID                               Toy Story (1995)  GoldenEye (1995)  \
Movie ID                                                                    
Toy Story (1995)                                    NaN               NaN   
GoldenEye (1995)                                    NaN               NaN   
Liar Liar (1997)                                    NaN               NaN   
Lost World: Jurassic Park, The (1997)               NaN               NaN   
Sixth Sense, The (1999)                             NaN               NaN   

Movie ID                               Liar Liar (1997)  \
Movie ID                                                  
Toy Story (1995)                                    NaN   
GoldenEye (1995)                                    NaN   
Liar Liar (1997)                                    NaN   
Lost World: Jurassic Park, The (1997)               NaN   
Sixth Sense, The (1999)                             NaN   

Movie ID                               Lost Wo

In [32]:
import numpy as np
import pandas as pd

def ibcf_recommend(newuser, S_sim, popularity_df, f_movies, top_k=10):



    rated_mask = newuser.notna()
    rated_items = newuser.index[rated_mask]
    candidate_items = newuser.index[~rated_mask]


    predictions = {}

    for i in candidate_items:

        if i not in S_sim.index:
            continue

        sim_row = S_sim.loc[i, :]


        neighbor_mask = rated_mask & sim_row.notna()
        if neighbor_mask.sum() == 0:
            continue

        sims = sim_row[neighbor_mask]          # s_ij
        ratings = newuser[neighbor_mask]       # r_aj

        num = (sims * ratings).sum()           # Σ s_ij * r_aj
        den = sims.sum()                       # Σ s_ij  （原文公式用 sum，不取 abs）

        if den == 0:
            continue

        r_hat = num / den
        predictions[i] = r_hat


    pred_series = pd.Series(predictions)
    pred_series = pred_series.sort_values(ascending=False)


    top_pred = pred_series.head(top_k)
    num_pred = len(top_pred)


    pop_sorted = popularity_df.sort_values('weighted_score', ascending=False)
    popular_ids = pop_sorted['Movie ID']


    already_used = set(top_pred.index) | set(rated_items)


    fallback_ids = [mid for mid in popular_ids if mid not in already_used]


    if num_pred >= top_k:
        final_ids_pred = list(top_pred.index)
        fallback_ids = []
    else:
        num_needed = top_k - num_pred
        fallback_ids = fallback_ids[:num_needed]
        final_ids_pred = list(top_pred.index) + fallback_ids


    pred_df = top_pred.reset_index()
    pred_df.columns = ['Movie ID', 'predicted_rating']

    if fallback_ids:
        fallback_df = pd.DataFrame({'Movie ID': fallback_ids})
        fallback_df['predicted_rating'] = np.nan
        recs_df = pd.concat([pred_df, fallback_df], ignore_index=True)
    else:
        recs_df = pred_df.copy()


    recs_df = recs_df.merge(
        popularity_df[['Movie ID', 'weighted_score']],
        on='Movie ID',
        how='left'
    )


    recs_df = recs_df.merge(
        f_movies[['Movie ID', 'Title']],
        on='Movie ID',
        how='left'
    )


    recs_df = recs_df.head(top_k)
    recs_df = recs_df.sort_values(
        by=['predicted_rating', 'weighted_score'],
        ascending=[False, False],
        na_position='last'
    ).reset_index(drop=True)

    return recs_df


In [44]:

user_row_1500 = R.iloc[1499]

popularity_df = movie_stats.reset_index()[['Movie ID', 'weighted_score']]


recs_user_1500 = ibcf_recommend(
    newuser=user_row_1500,
    S_sim=S_new,
    popularity_df=popularity_df,
    f_movies=f_movies,
    top_k=10
)

print("Top-10 recommendations for user in row 1500:")
print(recs_user_1500)


hypo_user = pd.Series(index=R.columns, dtype=float)
hypo_user[:] = np.nan


hypo_user['260'] = 5.0
hypo_user['780']       = 4.0
recs_hypo = ibcf_recommend(
    newuser=hypo_user,
    S_sim=S_new,
    popularity_df=popularity_df,
    f_movies=f_movies,
    top_k=10
)

print("Top-10 recommendations for the hypothetical user:")
print(recs_hypo)



Top-10 recommendations for user in row 1500:
   Movie ID predicted_rating  weighted_score  \
0      2019              NaN        4.511636   
1       858              NaN        4.511167   
2       527              NaN        4.497301   
3       745              NaN        4.475640   
4      1148              NaN        4.474526   
5      1198              NaN        4.466114   
6       904              NaN        4.448930   
7       260              NaN        4.444177   
8       922              NaN        4.431792   
9       750              NaN        4.429423   

                                               Title  
0  Seven Samurai (The Magnificent Seven) (Shichin...  
1                              Godfather, The (1972)  
2                            Schindler's List (1993)  
3                              Close Shave, A (1995)  
4                         Wrong Trousers, The (1993)  
5                     Raiders of the Lost Ark (1981)  
6                                 Rear Wi