In [1]:
from google.colab import files
import pandas as pd
# Open Colab's file-upload widget and keep whatever it returns in `uploaded`.
# The recorded run shows u.data, u.item and u.user being saved under those
# names, which is how the loading cell refers to them.
uploaded = files.upload()

Saving u.data to u.data
Saving u.item to u.item
Saving u.user to u.user


In [2]:
# Ratings: tab-separated; the column names are supplied here.
ratings = pd.read_csv(
    'u.data',
    sep='\t',
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)

# Movies: pipe-separated, latin-1 encoded, only the first two columns are kept.
# No names are passed, so the frame keeps the integer column labels 0 and 1.
movies = pd.read_csv(
    'u.item',
    sep='|',
    encoding='latin-1',
    header=None,
    usecols=[0, 1],
)

# Users: pipe-separated; the column names are supplied here.
users = pd.read_csv(
    'u.user',
    sep='|',
    names=['user_id', 'age', 'gender', 'occupation', 'zip_code'],
)

# Quick load check: (rows, columns) of each table.
print("Ratings:", ratings.shape)
print("Movies:", movies.shape)
print("Users:", users.shape)

Ratings: (100000, 4)
Movies: (1682, 2)
Users: (943, 5)


In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# 80% train, 20% test; the fixed random_state makes the split repeatable.
train_data, test_data = train_test_split(
    ratings,
    test_size=0.2,
    random_state=42,
)

# Build the user-item matrix for training: one row per user_id, one column per
# item_id, with 0 filled in wherever a (user, item) pair has no training rating.
train_matrix = (
    train_data
    .pivot(index='user_id', columns='item_id', values='rating')
    .fillna(0)
)

print("Train matrix shape:", train_matrix.shape)

Train matrix shape: (943, 1653)


In [4]:
# Transpose the training matrix so that each row is one item's rating vector.
item_user_matrix = train_matrix.T

# Pairwise cosine similarity between the item rows (square array).
item_similarity = cosine_similarity(item_user_matrix)

# Label both axes with the item ids (the rows of the transposed matrix are
# exactly the columns of train_matrix).
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=item_user_matrix.index,
    columns=item_user_matrix.index,
)

print("Item-Item similarity matrix shape:", item_similarity_df.shape)
item_similarity_df.head()


Item-Item similarity matrix shape: (1653, 1653)


item_id,1,2,3,4,5,6,7,8,9,10,...,1668,1670,1671,1672,1673,1676,1678,1679,1680,1681
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.311393,0.25369,0.350312,0.214229,0.081377,0.520601,0.376806,0.395321,0.208464,...,0.0,0.0,0.0,0.0,0.040426,0.0,0.0,0.0,0.0,0.0
2,0.311393,1.0,0.216764,0.383544,0.304612,0.0,0.297186,0.308952,0.222845,0.121722,...,0.0,0.0,0.0,0.062446,0.0,0.0,0.0,0.0,0.0,0.088312
3,0.25369,0.216764,1.0,0.261066,0.14157,0.058716,0.271301,0.167238,0.224092,0.120622,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.350312,0.383544,0.261066,1.0,0.270286,0.061803,0.366117,0.370546,0.265128,0.190169,...,0.0,0.0,0.0,0.043872,0.0,0.103406,0.0,0.0,0.0,0.062044
5,0.214229,0.304612,0.14157,0.270286,1.0,0.017833,0.246926,0.196619,0.199997,0.036237,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
import numpy as np
# Fallback statistics for the predictor, computed from the training split only.

# Mean of every training rating.
global_mean = train_data['rating'].mean()

# Mean training rating per item and per user, as Series ...
item_mean_series = train_data.groupby('item_id')['rating'].mean()
user_mean_series = train_data.groupby('user_id')['rating'].mean()

# ... and as plain {id: mean} dicts for cheap lookups with a default.
item_mean = item_mean_series.to_dict()
user_mean = user_mean_series.to_dict()

def predict_rating_safe(user_id, item_id, user_item_matrix, item_similarity_df, k=50):
    """
    Predict one user's rating for one item with item-item collaborative filtering.

    The prediction is the similarity-weighted average of the user's own ratings
    on the (at most) ``k`` rated items most similar to ``item_id``, keeping only
    neighbors whose similarity is strictly positive. The target item is never
    used as its own neighbor: if the user already has a rating for ``item_id``,
    that rating (self-similarity 1.0) would otherwise leak into the prediction.

    Fallback value ``item_mean.get(item_id, global_mean)`` (both are
    module-level names) is returned when:
      - ``item_id`` is not a column of ``item_similarity_df``;
      - ``user_id`` is not in the index of ``user_item_matrix``;
      - the user has no rated item other than ``item_id`` itself;
      - none of the selected neighbors has positive similarity.

    Parameters
    ----------
    user_id, item_id
        Labels looked up in ``user_item_matrix.index`` and
        ``item_similarity_df.columns`` respectively.
    user_item_matrix : pandas.DataFrame
        Users as rows, items as columns; a value > 0 counts as "rated".
    item_similarity_df : pandas.DataFrame
        Square item-by-item similarity table with the same labels on both axes.
    k : int, default 50
        Maximum number of most-similar rated items to use.

    Returns
    -------
    float
        The weighted-average prediction, or the fallback value.
    """
    def _fallback():
        # Resolved lazily so the globals are only touched when actually needed.
        return item_mean.get(item_id, global_mean)

    if item_id not in item_similarity_df.columns:
        return _fallback()
    if user_id not in user_item_matrix.index:
        return _fallback()

    user_ratings = user_item_matrix.loc[user_id]
    rated_items = user_ratings[user_ratings > 0].index.intersection(item_similarity_df.columns)
    # An item must not vote for itself (see docstring).
    rated_items = rated_items[rated_items != item_id]
    if len(rated_items) == 0:
        return _fallback()

    # Label-based lookup of the similarities, then an explicitly positional
    # top-k cut (the index holds integer item ids, so be unambiguous).
    neighbor_sims = item_similarity_df[item_id].loc[rated_items]
    neighbor_sims = neighbor_sims.sort_values(ascending=False).iloc[:k]

    # Only strictly positive weights are kept, so a non-empty selection always
    # has a positive sum and the division below is safe.
    neighbor_sims = neighbor_sims[neighbor_sims > 0]
    if neighbor_sims.empty:
        return _fallback()

    weighted_sum = np.dot(neighbor_sims, user_ratings[neighbor_sims.index])
    return weighted_sum / neighbor_sims.sum()

In [6]:
def recommend_top_n_safe(user_id, user_item_matrix, item_similarity_df, n=10, k=50):
    """
    Return up to ``n`` ``(item_id, score)`` pairs recommended for ``user_id``.

    Known user (``user_id`` is in ``user_item_matrix.index``): every item whose
    entry in the user's row is 0 and that is also a column of
    ``item_similarity_df`` is scored with ``predict_rating_safe`` (passing
    ``k`` through); the ``n`` highest-scoring pairs are returned, best first.

    Unknown user: the ``n`` items with the most ratings in the module-level
    ``train_data`` are returned, each paired with
    ``item_mean.get(item, global_mean)``.
    """
    if user_id in user_item_matrix.index:
        row = user_item_matrix.loc[user_id]
        candidates = row[row == 0].index.intersection(item_similarity_df.columns)
        scored = {
            candidate: predict_rating_safe(
                user_id, candidate, user_item_matrix, item_similarity_df, k=k
            )
            for candidate in candidates
        }
        ranked = sorted(scored.items(), key=lambda pair: pair[1], reverse=True)
        return ranked[:n]

    # Cold start: rank items by how many training ratings they received.
    rating_counts = train_data.groupby('item_id')['rating'].count()
    most_rated = rating_counts.sort_values(ascending=False).index.tolist()
    return [(it, item_mean.get(it, global_mean)) for it in most_rated[:n]]

# Smoke-test the recommender on user 1 (k is left at the function's default).
print(
    "Top 10 for user 1:",
    recommend_top_n_safe(
        user_id=1,
        user_item_matrix=train_matrix,
        item_similarity_df=item_similarity_df,
        n=10,
    ),
)

Top 10 for user 1: [(1671, np.float64(4.535520418144707)), (753, np.float64(4.457303189164939)), (847, np.float64(4.453751144876662)), (1104, np.float64(4.430904802592626)), (311, np.float64(4.428519368403658)), (709, np.float64(4.4221826440984415)), (813, np.float64(4.413596429262568)), (733, np.float64(4.401212117266408)), (32, np.float64(4.397824108750156)), (896, np.float64(4.396211322155052))]


In [7]:
# Score every held-out rating: pair the true rating with the prediction made
# from the training matrix and the item-item similarities.
predictions = [
    (
        row.rating,
        predict_rating_safe(row.user_id, row.item_id, train_matrix, item_similarity_df, k=50),
    )
    for row in test_data.itertuples()
]
predictions[:10]


[(4, np.float64(3.7538483082070506)),
 (3, np.float64(4.222098948634675)),
 (4, np.float64(3.4449453194198236)),
 (2, np.float64(3.8393822431755504)),
 (2, np.float64(3.669029562523533)),
 (3, np.float64(3.9223984447266105)),
 (5, np.float64(3.7098052882825785)),
 (4, np.float64(3.8015543273590486)),
 (3, np.float64(3.0605767150832)),
 (4, np.float64(3.4115240090213055))]

In [8]:
# Unzip the (true, predicted) pairs into two parallel lists.
true_ratings = [true for true, _ in predictions]
pred_ratings = [pred for _, pred in predictions]

# Root-mean-squared error over the held-out ratings.
rmse = np.sqrt(
    mean_squared_error(true_ratings, pred_ratings)
)
print("RMSE:", rmse)

RMSE: 0.9864781986501955
