In [43]:
import pandas as pd
import numpy as np

# Load the ratings table from a tab-separated file into notebook-level state.
data = pd.read_csv('u.data', sep='\t')

In [44]:
# Preview the first rows of the loaded ratings table.
data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [45]:
from sklearn.model_selection import train_test_split

# Build the train and test sets with a separate random split for every user.
MIN_RATINGS_PER_USER = 5   # users with fewer ratings are skipped entirely
TEST_FRACTION = 0.2        # share of each user's ratings held out for testing
SPLIT_SEED = 42            # fixed seed so the split is reproducible

train_data = []
test_data = []

# A single groupby pass replaces re-filtering the whole frame once per user.
# sort=False keeps users in order of first appearance, matching .unique().
for user, user_ratings in data.groupby('user_id', sort=False):
    if len(user_ratings) < MIN_RATINGS_PER_USER:
        continue  # too little data for this user — skip

    train_u, test_u = train_test_split(
        user_ratings, test_size=TEST_FRACTION, random_state=SPLIT_SEED
    )
    train_data.append(train_u)
    test_data.append(test_u)

train = pd.concat(train_data)
test = pd.concat(test_data)


In [23]:
# Load the movie metadata from a pipe-separated file and preview it.
# NOTE(review): a later cell indexes this frame with the labels 'movie id ' and
# ' movie title ', so the header apparently carries padding spaces — confirm
# against the file before renaming or stripping columns.
movie = pd.read_csv('u.item.csv', sep='|')
movie.head()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,Unnamed: 24
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,


In [46]:
# User x item rating matrix built from the training split (duplicate
# user/item pairs, if any, are averaged by pivot_table's default aggregation).
# Missing cells (NaN) are movies the user has not rated.
user_item = train.pivot_table(values='rating', index='user_id', columns='item_id')
print(user_item)

item_id  1     2     3     4     5     6     7     8     9     10    ...  \
user_id                                                              ...   
1         5.0   NaN   4.0   NaN   3.0   5.0   4.0   1.0   5.0   3.0  ...   
2         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   2.0  ...   
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5         4.0   3.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
...       ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
939       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   5.0   NaN  ...   
940       NaN   NaN   NaN   2.0   NaN   NaN   4.0   5.0   3.0   NaN  ...   
941       5.0   NaN   NaN   NaN   NaN   NaN   4.0   NaN   NaN   NaN  ...   
942       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
943       NaN   5.0   NaN   NaN   NaN   NaN   NaN   NaN   3.0   NaN  ...   

item_id  16

In [47]:
from sklearn.metrics.pairwise import cosine_similarity

# Compare the target user with all other users.
# Unrated cells (NaN) are replaced by 0 so the rows can be used as dense vectors;
# note this makes "not rated" count as a 0 rating in the similarity.
user_item_filled = user_item.fillna(0)

# Compute the cosine similarity between every pair of users.
user_similarity = cosine_similarity(user_item_filled)

# Wrap the raw array in a DataFrame labelled by user_id on both axes so that
# similarities can be looked up by user label.
user_similarity_df = pd.DataFrame(user_similarity, index=user_item.index, columns=user_item.index)

# Inspect the similarity matrix.
print(user_similarity_df.head())

user_id       1         2         3         4         5         6         7    \
user_id                                                                         
1        1.000000  0.152349  0.048171  0.022019  0.299385  0.364007  0.322829   
2        0.152349  1.000000  0.087032  0.047745  0.049858  0.179185  0.050813   
3        0.048171  0.087032  1.000000  0.350752  0.026884  0.050030  0.048190   
4        0.022019  0.047745  0.350752  1.000000  0.012212  0.054798  0.054284   
5        0.299385  0.049858  0.026884  0.012212  1.000000  0.190959  0.270403   

user_id       8         9         10   ...       934       935       936  \
user_id                                ...                                 
1        0.248686  0.098266  0.297693  ...  0.275895  0.099541  0.254412   
2        0.019962  0.164051  0.117394  ...  0.140517  0.256546  0.259360   
3        0.072764  0.000000  0.055858  ...  0.032070  0.000000  0.113153   
4        0.128065  0.000000  0.035240  ...  0.036996

In [48]:
def recommend_movie(target_user, k=5, n_recommendations=5, similarity=None, ratings=None):
    """Recommend unseen items to ``target_user`` from its most similar users.

    Each of the ``k`` most similar users contributes ``rating * similarity``
    for every item it rated, and the contributions are summed per item. The
    score is an unnormalised weighted sum, not a predicted rating: it can
    exceed the rating scale and grows with the number of neighbours that
    rated the item.

    Parameters
    ----------
    target_user
        Label of the user to recommend for; must be present in both matrices.
    k : int
        Number of most similar users to take into account.
    n_recommendations : int
        Maximum number of items to return.
    similarity : pandas.DataFrame, optional
        User-by-user similarity matrix labelled by user on both axes.
        Defaults to the notebook-level ``user_similarity_df``.
    ratings : pandas.DataFrame, optional
        User-by-item rating matrix with NaN for unrated items.
        Defaults to the notebook-level ``user_item``.

    Returns
    -------
    pandas.Series
        Scores indexed by item, sorted in descending order, holding at most
        ``n_recommendations`` entries. Items already rated by
        ``target_user`` are excluded.

    Raises
    ------
    KeyError
        If ``target_user`` is missing from ``similarity`` or ``ratings``.
    """
    # Fall back to the notebook globals only when no explicit matrices are
    # passed, so existing calls keep working while tests can inject data.
    if similarity is None:
        similarity = user_similarity_df
    if ratings is None:
        ratings = user_item

    # Top-k most similar users, excluding the target user itself.
    neighbours = (
        similarity[target_user]
        .drop(index=target_user)
        .sort_values(ascending=False)
        .head(k)
    )

    weighted_scores = pd.Series(dtype=float)
    for neighbour, weight in neighbours.items():
        # Keep only the items this neighbour actually rated.
        neighbour_ratings = ratings.loc[neighbour].dropna()
        # The more similar the neighbour, the more its ratings count.
        # fill_value=0 lets items seen by only some neighbours accumulate.
        weighted_scores = weighted_scores.add(neighbour_ratings * weight, fill_value=0)

    # Remove items the target user has already rated.
    already_seen = ratings.loc[target_user].dropna().index
    weighted_scores = weighted_scores.drop(index=already_seen, errors='ignore')

    # Top-N recommendations.
    return weighted_scores.sort_values(ascending=False).head(n_recommendations)
        

In [49]:
# Example: top-5 recommendations for user 1 based on its 5 most similar users.
# The result is kept in `recommendations` for the title lookup in a later cell.
recommendations = recommend_movie(target_user=1, k=5, n_recommendations=5)
print("Recommended movie IDs:\n", recommendations)


Recommended movie IDs:
 item_id
238    9.849354
83     9.013783
134    8.598976
582    8.143747
479    8.121464
dtype: float64


In [50]:
# Look up the titles of the recommended movies.
# The column labels below include padding spaces; they must match the header
# of the loaded movie table exactly.
movie_titles = movie.set_index('movie id ')
print("\nMovie Titles:")
print(movie_titles.loc[recommendations.index][' movie title '])


Movie Titles:
item_id
238           Raising Arizona (1987)
83     Much Ado About Nothing (1993)
134              Citizen Kane (1941)
582                Piano, The (1993)
479                   Vertigo (1958)
Name:  movie title , dtype: object


In [51]:
def precision_at_k(target_user, k=5, n_recommendations=5):
    """Share of recommended items that appear in the user's held-out ratings.

    ``k`` is the number of similar users passed on to ``recommend_movie``;
    ``n_recommendations`` is the length of the recommendation list being
    scored. Any item the user rated in ``test`` counts as a hit, whatever the
    rating value. Returns ``np.nan`` when no recommendations are produced.
    """
    recs = recommend_movie(target_user, k=k, n_recommendations=n_recommendations)
    if recs.empty:
        return np.nan

    # Items this user actually rated in the test split.
    is_target = test['user_id'] == target_user
    held_out = set(test.loc[is_target, 'item_id'].values)

    # Overlap between what was recommended and what was held out.
    suggested = recs.index.values
    hits = held_out.intersection(suggested)

    return len(hits) / len(suggested)


In [52]:
# Average precision of the top-N recommendation list over all test users.
N_RECOMMENDATIONS = 5  # length of the recommendation list being scored

precisions = []
skipped_users = []
users = test['user_id'].unique()

for user in users:
    try:
        p = precision_at_k(user, k=5, n_recommendations=N_RECOMMENDATIONS)
    except KeyError:
        # The user is missing from the training matrices (e.g. all of its
        # ratings ended up in test); record it instead of dropping it silently.
        skipped_users.append(user)
        continue
    if not np.isnan(p):
        precisions.append(p)

if skipped_users:
    print(f"Skipped {len(skipped_users)} user(s) missing from the training data")

# Guard against an empty list so np.mean does not emit a warning.
mean_precision = np.mean(precisions) if precisions else np.nan
# The label names the cutoff so it matches what is actually measured.
print(f"Средний Precision@{N_RECOMMENDATIONS}: {mean_precision:.3f}")


Средний Precision@5: 0.312
