# 유저의 별점 예측하기 (Rating_prediction)

- surprise를 활용한 유저기반의 별점 예측 추천시스템

```
uv add scikit-surprise
uv pip uninstall numpy
uv pip install "numpy<2"
```

- https://visualstudio.microsoft.com/ko/visual-cpp-build-tools/
- 데스크탑 버전
- python ver 3.11 로 변경해야함 (python-version, pyproject.toml 파일)
- .venv 지우고, uv sync
- uv add scikit-surprise > uv pip uninstall numpy > uv pip install "numpy<2"

In [3]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import Reader
from surprise.dataset import DatasetAutoFolds
import pandas as pd

In [4]:
ratings = pd.read_csv("data/ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
ratings.to_csv("data/ratings_nohead.csv", index=False, header=False)

In [6]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100004.0,100004.0,100004.0,100004.0
mean,347.01131,12548.664363,3.543608,1129639000.0
std,195.163838,26369.198969,1.058064,191685800.0
min,1.0,1.0,0.5,789652000.0
25%,182.0,1028.0,3.0,965847800.0
50%,367.0,2406.5,4.0,1110422000.0
75%,520.0,5418.0,4.0,1296192000.0
max,671.0,163949.0,5.0,1476641000.0


In [7]:
ratings["rating"].unique()

array([2.5, 3. , 2. , 4. , 3.5, 1. , 5. , 4.5, 1.5, 0.5])

In [8]:
reader = Reader(line_format="user item rating timestamp", sep=",", rating_scale=(0.5, 5))
data_folds = DatasetAutoFolds(ratings_file="data/ratings_nohead.csv", reader=reader)
trainset = data_folds.build_full_trainset()     # 100프로 trainset으로 (testset 만들지않고)

model = SVD(n_factors=50, random_state=42)      # n_factors : 차원 개수 -> 50차원의 잠재요인 공간에 학습시킴
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x207f6b7b9d0>

In [9]:
y_pred = model.predict("9", "3")    # 사용자 "9"가 아이템 "3"에 대해 어떤 평점을 줄지
y_pred

# est : 예측값 (이 사람이 영화를 본다면 3.09점의 평점을 내릴것 같다)

Prediction(uid='9', iid='3', r_ui=None, est=3.092324541202098, details={'was_impossible': False})

In [10]:
movies = pd.read_csv("data/movies.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [11]:
ratings[ratings["userId"] == 9]["movieId"]  # 9번 유저 데이터에서 movieId 만 가져옴 -> 본 영화를 가져옴
movies["movieId"]

# 안 본 영화를 찾으려면 : 전체 영화 - 내가 본 것 (집합)
set(movies["movieId"]) - set(ratings[ratings["userId"] == 9]["movieId"])

{2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 27,
 28,
 29,
 30,
 31,
 32,
 32797,
 34,
 35,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 48,
 49,
 50,
 65585,
 52,
 53,
 54,
 55,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 65601,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 32840,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 32853,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 65638,
 107,
 108,
 65642,
 110,
 111,
 112,
 113,
 114,
 32882,
 116,
 117,
 118,
 119,
 121,
 122,
 123,
 124,
 125,
 126,
 32892,
 129,
 130,
 131,
 132,
 32898,
 65665,
 135,
 137,
 98441,
 140,
 141,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 32943,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 

In [12]:
# 특정유저(userID)가 안본 영화 찾기
def get_unseen_surprise(userId) :
    return list(set(movies["movieId"]) - set(ratings[ratings["userId"] == userId]["movieId"]))

In [13]:
unseen_movies = get_unseen_surprise(33)

In [14]:
movies[movies["movieId"] == int(7)]["title"]

6    Sabrina (1995)
Name: title, dtype: object

In [15]:
pred_result = []

for movieId in unseen_movies :
    result = model.predict("9", str(movieId))
    data = {}                   # {"id":1, "rating":3.5, "title":"Avatar"}
    data["id"] = result.iid     # iid : 내가 평점을 매길 영화 id
    data["rating"] = result.est # est : 평점 예측값
    data["title"] = movies[movies["movieId"] == int(result.iid)]["title"].iloc[0]
    pred_result.append(data)

result_df = pd.DataFrame(pred_result)
result_df.sort_values("rating", ascending=False)[0:5]

Unnamed: 0,id,rating,title
851,969,4.498787,"African Queen, The (1951)"
755,858,4.453779,"Godfather, The (1972)"
797,913,4.450571,"Maltese Falcon, The (1941)"
1063,1221,4.434328,"Godfather: Part II, The (1974)"
2956,3462,4.390097,Modern Times (1936)
