# Predict by KNN

- 게임 정보를 기반으로 게임 간 최근접 이웃 탐색(코사인 거리로 유사도 측정)
- 유저가 추천한 게임과 거리가 가장 가까운 게임을 추천

## 알고리즘 순서
- 데이터셋 로드
- Scaling
- 테스트 데이터 분리
- KNN 학습 및 예측
- Evaluate


## 데이터셋 로드

games와 recommend 데이터셋을 결합하여, 유저가 해당 게임을 추천했는지 여부(is_recommended)로 유저가 선호하는 게임을 구분한다.

In [84]:

import numpy as np
import pandas as pd

# Read the preprocessed game table and discard the leftover CSV index column.
data = pd.read_csv('../datasets/preprocessed_games.csv').drop(columns=['Unnamed: 0'])

# Convert the date_release column to integer Unix timestamps so it can be
# used as a numeric feature.
release_dates = pd.to_datetime(data['date_release'])
data['date_release'] = release_dates.map(lambda ts: int(ts.timestamp()))

# Pull the identifier columns out of the feature matrix, but keep them around
# (ids / titles / id_title) so results can be mapped back to games later.
ids = data.pop('app_id')
titles = data.pop('title')
id_title = pd.concat([ids, titles], axis=1)
print(id_title.head())

# Quick look at the remaining feature matrix.
for summary in (data.shape, data.head(), data.columns, data.dtypes, data.describe()):
    print(summary)


   app_id                              title
0   13500  Prince of Persia: Warrior Within™
1   22364            BRINK: Agents of Change
2  113020       Monaco: What's Yours Is Mine
3  226560                 Escape Dead Island
4  249050            Dungeon of the ENDLESS™
(50872, 11)
   date_release  win  mac  linux  rating  positive_ratio  user_reviews  \
0    1227225600    1    0      0       7              84          2199   
1    1312329600    1    0      0       5              85            21   
2    1366761600    1    1      1       7              92          3722   
3    1416268800    1    0      0       4              61           873   
4    1414368000    1    1      0       7              88          8784   

   price_final  price_original  discount  steam_deck  
0         9.99            9.99       0.0           1  
1         2.99            2.99       0.0           1  
2        14.99           14.99       0.0           1  
3        14.99           14.99       0.0           1 

## Scaling

In [32]:
from sklearn.preprocessing import StandardScaler


# Work on a copy so the unscaled feature frame stays available.
scaled_data = data.copy()

# Columns listed here are left untouched; every other column is standardised.
no_scale_cols = ['app_id', 'title','win','mac','linux','steam_deck']
scale_cols = [name for name in scaled_data.columns if name not in no_scale_cols]

# Fit the scaler on the selected columns, then overwrite them with the
# standardised values.
scaler = StandardScaler()
scaler.fit(scaled_data[scale_cols])
scaled_data[scale_cols] = scaler.transform(scaled_data[scale_cols])

print(scaled_data.head())

   date_release  win  mac  linux    rating  positive_ratio  user_reviews  \
0     -3.345559    1    0      0  1.180078        0.380639      0.009347   
1     -2.470099    1    0      0 -0.283818        0.435424     -0.045003   
2     -1.910159    1    1      1  1.180078        0.818914      0.047353   
3     -1.400881    1    0      0 -1.015766       -0.879399     -0.023742   
4     -1.420435    1    1      0  1.180078        0.599776      0.173672   

   price_final  price_original  discount  steam_deck  
0     0.118957        0.109779 -0.300552           1  
1    -0.488996       -0.498552 -0.300552           1  
2     0.553209        0.544300 -0.300552           1  
3     0.553209        0.544300 -0.300552           1  
4     0.292658        0.283587 -0.300552           1  


## 테스트 데이터 분리

In [33]:
from sklearn.model_selection import train_test_split
# Split the data into train and test sets
# train_x, test_x = train_test_split(scaled_data)

## KNN 학습

 최종적으로는 KNN 결과(distances, indices)를 pkl 파일로 저장하여 이후 실행에서는 학습 절차를 건너뛴다.
 
 distances = 각 게임과 k개 이웃 사이의 거리, indices = k개 이웃의 인덱스

In [93]:
from sklearn.neighbors import NearestNeighbors
import joblib

# Brute-force nearest-neighbour index over the scaled game features, using
# cosine distance. The model's default k is 5.
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=5).fit(scaled_data)

# Query every game against the index, overriding k to 3 for this look-up.
# NOTE(review): the query set equals the fitted set, so the first neighbour is
# presumably the game itself (distance ~0) - confirm before relying on it.
distances, indices = knn.kneighbors(scaled_data, n_neighbors=3)

In [99]:
import os

# Pair each game's app_id with the distances to its neighbours (one column per
# neighbour rank).
save_distances = pd.concat([pd.DataFrame(ids), pd.DataFrame(distances)], axis=1)

# kneighbors returns row positions into the fitted data; translate them into
# app_ids with a single vectorised positional look-up. This replaces the
# deprecated DataFrame.applymap and does not depend on `ids` carrying a
# default 0..n-1 index the way the label-based `ids[x]` look-up did.
indices_mapping = pd.DataFrame(ids.to_numpy()[indices])
save_indices = pd.concat([pd.DataFrame(ids), indices_mapping], axis=1)
print(save_indices.head())

# Make sure the output directory exists so the dumps below cannot fail on a
# fresh checkout.
os.makedirs('./models', exist_ok=True)

# Persist the fitted model and both neighbour tables so later runs can skip
# the KNN step.
joblib.dump(knn, './models/knn_model.pkl')
joblib.dump(save_distances, './models/knn_distances.pkl')
joblib.dump(save_indices, './models/knn_indices.pkl')

   app_id       0       1       2
0   13500   13500   13600   12320
1   22364   22364   44650   42964
2  113020  113020  231160  210770
3  226560  226560  315151  303470
4  249050  249050  313160  320430


['./models/knn_indices.pkl']

## Recommendation 데이터셋 기반 게임 추천

In [100]:
# Read the preprocessed user review / recommendation table.
recommendations = pd.read_csv('../datasets/pre_recommendations.csv')

# Inspect the first rows, size, column names, dtypes and summary statistics.
for summary in (
    recommendations.head(),
    recommendations.shape,
    recommendations.columns,
    recommendations.dtypes,
    recommendations.describe(),
):
    print(summary)

    app_id  helpful  funny        date  is_recommended  hours  user_id  \
0   975370        0      0  2022-12-12               1   36.3    49618   
1   304390        4      0  2017-02-17               0   11.5     2482   
2  1085660        2      0  2019-11-17               1  336.5   243365   
3   703080        0      0  2022-09-23               1   27.4   248653   
4   526870        0      0  2021-01-10               1    7.9    22898   

   review_id  
0          0  
1          1  
2          2  
3          3  
4          4  
(38347614, 8)
Index(['app_id', 'helpful', 'funny', 'date', 'is_recommended', 'hours',
       'user_id', 'review_id'],
      dtype='object')
app_id              int64
helpful             int64
funny               int64
date               object
is_recommended      int64
hours             float64
user_id             int64
review_id           int64
dtype: object
             app_id       helpful         funny  is_recommended         hours  \
count  3.834761e+07  3

In [102]:
# Load the precomputed neighbour tables instead of re-running KNN.
# NOTE(review): joblib.load unpickles arbitrary objects - only load files that
# were produced by this notebook.
loaded_distances = pd.DataFrame(joblib.load('./models/knn_distances.pkl'))
loaded_indices = pd.DataFrame(joblib.load('./models/knn_indices.pkl'))

print(loaded_distances.head())
print(loaded_indices.head())

# Recommend games for user 10.
target_user_id = 10
# Neighbour columns used as candidates; column 0 is skipped, as in the
# original selection.
# NOTE(review): column 0 appears to be the query game itself (distance ~0) -
# confirm against the saved tables.
neighbor_cols = [1, 2]

# Games this user has recommended (liked).
print("liked games")
rec_list = recommendations.loc[
    (recommendations['user_id'] == target_user_id)
    & (recommendations['is_recommended'] == 1)
]
print(rec_list)

# Look up the neighbours of every liked game. Both tables are joined against
# the same de-duplicated list of liked app_ids, and neither result is
# de-duplicated on its own, so the flattened ids and distances stay paired
# position by position. (De-duplicating the two tables independently could
# drop different rows from each and misalign or length-mismatch the pairs.)
liked_ids = rec_list[['app_id']].drop_duplicates()

candidate_ids = pd.merge(liked_ids, loaded_indices, on='app_id')[neighbor_cols].to_numpy().ravel()
merged_idices = candidate_ids.tolist()
print(merged_idices)

candidate_distances = pd.merge(liked_ids, loaded_distances, on='app_id')[neighbor_cols].to_numpy().ravel()
merged_distances = candidate_distances.tolist()
print(merged_distances)

# Sort by distance, keep only the closest entry for each candidate game, and
# drop games the user has already recommended.
sorted_list = (
    pd.DataFrame({'distance': candidate_distances, 'app_id': candidate_ids})
    .sort_values(by='distance', ascending=True)
    .drop_duplicates(subset='app_id', keep='first')
)
sorted_list = sorted_list[~sorted_list['app_id'].isin(liked_ids['app_id'])]
print(sorted_list)

# Attach titles. The merge follows the order of the left frame (id_title), so
# the result is re-sorted by distance to keep the closest games first.
recommend_result = (
    pd.merge(id_title, sorted_list, on="app_id", how="inner")
    .sort_values(by='distance', ascending=True)
    .reset_index(drop=True)
)
print(recommend_result)

   app_id             0         1         2
0   13500  0.000000e+00  0.000106  0.000128
1   22364  2.220446e-16  0.000144  0.001240
2  113020  0.000000e+00  0.000214  0.000361
3  226560  2.220446e-16  0.000518  0.000962
4  249050  0.000000e+00  0.002976  0.004046
   app_id       0       1       2
0   13500   13500   13600   12320
1   22364   22364   44650   42964
2  113020  113020  231160  210770
3  226560  226560  315151  303470
4  249050  249050  313160  320430
liked games
          app_id  helpful  funny        date  is_recommended  hours  user_id  \
22580757  289070        0      0  2021-03-29               1   80.3       10   
23134026  644930        0      0  2021-01-09               1  280.9       10   
25761899  323190        0      0  2022-01-06               1  135.2       10   

          review_id  
22580757   22580757  
23134026   23134026  
25761899   25761899  
[322330, 367520, 994280, 637650, 613100, 632470]
[0.014027212077069584, 0.015881691287017552, 0.001389087764586

## Evaluate

- recommendation에서 유저의 n%를 랜덤으로 제외
- 나머지는 학습에 사용
- 제외된 유저에서 n%의 데이터로 추천을 받는다.
- 추천받은 데이터와 실제 데이터 간의 자카드 유사도 $J(A, B) = \frac{|A \cap B|}{|A \cup B|}$ 를 계산
