# Predict by KNN

- 게임 정보로 클러스터링(코사인 유사도 거리측정)
- 유저의 추천 게임과 거리 가장 가까운 거리의 게임 추천

## 알고리즘 순서
- 데이터셋 로드
- Scailing
- 테스트 데이터 분리
- KNN 학습 및 예측
- Evaluate


## 데이터셋 로드

games와 recommend 데이터셋을 결합하여 해당 게임을 추천했는지, 안했는지 여부로

In [141]:
# 데이터셋 로드 및 불필요한 column 제거
import numpy as np
import pandas as pd

# Load the data
data = pd.read_csv('../datasets/merged_steam_games_.csv')

# convert date_realese columns to unixtime
data['date_release'] = pd.to_datetime(data['date_release'])
data['date_release'] = data['date_release'].apply(lambda x: int(x.timestamp()))

data['date'] = pd.to_datetime(data['date'])
data['date'] = data['date'].apply(lambda x: int(x.timestamp()))

ids = pd.DataFrame(data['app_id'])
# data = data.drop(['app_id'], axis=1)

# 전처리 안된 column 제거
data = data.drop(['categories'], axis=1)

data = data.drop(['title'], axis=1)
data = data.drop(['release_date'], axis=1)  

print(data.shape)
print(data.head())
print(data.columns)
print(data.dtypes)
print(data.describe())


(46365, 56)
   app_id  date_release  win  mac  linux  rating  positive_ratio  \
0  552520    1522022400    1    0      0       7              80   
1  552520    1522022400    1    0      0       7              80   
2  552520    1522022400    1    0      0       7              80   
3  552520    1522022400    1    0      0       7              80   
4  552520    1522022400    1    0      0       7              80   

   user_reviews  price_final  price_original  ...  RPG  Indie  \
0        129943         60.0             0.0  ...    0      0   
1        129943         60.0             0.0  ...    0      0   
2        129943         60.0             0.0  ...    0      0   
3        129943         60.0             0.0  ...    0      0   
4        129943         60.0             0.0  ...    0      0   

   Software Training  Simulation  Game Development  Massively Multiplayer  \
0                  0           0                 0                      0   
1                  0           0  

In [142]:
# 데이터셋에서 게임 데이터 분할
recommend_columns = ['app_id','helpful', 'funny', 'date', 'is_recommended', 'hours',
       'user_id']

recommend_datas = data[recommend_columns]

game_infos = data.drop(recommend_columns[1:], axis=1)

## Scailing

In [143]:
from sklearn.preprocessing import StandardScaler
scaled_data = game_infos.copy()
scale_cols = ['date_release','positive_ratio','user_reviews','price_final','price_original', 'required_age', 'achievements', 'positive_ratings', 'negative_ratings', 
              'average_playtime', 'median_playtime', 'owners', 'price',]
# scale_cols = [col for col in scaled_data.columns if col not in no_scale_cols]
unique_values = ids['app_id'].unique()
print(unique_values)

index_positions = [ids[ids['app_id'] == value].index[0] for value in unique_values]

scaled_data = game_infos.iloc[index_positions]

# Scale the data
scaler = StandardScaler()
scaled_data[scale_cols] = scaler.fit_transform(scaled_data[scale_cols])

[552520 242760 444200 255710    220 594650 582010    550 271590 294100
 107410 292030 105600 438100 252490    570 377160  22380 239140 812140
 632360 221100 270880 251570 289070 238960 108600 548430 367520    730
 264710 646570 275850 230410 413150 359550 218620 435150    620 648800
 518790 444090 960090   4000 489830 381210 311210 374320  12210    240
    440  48700  49520 203160 227300 236390 262060 284160 291550 304930
 306130 319630 322330 346110 379720 391540 394360 578080 431960 427520
 814380 945360 477160 250900]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scaled_data[scale_cols] = scaler.fit_transform(scaled_data[scale_cols])


## 테스트 데이터 분리

In [144]:
from sklearn.model_selection import train_test_split
# Split the data into train and test sets
# train_x, test_x = train_test_split(scaled_data)

# KNN 학습

 최종적으로는 knn결과(distances, indices를 csv형식으로 저장하여 학습 절차를 skip한다)
 
 distances = k개의 이웃간의 거리, indices = k개의 이웃 인덱스

In [145]:
from sklearn.neighbors import NearestNeighbors
import joblib

knn = NearestNeighbors(n_neighbors=5, algorithm='brute', metric='cosine')
knn.fit(scaled_data)

distances, indices = knn.kneighbors(scaled_data, n_neighbors=3)

In [146]:
print(distances.shape)
print(indices.shape)
save_distances = pd.concat([pd.DataFrame(unique_values, columns=['app_id']), pd.DataFrame(distances)], axis=1)
indices_mapping = pd.DataFrame(indices)
indices_mapping = indices_mapping.applymap(lambda x: unique_values[x])
save_indices = pd.concat([pd.DataFrame(unique_values, columns=['app_id']), indices_mapping], axis=1)

joblib.dump(knn, './models/knn_model.pkl')
joblib.dump(save_distances, './models/knn_distances.pkl')
joblib.dump(save_indices, './models/knn_indices.pkl')

(74, 3)
(74, 3)


['./models/knn_indices.pkl']

## Recommendation 데이터 기반 게임 추천

In [147]:
# Load the trained model
loaded_distances = pd.DataFrame(joblib.load('./models/knn_distances.pkl'))
loaded_indices = pd.DataFrame(joblib.load('./models/knn_indices.pkl'))

title = pd.read_csv('../datasets/preprocessed_games.csv')
title = title[['app_id', 'title']]
id_title = pd.merge(pd.DataFrame(unique_values, columns=['app_id']), title, on='app_id', how='inner')

print(id_title.head())
print(loaded_distances.head())
print(loaded_indices.head())

   app_id                 title
0  552520            Far Cry® 5
1  242760            The Forest
2  444200  World of Tanks Blitz
3  255710      Cities: Skylines
4     220           Half-Life 2
   app_id             0             1             2
0  552520  0.000000e+00  5.738877e-05  8.021221e-05
1  242760  0.000000e+00  6.910418e-07  5.164527e-06
2  444200  0.000000e+00  2.057032e-08  4.982731e-07
3  255710  1.110223e-16  2.115668e-08  2.283684e-06
4     220  3.330669e-16  1.094936e-10  2.882450e-09
   app_id       0       1       2
0  552520  552520  477160  646570
1  242760  242760  238960  582010
2  444200  444200  444090  646570
3  255710  255710  108600  270880
4     220     220     240     620


In [149]:
recommend_datas.head()

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id
0,552520,7,0,1581120000,1,60.6,7606333
1,552520,2,0,1595203200,1,38.8,11805207
2,552520,0,0,1615680000,1,61.0,10684038
3,552520,63,13,1595721600,1,17.9,751598
4,552520,67,12,1601942400,1,301.5,5432359


## Predict

In [150]:
KNN_recommend_result = pd.DataFrame(columns=['user_id', 'app_id', 'title', 'distance'])
# for user in recommend_datas['user_id'].unique():
for user in [7606333]:
    # 10번 유저 게임 추천
    rec_list = recommend_datas.loc[(recommend_datas['user_id'] == user) & (recommend_datas['is_recommended'] == 1)]

    # 10번 유저의 추천 게임과 유사한 K개 게임 추출
    merged_idices = pd.merge(rec_list, loaded_indices, on='app_id')[[1, 2]]
    merged_idices = merged_idices.values.flatten().tolist()

    merged_distances = pd.merge(rec_list, loaded_distances, on='app_id')[[1, 2]]
    merged_distances = merged_distances.values.flatten().tolist()

    recommend_result = pd.merge(id_title, pd.DataFrame({'distance': merged_distances, 'app_id': merged_idices}), on="app_id", how="inner")
    recommend_result = pd.concat([pd.DataFrame({'user_id' : [user] * recommend_result.shape[0]}), recommend_result], axis =1).sort_values(by='distance', ascending=True).iloc[:5]
    
    if recommend_datas.size != 0:
        KNN_recommend_result = pd.concat([KNN_recommend_result, recommend_result])
    print(KNN_recommend_result)
    
# joblib.dump(KNN_recommend_result, './models/knn_recommend_result.pkl')

   user_id  app_id                            title      distance
9  7606333  306130        The Elder Scrolls® Online  6.147283e-08
6  7606333  381210                 Dead by Daylight  1.972394e-07
2  7606333  413150                   Stardew Valley  3.325829e-07
7  7606333  284160                     BeamNG.drive  3.946931e-07
3  7606333  359550  Tom Clancy's Rainbow Six® Siege  6.395513e-07
   user_id  app_id                            title      distance
9  7606333  306130        The Elder Scrolls® Online  6.147283e-08
6  7606333  381210                 Dead by Daylight  1.972394e-07
2  7606333  413150                   Stardew Valley  3.325829e-07
7  7606333  284160                     BeamNG.drive  3.946931e-07
3  7606333  359550  Tom Clancy's Rainbow Six® Siege  6.395513e-07


## Evaluate

- recommendation에서 유저의 n%를 랜덤으로 제외
- 나머지는 학습에 사용
- 제외된 유저에서 n%의 데이터로 추천을 받는다.
- 추천받은 데이터와 실제 데이터간 자카드 유사도 공식 계산
