# Predict by KNN

- 게임 정보로 클러스터링(코사인 유사도 거리측정)
- 유저의 추천 게임과 거리 가장 가까운 거리의 게임 추천

## 알고리즘 순서
- 데이터셋 로드
- Scailing
- 테스트 데이터 분리
- KNN 학습 및 예측
- Evaluate


## 데이터셋 로드

games와 recommend 데이터셋을 결합하여 해당 게임을 추천했는지, 안했는지 여부로

In [75]:
# 데이터셋 로드 및 불필요한 column 제거
import numpy as np
import pandas as pd

# Load the data
data = pd.read_csv('../datasets/merged_steam_games.csv')

# convert date_realese columns to unixtime
data['release_date'] = pd.to_datetime(data['release_date'])
data['release_date'] = data['release_date'].apply(lambda x: int(x.timestamp()))

data['date'] = pd.to_datetime(data['date'])
data['date'] = data['date'].apply(lambda x: int(x.timestamp()))

ids = pd.DataFrame(data['app_id'])
# data = data.drop(['app_id'], axis=1)

# 전처리 안된 column 제거
data = data.drop(['categories'], axis=1)

print(data.shape)
print(data.head())
print(data.columns)
print(data.dtypes)
print(data.describe())


(9764026, 54)
   app_id  helpful  funny        date  is_recommended  hours   user_id  win  \
0  304390        4      0  1619308800               0   42.5  10178125    1   
1  304390        0      0  1612310400               1  265.8   9545809    1   
2  304390        0      0  1604880000               0  493.3   2968763    1   
3  304390        0      0  1624579200               1  114.5  13598296    1   
4  304390        0      0  1617408000               0  226.0   6549932    1   

   mac  linux  ...  RPG  Indie  Software Training  Simulation  \
0    0      0  ...    0      0                  0           0   
1    0      0  ...    0      0                  0           0   
2    0      0  ...    0      0                  0           0   
3    0      0  ...    0      0                  0           0   
4    0      0  ...    0      0                  0           0   

   Game Development  Massively Multiplayer  Early Access  Nudity  Strategy  \
0                 0                      0

In [76]:
# 데이터셋에서 게임 데이터 분할
recommend_columns = ['app_id','helpful', 'funny', 'date', 'is_recommended', 'hours',
       'user_id']

recommend_datas = data[recommend_columns]

game_infos = data.drop(recommend_columns[1:], axis=1)

## Scailing

In [77]:
from sklearn.preprocessing import StandardScaler
scaled_data = game_infos.copy()
scale_cols = ['release_date','positive_ratio','user_reviews','price_final','price_original', 'required_age', 'achievements', 'positive_ratings', 'negative_ratings', 
              'average_playtime', 'median_playtime', 'owners', 'price',]
# scale_cols = [col for col in scaled_data.columns if col not in no_scale_cols]
unique_values = ids['app_id'].unique()
print(unique_values)

index_positions = [ids[ids['app_id'] == value].index[0] for value in unique_values]

scaled_data = game_infos.iloc[index_positions]

# Scale the data
scaler = StandardScaler()
scaled_data[scale_cols] = scaler.fit_transform(scaled_data[scale_cols])

[304390 306130 238960 ... 431700 563200 813650]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scaled_data[scale_cols] = scaler.fit_transform(scaled_data[scale_cols])


## 테스트 데이터 분리

In [None]:
from sklearn.model_selection import train_test_split
# Split the data into train and test sets
# train_x, test_x = train_test_split(scaled_data)

(5348,)


# KNN 학습

 최종적으로는 knn결과(distances, indices를 csv형식으로 저장하여 학습 절차를 skip한다)
 
 distances = k개의 이웃간의 거리, indices = k개의 이웃 인덱스

In [78]:
from sklearn.neighbors import NearestNeighbors
import joblib

knn = NearestNeighbors(n_neighbors=5, algorithm='brute', metric='cosine')
knn.fit(scaled_data)

distances, indices = knn.kneighbors(scaled_data, n_neighbors=3)

In [79]:
print(distances.shape)
print(indices.shape)
save_distances = pd.concat([pd.DataFrame(unique_values, columns=['app_id']), pd.DataFrame(distances)], axis=1)
indices_mapping = pd.DataFrame(indices)
indices_mapping = indices_mapping.applymap(lambda x: unique_values[x])
save_indices = pd.concat([pd.DataFrame(unique_values, columns=['app_id']), indices_mapping], axis=1)

joblib.dump(knn, './models/knn_model.pkl')
joblib.dump(save_distances, './models/knn_distances.pkl')
joblib.dump(save_indices, './models/knn_indices.pkl')

(5348, 3)
(5348, 3)


['./models/knn_indices.pkl']

## Recommendation 데이터 기반 게임 추천

In [80]:
# Load the trained model
loaded_distances = pd.DataFrame(joblib.load('./models/knn_distances.pkl'))
loaded_indices = pd.DataFrame(joblib.load('./models/knn_indices.pkl'))

title = pd.read_csv('../datasets/preprocessed_games.csv')
title = title[['app_id', 'title']]
id_title = pd.merge(pd.DataFrame(unique_values, columns=['app_id']), title, on='app_id', how='inner')

print(id_title.head())
print(loaded_distances.head())
print(loaded_indices.head())

   app_id                             title
0  304390                        FOR HONOR™
1  306130         The Elder Scrolls® Online
2  238960                     Path of Exile
3     730  Counter-Strike: Global Offensive
4  255710                  Cities: Skylines
   app_id             0             1             2
0  304390  0.000000e+00  3.998324e-11  4.324785e-11
1  306130  0.000000e+00  1.010529e-10  1.027296e-10
2  238960  1.110223e-16  6.388268e-11  7.097545e-11
3     730  0.000000e+00  3.244271e-03  5.115048e-03
4  255710  0.000000e+00  2.856348e-11  3.208545e-11
   app_id       0       1       2
0  304390  304390  278970  552500
1  306130  306130  335300  287700
2  238960  238960  200210  224260
3     730     730     570     440
4  255710  255710  281990  294100
liked games
         app_id  helpful  funny        date  is_recommended  hours  user_id
399948   289070        0      0  1616976000               1   80.3       10
6316135  644930        0      0  1610150400             

## Predict

In [101]:
KNN_recommend_result = pd.DataFrame(columns=['user_id', 'app_id', 'title', 'distance'])
# for user in recommend_datas['user_id'].unique():
for user in [10]:
    # 10번 유저 게임 추천
    rec_list = recommend_datas.loc[(recommend_datas['user_id'] == user) & (recommend_datas['is_recommended'] == 1)]

    # 10번 유저의 추천 게임과 유사한 K개 게임 추출
    merged_idices = pd.merge(rec_list, loaded_indices, on='app_id')[[1, 2]]
    merged_idices = merged_idices.values.flatten().tolist()

    merged_distances = pd.merge(rec_list, loaded_distances, on='app_id')[[1, 2]]
    merged_distances = merged_distances.values.flatten().tolist()

    recommend_result = pd.merge(id_title, pd.DataFrame({'distance': merged_distances, 'app_id': merged_idices}), on="app_id", how="inner")
    recommend_result = pd.concat([pd.DataFrame({'user_id' : [user] * recommend_result.shape[0]}), recommend_result], axis =1).sort_values(by='distance', ascending=True).iloc[:5]
    
    if recommend_datas.size != 0:
        KNN_recommend_result = pd.concat([KNN_recommend_result, recommend_result])
    print(KNN_recommend_result)
    
print(KNN_recommend_result)
# joblib.dump(KNN_recommend_result, './models/knn_recommend_result.pkl')

    user_id  app_id                                       title      distance
6  10178125  476460                               Picross Touch  3.971823e-12
2  10178125  632360                              Risk of Rain 2  8.101741e-12
1  10178125  570940                     DARK SOULS™: REMASTERED  8.294254e-12
7  10178125  568770  Cinderella Phenomenon - Otome/Visual Novel  9.019896e-12
4  10178125  371660                             Far Cry® Primal  1.252998e-11
    user_id  app_id                                       title      distance
6  10178125  476460                               Picross Touch  3.971823e-12
2  10178125  632360                              Risk of Rain 2  8.101741e-12
1  10178125  570940                     DARK SOULS™: REMASTERED  8.294254e-12
7  10178125  568770  Cinderella Phenomenon - Otome/Visual Novel  9.019896e-12
4  10178125  371660                             Far Cry® Primal  1.252998e-11
1   9545809  278970                               Digger Online 

KeyboardInterrupt: 

## Evaluate

- recommendation에서 유저의 n%를 랜덤으로 제외
- 나머지는 학습에 사용
- 제외된 유저에서 n%의 데이터로 추천을 받는다.
- 추천받은 데이터와 실제 데이터간 자카드 유사도 공식 계산
