# Predict by KNN

- 게임 정보로 클러스터링(코사인 유사도 거리측정)
- 유저의 추천 게임과 거리 가장 가까운 거리의 게임 추천

## 알고리즘 순서
- 데이터셋 로드
- Scailing
- 테스트 데이터 분리
- KNN 학습 및 예측
- Evaluate


## 데이터셋 로드

games와 recommend 데이터셋을 결합하여 해당 게임을 추천했는지, 안했는지 여부로

In [15]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# Load the data
data = pd.read_csv('../datasets/merged_steam_games_.csv')

# convert date_realese columns to unixtime
data['date_release'] = pd.to_datetime(data['date_release'])
data['date_release'] = data['date_release'].apply(lambda x: int(x.timestamp()))

data['date'] = pd.to_datetime(data['date'])
data['date'] = data['date'].apply(lambda x: int(x.timestamp()))

ids = pd.DataFrame(data['app_id'])
# data = data.drop(['app_id'], axis=1)

# 전처리 안된 column 제거
data = data.drop(['categories'], axis=1)

data = data.drop(['title'], axis=1)
data = data.drop(['release_date'], axis=1)  

print(data.shape)
print(data.head())
print(data.columns)
print(data.dtypes)
print(data.describe())


(46365, 56)
   app_id  date_release  win  mac  linux  rating  positive_ratio  \
0  552520    1522022400    1    0      0       7              80   
1  552520    1522022400    1    0      0       7              80   
2  552520    1522022400    1    0      0       7              80   
3  552520    1522022400    1    0      0       7              80   
4  552520    1522022400    1    0      0       7              80   

   user_reviews  price_final  price_original  ...  Video Production  \
0        129943         60.0             0.0  ...                 0   
1        129943         60.0             0.0  ...                 0   
2        129943         60.0             0.0  ...                 0   
3        129943         60.0             0.0  ...                 0   
4        129943         60.0             0.0  ...                 0   

   Accounting  Design & Illustration  Early Access  Audio Production  \
0           0                      0             0                 0   
1       

In [16]:
# Extract recommendation columns from dataset
recommend_columns = ['app_id','helpful', 'funny', 'date', 'is_recommended', 'hours',
       'user_id']

recommend_datas = data[recommend_columns]

game_infos = data.drop(recommend_columns[1:], axis=1)
scaled_data = game_infos.copy()
unique_values = ids['app_id'].unique()
index_positions = [ids[ids['app_id'] == value].index[0] for value in unique_values]
print(game_infos.iloc[index_positions])

       app_id  date_release  win  mac  linux  rating  positive_ratio  \
0      552520    1522022400    1    0      0       7              80   
815    242760    1525046400    1    0      0       8              95   
2093   444200    1478649600    1    1      0       7              80   
2187   255710    1425945600    1    1      1       7              93   
2725      220    1191974400    1    1      1       8              97   
...       ...           ...  ...  ...    ...     ...             ...   
42719  427520    1597363200    1    1      1       8              96   
43254  814380    1553126400    1    0      0       8              95   
43938  945360    1542326400    1    0      0       7              92   
45411  477160    1469145600    1    1      0       7              94   
45819  250900    1415059200    1    1      1       8              97   

       user_reviews  price_final  price_original  ...  Video Production  \
0            129943        60.00            0.00  ...       

## Scailing

In [17]:


scale_cols = ['date_release','positive_ratio','user_reviews','price_final','price_original', 'required_age', 'achievements', 'positive_ratings', 'negative_ratings', 
              'average_playtime', 'median_playtime', 'owners', 'price',]
# scale_cols = [col for col in scaled_data.columns if col not in no_scale_cols]


scaled_data = game_infos.iloc[index_positions]

# Scale the data
scaler = StandardScaler()
scaled_data[scale_cols] = scaler.fit_transform(scaled_data[scale_cols])

print(scaled_data.head())

      app_id  date_release  win  mac  linux  rating  positive_ratio  \
0     552520      0.729471    1    0      0       7       -1.339144   
815   242760      0.758106    1    0      0       8        0.611578   
2093  444200      0.318765    1    1      0       7       -1.339144   
2187  255710     -0.180300    1    1      1       7        0.351481   
2725     220     -2.395823    1    1      1       8        0.871674   

      user_reviews  price_final  price_original  ...  Video Production  \
0        -0.360355     2.494874       -0.228311  ...                 0   
815      -0.046625    -0.006719       -0.228311  ...                 0   
2093     -0.358830    -1.257515       -0.228311  ...                 0   
2187     -0.307168     0.618679       -0.228311  ...                 0   
2725     -0.368811    -0.632742        2.842675  ...                 0   

      Accounting  Design & Illustration  Early Access  Audio Production  \
0              0                      0             0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scaled_data[scale_cols] = scaler.fit_transform(scaled_data[scale_cols])


# KNN 학습

 최종적으로는 knn결과(distances, indices를 csv형식으로 저장하여 학습 절차를 skip한다)
 
 distances = k개의 이웃간의 거리, indices = k개의 이웃 인덱스

In [18]:
knn = NearestNeighbors(n_neighbors=10, algorithm='brute', metric='cosine')
knn.fit(scaled_data)

distances, indices = knn.kneighbors(scaled_data, n_neighbors=3)

In [19]:
distances.shape , indices.shape

((74, 3), (74, 3))

In [20]:
# import joblib
print(pd.DataFrame(distances).head())
print(pd.DataFrame(indices).head())
save_distances = pd.concat([pd.DataFrame(unique_values, columns=['app_id']), pd.DataFrame(distances)], axis=1)
indices_mapping = pd.DataFrame(indices)
indices_mapping = indices_mapping.applymap(lambda x: unique_values[x])
save_indices = pd.concat([pd.DataFrame(unique_values, columns=['app_id']), indices_mapping], axis=1)

# joblib.dump(knn, './models/knn_model.pkl')
# joblib.dump(save_distances, './models/knn_distances.pkl')
# joblib.dump(save_indices, './models/knn_indices.pkl')

              0             1             2
0  0.000000e+00  5.738877e-05  8.021221e-05
1  0.000000e+00  6.910418e-07  5.164527e-06
2  0.000000e+00  2.057032e-08  4.982731e-07
3  1.110223e-16  2.115668e-08  2.283684e-06
4  3.330669e-16  1.094936e-10  2.882450e-09
   0   1   2
0  0  72  31
1  1  25   6
2  2  41  31
3  3  26  22
4  4  49  38


## Recommendation 데이터 기반 게임 추천

In [21]:
# Load the trained model
# loaded_distances = pd.DataFrame(joblib.load('./models/knn_distances.pkl'))
# loaded_indices = pd.DataFrame(joblib.load('./models/knn_indices.pkl'))
loaded_distances = pd.DataFrame(save_distances.copy())
loaded_indices = pd.DataFrame(save_indices.copy())

title = pd.read_csv('../datasets/preprocessed_games.csv')
title = title[['app_id', 'title']]
id_title = pd.merge(pd.DataFrame(unique_values, columns=['app_id']), title, on='app_id', how='inner')

print(id_title.head())
print(loaded_distances.head())
print(loaded_indices.head())

   app_id                 title
0  552520            Far Cry® 5
1  242760            The Forest
2  444200  World of Tanks Blitz
3  255710      Cities: Skylines
4     220           Half-Life 2
   app_id             0             1             2
0  552520  0.000000e+00  5.738877e-05  8.021221e-05
1  242760  0.000000e+00  6.910418e-07  5.164527e-06
2  444200  0.000000e+00  2.057032e-08  4.982731e-07
3  255710  1.110223e-16  2.115668e-08  2.283684e-06
4     220  3.330669e-16  1.094936e-10  2.882450e-09
   app_id       0       1       2
0  552520  552520  477160  646570
1  242760  242760  238960  582010
2  444200  444200  444090  646570
3  255710  255710  108600  270880
4     220     220     240     620


In [22]:
recommend_datas

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id
0,552520,7,0,1581120000,1,60.6,7606333
1,552520,2,0,1595203200,1,38.8,11805207
2,552520,0,0,1615680000,1,61.0,10684038
3,552520,63,13,1595721600,1,17.9,751598
4,552520,67,12,1601942400,1,301.5,5432359
...,...,...,...,...,...,...,...
46360,250900,6,3,1593734400,1,202.7,3344637
46361,250900,0,2,1593129600,1,314.0,5881798
46362,250900,0,0,1593302400,1,240.0,4810432
46363,250900,0,0,1612224000,1,527.1,12841144


## 테스트 데이터 분리

In [23]:
data

Unnamed: 0,app_id,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,...,Video Production,Accounting,Design & Illustration,Early Access,Audio Production,Utilities,Nudity,Software Training,Action,Photo Editing
0,552520,1522022400,1,0,0,7,80,129943,60.0,0.0,...,0,0,0,0,0,0,0,0,1,0
1,552520,1522022400,1,0,0,7,80,129943,60.0,0.0,...,0,0,0,0,0,0,0,0,1,0
2,552520,1522022400,1,0,0,7,80,129943,60.0,0.0,...,0,0,0,0,0,0,0,0,1,0
3,552520,1522022400,1,0,0,7,80,129943,60.0,0.0,...,0,0,0,0,0,0,0,0,1,0
4,552520,1522022400,1,0,0,7,80,129943,60.0,0.0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46360,250900,1415059200,1,1,1,8,97,225815,15.0,0.0,...,0,0,0,0,0,0,0,0,1,0
46361,250900,1415059200,1,1,1,8,97,225815,15.0,0.0,...,0,0,0,0,0,0,0,0,1,0
46362,250900,1415059200,1,1,1,8,97,225815,15.0,0.0,...,0,0,0,0,0,0,0,0,1,0
46363,250900,1415059200,1,1,1,8,97,225815,15.0,0.0,...,0,0,0,0,0,0,0,0,1,0


In [24]:


user_game_count = pd.DataFrame(data.groupby('user_id')['user_id', 'app_id'].count())
# print(user_game_count[user_game_count['user_id'] > 10].shape)

over_10_users = user_game_count[user_game_count['app_id'] > 10]['user_id']
under_10_users = user_game_count[user_game_count['app_id'] <= 10]['user_id']

train_X = []
test_X = []

for user in recommend_datas['user_id'].unique():
    X = recommend_datas[recommend_datas['user_id'] == user]
    
    if len(X) <= 10:
        train_X.append(X)
        continue
    
    # t_size = 5 / len(X) 
    # train, test = train_test_split(X,train_size= 1- t_size ,test_size=t_size)
    train, test = train_test_split(X,test_size=0.2)
    train_X.append(train)
    test_X.append(test)

train_X = pd.concat(train_X)
test_X = pd.concat(test_X)
    
print(train_X.shape)
print(test_X.shape)


# Split the data into train and test sets
# train_x, test_x = train_test_split(scaled_data)

  user_game_count = pd.DataFrame(data.groupby('user_id')['user_id', 'app_id'].count())


(41908, 7)
(4457, 7)


## Predict

In [25]:

# for user in recommend_datas['user_id'].unique():
def recommend(user):
    KNN_recommend_result = pd.DataFrame(columns=['user_id', 'app_id', 'title', 'distance'])
    # 10번 유저 게임 추천
    rec_list = train_X.loc[(train_X['user_id'] == user) & (train_X['is_recommended'] == 1)]

    # 10번 유저의 추천 게임과 유사한 K개 게임 추출
    merged_idices = pd.merge(rec_list, loaded_indices, on='app_id')[[1, 2]]
    merged_idices = merged_idices.values.flatten().tolist()

    merged_distances = pd.merge(rec_list, loaded_distances, on='app_id')[[1, 2]]
    merged_distances = merged_distances.values.flatten().tolist()

    recommend_result = pd.merge(id_title, pd.DataFrame({'distance': merged_distances, 'app_id': merged_idices}), on="app_id", how="inner")
    recommend_result = pd.concat([pd.DataFrame({'user_id' : [user] * recommend_result.shape[0]}), recommend_result], axis =1).sort_values(by='distance', ascending=True).iloc[:5]
    
    if train_X.size != 0:
        KNN_recommend_result = pd.concat([KNN_recommend_result, recommend_result])
    return recommend_result
    
# joblib.dump(KNN_recommend_result, './models/knn_recommend_result.pkl')

In [26]:
recommend(7606333)

Unnamed: 0,user_id,app_id,title,distance
9,7606333,306130,The Elder Scrolls® Online,6.147283e-08
6,7606333,381210,Dead by Daylight,1.972394e-07
2,7606333,413150,Stardew Valley,3.325829e-07
7,7606333,284160,BeamNG.drive,3.946931e-07
3,7606333,359550,Tom Clancy's Rainbow Six® Siege,6.395513e-07


## Evaluate

- recommendation에서 유저의 n%를 랜덤으로 제외
- 나머지는 학습에 사용
- 제외된 유저에서 n%의 데이터로 추천을 받는다.
- 추천받은 데이터와 실제 데이터간 자카드 유사도 공식 계산


In [27]:

# 10개 이상 유저, 아닌 유저 분할
user_game_count = pd.DataFrame(data.groupby('user_id')['app_id'].count())
# print(user_game_count[user_game_count['user_id'] > 10].shape)

over_10_users = user_game_count[user_game_count['app_id'] > 10].index
under_10_users = user_game_count[user_game_count['app_id'] <= 10].index
print(over_10_users)
# under_10_users = user_game_count[user_game_count['app_id'] <= 10]['user_id']

Int64Index([    1183,     2765,     4446,    15089,    16448,    48843,
               51559,    73191,    73274,    83283,
            ...
            13703989, 13708103, 13714056, 13715092, 13718408, 13720291,
            13735884, 13737779, 13755432, 13781520],
           dtype='int64', name='user_id', length=1417)


In [28]:


all_precisions = []
all_recalls = []
hitCount = 0

for user in over_10_users:
    user_test_X = test_X[test_X['user_id'] == user]
    actual_Y = user_test_X[user_test_X['is_recommended'] == 1]
    
    predict_Y = recommend(user)
    
    
    
    if not actual_Y.empty and not predict_Y.empty:
        hit = len(set(actual_Y['app_id']) & set(predict_Y['app_id']))
        if hit > 0:
            hitCount += 1
        precision = hit / len(predict_Y['app_id'])
        recall = hit / len(actual_Y['app_id'])
        all_precisions.append(precision)
        all_recalls.append(recall)
            
average_precision = sum(all_precisions) / len(all_precisions)
average_recall = sum(all_recalls) / len(all_recalls)
HitRate = hitCount / len(over_10_users)

print("average_precision:", average_precision)
print("average_recall:", average_recall)
print("HitRate:", HitRate)

average_precision: 0.047980155917789015
average_recall: 0.08158516418615627
HitRate: 0.21524347212420608
