In [244]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
import seaborn as sns

import warnings
warnings.filterwarnings(action='ignore')


# Load the data
data = pd.read_csv('../datasets/merged_steam_games_.csv')

# convert date_realese columns to unixtime
data['date_release'] = pd.to_datetime(data['date_release'])
data['date_release'] = data['date_release'].apply(lambda x: int(x.timestamp()))

data['date'] = pd.to_datetime(data['date'])
data['date'] = data['date'].apply(lambda x: int(x.timestamp()))

ids = pd.DataFrame(data['app_id'])
# data = data.drop(['app_id'], axis=1)

# 전처리 안된 column 제거
data = data.drop(['categories'], axis=1)

data = data.drop(['title'], axis=1)
data = data.drop(['release_date'], axis=1)  

print(data.shape)
print(data.head())
print(data.columns)
print(data.dtypes)
print(data.describe())


(46365, 56)
   app_id  date_release  win  mac  linux  rating  positive_ratio  \
0  552520    1522022400    1    0      0       7              80   
1  552520    1522022400    1    0      0       7              80   
2  552520    1522022400    1    0      0       7              80   
3  552520    1522022400    1    0      0       7              80   
4  552520    1522022400    1    0      0       7              80   

   user_reviews  price_final  price_original  ...  RPG  Indie  \
0        129943         60.0             0.0  ...    0      0   
1        129943         60.0             0.0  ...    0      0   
2        129943         60.0             0.0  ...    0      0   
3        129943         60.0             0.0  ...    0      0   
4        129943         60.0             0.0  ...    0      0   

   Software Training  Simulation  Game Development  Massively Multiplayer  \
0                  0           0                 0                      0   
1                  0           0  

In [245]:
# 데이터셋에서 게임 데이터 분할
recommend_columns = ['app_id','helpful', 'funny', 'date', 'is_recommended', 'hours',
       'user_id']

recommend_datas = data[recommend_columns]

game_infos = data.drop(recommend_columns[1:], axis=1)

In [246]:
app_datas = game_infos.copy()
scale_cols = ['date_release','positive_ratio','user_reviews','price_final','price_original', 'required_age', 'achievements', 'positive_ratings', 'negative_ratings', 
              'average_playtime', 'median_playtime', 'owners', 'price',]
# scale_cols = [col for col in scaled_data.columns if col not in no_scale_cols]
app_ids = ids['app_id'].unique()
app_ids = pd.DataFrame(app_ids, columns=['app_id'])
print(app_ids)

app_unique_indicies = [ids[ids['app_id'] == value].index[0] for value in app_ids.to_numpy().flatten()]

app_datas = game_infos.iloc[app_unique_indicies]

# Scale the data
scaler = StandardScaler()
app_datas[scale_cols] = scaler.fit_transform(app_datas[scale_cols])

    app_id
0   552520
1   242760
2   444200
3   255710
4      220
..     ...
69  427520
70  814380
71  945360
72  477160
73  250900

[74 rows x 1 columns]


In [247]:
# 스케일 된 게임 데이터와 추천 데이터 다시 합치기
merged_data = pd.merge(recommend_datas, app_datas, on='app_id')
print(merged_data.shape)

(46365, 56)


In [248]:
user_item_matrix = merged_data.pivot_table(index='user_id', columns='app_id', values='is_recommended', aggfunc='max', fill_value=0)
user_item_matrix.fillna(0, inplace=True)
user_item_matrix.head(5)

app_id,220,240,440,550,570,620,730,4000,12210,22380,...,578080,582010,594650,632360,646570,648800,812140,814380,945360,960090
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
491,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1183,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,1,1,1,0,0
2710,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2765,1,1,0,1,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3214,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0


In [249]:
from sklearn.metrics.pairwise import cosine_similarity

user_similarities = cosine_similarity(user_item_matrix)
user_similarities = pd.DataFrame(data = user_similarities, index = user_item_matrix.index, columns = user_item_matrix.index)
user_similarities

user_id,491,1183,2710,2765,3214,4446,15089,15619,16448,18794,...,13755432,13758387,13761904,13763873,13777901,13778085,13778487,13778506,13778726,13781520
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
491,1.000000,0.091287,0.223607,0.376889,0.000000,0.102062,0.111803,0.000000,0.176777,0.223607,...,0.426401,0.235702,0.000000,0.117851,0.125000,0.117851,0.250000,0.117851,0.000000,0.204124
1183,0.091287,1.000000,0.244949,0.385337,0.344265,0.298142,0.326599,0.000000,0.193649,0.081650,...,0.155700,0.344265,0.129099,0.172133,0.365148,0.086066,0.182574,0.086066,0.195180,0.223607
2710,0.223607,0.244949,1.000000,0.337100,0.000000,0.273861,0.300000,0.000000,0.237171,0.100000,...,0.095346,0.210819,0.000000,0.210819,0.000000,0.105409,0.000000,0.105409,0.119523,0.182574
2765,0.376889,0.385337,0.337100,1.000000,0.071067,0.246183,0.269680,0.150756,0.373101,0.471940,...,0.257130,0.213201,0.000000,0.284268,0.376889,0.071067,0.150756,0.071067,0.000000,0.246183
3214,0.000000,0.344265,0.000000,0.071067,1.000000,0.192450,0.105409,0.235702,0.083333,0.000000,...,0.000000,0.000000,0.166667,0.000000,0.235702,0.222222,0.117851,0.000000,0.125988,0.096225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13778085,0.117851,0.086066,0.105409,0.071067,0.222222,0.192450,0.000000,0.000000,0.166667,0.000000,...,0.201008,0.000000,0.166667,0.000000,0.117851,1.000000,0.117851,0.000000,0.251976,0.096225
13778487,0.250000,0.182574,0.000000,0.150756,0.117851,0.000000,0.223607,0.000000,0.088388,0.111803,...,0.426401,0.235702,0.000000,0.117851,0.250000,0.117851,1.000000,0.117851,0.133631,0.102062
13778506,0.117851,0.086066,0.105409,0.071067,0.000000,0.096225,0.105409,0.000000,0.250000,0.210819,...,0.201008,0.222222,0.166667,0.111111,0.117851,0.000000,0.117851,1.000000,0.000000,0.096225
13778726,0.000000,0.195180,0.119523,0.000000,0.125988,0.327327,0.119523,0.000000,0.188982,0.119523,...,0.113961,0.125988,0.377964,0.125988,0.000000,0.251976,0.133631,0.000000,1.000000,0.000000


In [250]:
def get_similar_users(user_id, k, user_similarities):
    # 해당 유저의 정보를 가져온다.
    user_row = user_similarities.loc[user_id,:]
    # 유사도가 높은 순으로 정렬한다.
    user_row_sorted = user_row.sort_values(ascending=False)
    # 유사도가 높은 유저 k명을 가져온다.
    most_similar_users = user_row_sorted[1:k+1]
    return most_similar_users

# DBSCAN으로 하면 모든 데이터가 Noise로 분류되니까
# consine 유사도같은걸로 거리 측정해서 K개의 데이터를 추출하고
# 추출된 데이터끼리 logistic regression으로 학습시키는 방법으로 해야할듯

# 1. 유저가 작성한 리뷰 정보(작성 유무, is_recommended의 값(0과 1))으로 DBSCAN을 사용하여 유저를 클러스터링 한다.
# 2. 클러스터링 된 유저의 리뷰 데이터를 통합한다.
# 3. 통합된 데이터의 is_recommended의 값이 0 또는 1중에서 한가지만 존재한다면 가장 가까운 클러스터와 클러스터를 합친다.
# 4. 2번 과정을 3번 조건이 되지 않을때까지 반복한다.
# 5. 통합된 데이터셋을 Y는 is_recommended, X는 user_id, app_id, Y 값을 제외한 모든 데이터로 하여 train, test데이터셋을 만든다.
# 6. 만들어진 train셋으로 logistic regression 모델을 만들고 test셋으로 evaluate한다.

In [251]:
def recommend(user_id):
    
    x_cols = app_datas.columns
    x_cols = x_cols.drop(['app_id'])
    
    user_review = merged_data[merged_data['user_id'] == user_id]
    reviewed_app_ids = user_review['app_id'].unique()
    
    k = 3
    while True:
        similar_user = get_similar_users(user_id, k, user_similarities)
        similar_user_reviews = merged_data[merged_data['user_id'].isin(similar_user.index)]

        X = similar_user_reviews[x_cols]
        Y = similar_user_reviews['is_recommended']
        
        if len(Y.unique()) == 1:
            k += 1
            continue
    
        break
    
    model = LogisticRegression()
    model.fit(X, Y)
    predict = model.predict_proba(app_datas[x_cols])
    predict = pd.DataFrame(predict, columns=['not_recommended', 'recommended'])
    predict = pd.concat([app_ids, predict], axis=1)
    sorted_predict = predict.sort_values(by='recommended', ascending=False)

    return sorted_predict[:5]

In [252]:
from sklearn.model_selection import train_test_split

user_game_count = pd.DataFrame(merged_data.groupby('user_id')['app_id'].count())
# print(user_game_count[user_game_count['user_id'] > 10].shape)

over_10_users = user_game_count[user_game_count['app_id'] > 10].index
under_10_users = user_game_count[user_game_count['app_id'] <= 10].index
print(over_10_users)

train_X = []
test_X = []

for user in merged_data['user_id'].unique():
    X = merged_data[merged_data['user_id'] == user]
    
    if len(X) <= 10:
        train_X.append(X)
        continue
    
    train, test = train_test_split(X,test_size=0.2)
    train_X.append(train)
    test_X.append(test)

train_X = pd.concat(train_X)
test_X = pd.concat(test_X)
    
print(train_X.shape)
print(test_X.shape)

# get_top_similar_games_for_user(7606333)

Int64Index([    1183,     2765,     4446,    15089,    16448,    48843,
               51559,    73191,    73274,    83283,
            ...
            13703989, 13708103, 13714056, 13715092, 13718408, 13720291,
            13735884, 13737779, 13755432, 13781520],
           dtype='int64', name='user_id', length=1417)
(41908, 56)
(4457, 56)


In [253]:
all_precisions = []
all_recalls = []
hitCount = 0

for user in over_10_users:
    user_test_X = test_X[test_X['user_id'] == user]
    actual_Y = pd.DataFrame(user_test_X[user_test_X['is_recommended'] == 1])
    
    predict_Y = recommend(user)
        
    if not actual_Y.empty and not predict_Y.empty:
        hit = len(set(actual_Y['app_id']) & set(predict_Y['app_id']))
        if hit > 0:
            hitCount += 1
        precision = hit / len(predict_Y['app_id'])
        recall = hit / len(actual_Y['app_id'])
        all_precisions.append(precision)
        all_recalls.append(recall)
            
average_precision = sum(all_precisions) / len(all_precisions)
average_recall = sum(all_recalls) / len(all_recalls)
HitRate = hitCount / len(over_10_users)

print("average_precision:", average_precision)
print("average_recall:", average_recall)
print("HitRate:", HitRate)

average_precision: 0.03756201275691013
average_recall: 0.06383179777935265
HitRate: 0.17572335920959775
