In [41]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

### 유저 데이터

In [42]:
users_df = pd.read_csv("../datasets/users.csv")

### 게임 데이터

In [43]:
games_df = pd.read_csv("../datasets/games.csv")

In [44]:
# object -> String으로 바꾸기
object_select = games_df.select_dtypes(include="object").columns
games_df[object_select] = games_df[object_select].astype('string')

In [45]:
# date_release -> datetime으로 바꾸기
games_df['date_release'] = pd.to_datetime(games_df['date_release'])

In [46]:
bool_column = ["win", "mac", "linux", "steam_deck"] # bool형 컬럼 1,0으로 교체
for column in bool_column:
    games_df[column] = games_df[column].replace({True:1, False:0})

In [47]:
# rating 컬럼 라벨 인코딩
rating_mapping = {
    'Overwhelmingly Positive': "8",
    'Very Positive': "7",
    'Mostly Positive': "6",
    'Positive': "5",
    'Mixed': "4",
    'Negative': "3",
    'Mostly Negative': "2",
    'Very Negative': "1",
    'Overwhelmingly Negative': "0"
}

# replace 메소드를 사용하여 문자열을 숫자로 변환
games_df['rating'] = games_df['rating'].replace(rating_mapping)

In [48]:
games_df.to_csv("../datasets/preprocessed_games.csv")

#### 추천 데이터

In [49]:
rc_df = pd.read_csv("../datasets/recommendations.csv")
rc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38347614 entries, 0 to 38347613
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   app_id          int64  
 1   helpful         int64  
 2   funny           int64  
 3   date            object 
 4   is_recommended  bool   
 5   hours           float64
 6   user_id         int64  
 7   review_id       int64  
dtypes: bool(1), float64(1), int64(5), object(1)
memory usage: 2.0+ GB


#### app_id: 제품 ID
#### helpful: 얼마나 많은 유저들이 helpful한 리뷰의 수
#### funny: 얼마나 많은 유저들이 funny한 리뷰의 수
#### date: 출시일
#### is_recommended: 사용자가 추천 했나 안했나
#### user_id: 유저 ID
#### review_id: 자동 생성 ID??

In [50]:
rc_df['date'] = rc_df["date"].astype("string")
rc_df["date"] = pd.to_datetime(rc_df["date"])

In [51]:
# True를 1, False를 0으로 교체
rc_df["is_recommended"] = rc_df["is_recommended"].replace({True:1, False:0})

In [52]:
rc_df.to_csv("../datasets/pre_recommendations.csv",index=False)

### 스팀 데이터

In [53]:
steam=pd.read_csv("../datasets/steam.csv")
steam.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27075 entries, 0 to 27074
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   appid             27075 non-null  int64  
 1   name              27075 non-null  object 
 2   release_date      27075 non-null  object 
 3   english           27075 non-null  int64  
 4   developer         27075 non-null  object 
 5   publisher         27075 non-null  object 
 6   platforms         27075 non-null  object 
 7   required_age      27075 non-null  int64  
 8   categories        27075 non-null  object 
 9   genres            27075 non-null  object 
 10  steamspy_tags     27075 non-null  object 
 11  achievements      27075 non-null  int64  
 12  positive_ratings  27075 non-null  int64  
 13  negative_ratings  27075 non-null  int64  
 14  average_playtime  27075 non-null  int64  
 15  median_playtime   27075 non-null  int64  
 16  owners            27075 non-null  object

#### appid: 아이디
#### name: 게임이름
#### release_date: 출시일
#### english: 영어지원여부
#### developer: 개발자(unique 값은 17113)
#### publisher: 출판자(unique 값은 14354)
#### platforms: 윈도우, 맥, 리눅스 등등 지원 여부(윈도우 68%, 윈,맥,리: 17% , 그외 나머지)
#### required_age: PEGI UK 기준에 따른 최소 필요 연령
#### categories: 세미콜론으로 구분된 게임 카테고리 single player, multi player,,
#### genres: 세미콜론으로 구분된 게임 장르
#### steamspy_tags: genres와 유사하지만 커뮤니티 투표로 장르?
#### achievements: 게임 내 성과의 수
#### positive_ratings: Number of positive ratings
#### negative_ratings: Number of negative ratings
#### average_playtime: 사용자 평균 플레이 시간
#### median_playtime: 사용자 중앙 플레이 시간 값
#### owners: 소유자 수 (0 ~ 20000: 69%, 20000 ~ 50000: 11%, Other: 20%)
#### price: 현재 가격

In [54]:
# release_Date -> datetime 변환
# developer, publisher, name는 제거했다가 나중에 필요할 때 사용하는 식
# platforms -> 원핫하거나
# categories -> 원핫
# genres -> int
# steamspy_tags -> 원핫
# owners -> 원핫
steam.head()

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99


In [55]:
steam["platforms"].value_counts()

windows              18398
windows;mac;linux     4623
windows;mac           3439
windows;linux          610
mac                      3
mac;linux                1
linux                    1
Name: platforms, dtype: int64

In [56]:
def one_hot_encode_multilabel(dataframe, column):
    # ;으로 나누기
    split_items = dataframe[column].str.split(';')
    
    # 각 고윳값 라벨
    unique_labels = set(item for sublist in split_items for item in sublist)

    # 각 라벨에 대해 열 생성
    for label in unique_labels:
        dataframe[label] = split_items.apply(lambda x: int(label in x))

    return dataframe.drop(column, axis=1)

def categorize_range(range_str):
    # 범위를 -를 기준으로 설정
    lower, upper = map(int, range_str.split('-'))
    if lower >= 0 and upper <= 20000:
        return 0
    elif lower >= 20000 and upper <= 50000:
        return 1
    else:
        return 2

In [57]:
columns = ["genres"]
# 각 컬럼에 대해 원핫 인코딩
for column in columns:
    steam = one_hot_encode_multilabel(steam, column)

In [58]:
steam['owners'] = steam['owners'].apply(categorize_range)

In [59]:
steam["release_date"] = pd.to_datetime(steam["release_date"])

In [60]:
steam.to_csv("../datasets/pre_steam.csv", index=False)

In [61]:
path = "../datasets/"
game_df = pd.read_csv(path+"preprocessed_games.csv", index_col=0)
pre_steam3 = pd.read_csv(path+"pre_steam.csv")
rc_df = pd.read_csv(path+"pre_recommendations.csv")

# 추천 데이터와 게임 데이터 합치기

In [62]:
merge_game_df = pd.merge(game_df, rc_df, on='app_id', how='inner')
merge_game_df['date'] = pd.to_datetime(merge_game_df['date'], errors='coerce')

In [63]:
# 게임 id와 유저 리뷰 개수 그룹핑
grouped_df = merge_game_df.groupby('app_id')['user_reviews'].sum().reset_index()

# 정렬
sorted_grouped_df = grouped_df.sort_values(by='user_reviews', ascending=False)

# 100개 선택
top_100_app_ids = sorted_grouped_df.head(100)['app_id']

top_100_filtered_df = merge_game_df[merge_game_df['app_id'].isin(top_100_app_ids)]

In [64]:
final_df = top_100_filtered_df[(top_100_filtered_df['hours'] > 12) & (top_100_filtered_df['date'].dt.year >= 2020)]

# 사용시간 12시간 이상 유저이면서 2020년 최근 리뷰, 리뷰 수가 10개 이상인 유저 필터링
user_id_counts = final_df['user_id'].value_counts()
user_ids_with_more_than_10 = user_id_counts[user_id_counts > 10].index
final_filtered_df = final_df[final_df['user_id'].isin(user_ids_with_more_than_10)]

#### 모델링에 영향이 크게 없는 name, developer, publisher, english, platforms, steamspy_tags들을 데이터 줄이기 위해 제거

In [65]:
pre_steam3.drop(["name", "developer", "publisher", "english", "platforms", "steamspy_tags"], axis=1, inplace=True)
pre_steam3.rename(columns={"appid": "app_id"}, inplace=True)

In [66]:
merged_steam_games = pd.merge(final_filtered_df, pre_steam3, on='app_id', how='inner')

In [67]:
merged_steam_games.to_csv("../datasets/merged_steam_games_.csv", index=False)

In [68]:
print("shape: ",merged_steam_games.shape)
print("game 수",merged_steam_games['app_id'].unique().shape)
print("user 수",merged_steam_games['user_id'].unique().shape)

shape:  (46365, 59)
game 수 (74,)
user 수 (4935,)
