# Merging Different Datasets

## Preliminary

#### Import Modules

In [1]:
import pandas as pd

#### Set Options

In [2]:
# Notebook-wide pandas display settings: show up to 70 rows and 50 columns,
# and render floats with two decimal places.
display_settings = {
    'display.max_rows': 70,
    'display.max_columns': 50,
    'display.float_format': '{:.2f}'.format,
}
for option_key, option_value in display_settings.items():
    pd.set_option(option_key, option_value)

## Load Data

In [3]:
# Locations of the two raw CSV exports, relative to this notebook.
steamdb_csv_path = r'../../data/steamdb.csv'
game_data_csv_path = r'../../data/game_data_all.csv'

# low_memory=False makes pandas read the steamdb file in one pass instead of
# in chunks, so each column gets a single consistently inferred dtype.
df_steamdb = pd.read_csv(steamdb_csv_path, low_memory=False)
df_game_data = pd.read_csv(game_data_csv_path)

## Inspect Input Data

### Shape

In [4]:
# Report (rows, columns) of both input tables, one per line.
print(
    f'{df_steamdb.shape = }',
    f'{df_game_data.shape = }',
    sep='\n',
)

df_steamdb.shape = (53981, 46)
df_game_data.shape = (67571, 20)


### Columns

In [5]:
# Give the store-link column the same name ("steam_url") in both tables so it
# can serve as the shared merge key later on.
df_steamdb = df_steamdb.rename({'store_url': 'steam_url'}, axis='columns')
df_game_data = df_game_data.rename({'link': 'steam_url'}, axis='columns')

In [6]:
# List every column name of the "steamdb" table.
display(df_steamdb.columns.tolist())

['sid',
 'steam_url',
 'store_promo_url',
 'store_uscore',
 'published_store',
 'published_meta',
 'published_stsp',
 'published_hltb',
 'published_igdb',
 'image',
 'name',
 'description',
 'full_price',
 'current_price',
 'discount',
 'platforms',
 'developers',
 'publishers',
 'languages',
 'voiceovers',
 'categories',
 'genres',
 'tags',
 'achievements',
 'gfq_url',
 'gfq_difficulty',
 'gfq_difficulty_comment',
 'gfq_rating',
 'gfq_rating_comment',
 'gfq_length',
 'gfq_length_comment',
 'stsp_owners',
 'stsp_mdntime',
 'hltb_url',
 'hltb_single',
 'hltb_complete',
 'meta_url',
 'meta_score',
 'meta_uscore',
 'grnk_score',
 'igdb_url',
 'igdb_single',
 'igdb_complete',
 'igdb_score',
 'igdb_uscore',
 'igdb_popularity']

In [7]:
# List every column name of the "game_data" table.
display(df_game_data.columns.tolist())

['Unnamed: 0',
 'game',
 'steam_url',
 'release',
 'peak_players',
 'positive_reviews',
 'negative_reviews',
 'total_reviews',
 'rating',
 'primary_genre',
 'store_genres',
 'publisher',
 'developer',
 'detected_technologies',
 'store_asset_mod_time',
 'review_percentage',
 'players_right_now',
 '24_hour_peak',
 'all_time_peak',
 'all_time_peak_date']

### Missing Data

In [8]:
# Per-column non-null counts and dtypes show where "steamdb" has missing data.
df_steamdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53981 entries, 0 to 53980
Data columns (total 46 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   sid                     53981 non-null  int64  
 1   steam_url               53981 non-null  object 
 2   store_promo_url         7546 non-null   object 
 3   store_uscore            33462 non-null  float64
 4   published_store         53831 non-null  object 
 5   published_meta          33603 non-null  object 
 6   published_stsp          19616 non-null  object 
 7   published_hltb          25831 non-null  object 
 8   published_igdb          20965 non-null  object 
 9   image                   53981 non-null  object 
 10  name                    53981 non-null  object 
 11  description             53932 non-null  object 
 12  full_price              46817 non-null  float64
 13  current_price           46817 non-null  float64
 14  discount                6529 non-null 

In [9]:
# Per-column non-null counts and dtypes show where "game_data" has missing data.
df_game_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67571 entries, 0 to 67570
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             67571 non-null  int64  
 1   game                   67571 non-null  object 
 2   steam_url              67571 non-null  object 
 3   release                67571 non-null  object 
 4   peak_players           67571 non-null  int64  
 5   positive_reviews       67571 non-null  int64  
 6   negative_reviews       67571 non-null  int64  
 7   total_reviews          67571 non-null  int64  
 8   rating                 67571 non-null  float64
 9   primary_genre          67561 non-null  object 
 10  store_genres           67514 non-null  object 
 11  publisher              67110 non-null  object 
 12  developer              67443 non-null  object 
 13  detected_technologies  60265 non-null  object 
 14  store_asset_mod_time   67275 non-null  object 
 15  re

### Head

In [10]:
# Preview the first three "steamdb" rows, transposed so that each column is
# shown as its own row (easier to read for a wide table).
display(df_steamdb.iloc[:3].transpose())

Unnamed: 0,0,1,2
sid,10,20,30
steam_url,https://store.steampowered.com/app/10,https://store.steampowered.com/app/20,https://store.steampowered.com/app/30
store_promo_url,https://www.youtube.com/watch?v=oKC9SAF4JAc,,https://www.youtube.com/watch?v=j4MCo89bTWE
store_uscore,97.00,84.00,90.00
published_store,2000-11-01,1999-04-01,2003-05-01
published_meta,2000-11-08,1999-04-07,2003-05-06
published_stsp,2000-11-01,1999-04-01,2003-05-01
published_hltb,1999-06-12,1999-04-07,2000-08-04
published_igdb,1999-06-12,1999-04-07,2003-05-01
image,https://steamcdn-a.akamaihd.net/steam/apps/10/...,https://steamcdn-a.akamaihd.net/steam/apps/20/...,https://steamcdn-a.akamaihd.net/steam/apps/30/...


In [11]:
# Preview the first three "game_data" rows, transposed so that each column is
# shown as its own row.
display(df_game_data.iloc[:3].transpose())

Unnamed: 0,0,1,2
Unnamed: 0,0,1,2
game,Pizza Tower,Resident Evil 4,The Murder of Sonic the Hedgehog
steam_url,/app/2231450/,/app/2050650/,/app/2324650/
release,2023-01-26,2023-03-24,2023-03-31
peak_players,4529,168191,15543
positive_reviews,19807,61752,12643
negative_reviews,227,1616,213
total_reviews,20034,63368,12856
rating,96.39,95.75,95.54
primary_genre,Action (1),Action (1),Casual (4)


## Merge

### Adjust Columns "steam_url"

In [12]:
# Consistency check: does every "steam_url" entry of the "game_data" dataset
# start with the relative store path "/app/"?
game_data_has_app_prefix = df_game_data['steam_url'].str.startswith(r'/app/')
game_data_has_app_prefix.value_counts()

steam_url
True    67571
Name: count, dtype: int64

In [13]:
# Consistency check: does every "steam_url" entry of the "steamdb" dataset
# start with the absolute store URL prefix?
steamdb_has_store_prefix = df_steamdb['steam_url'].str.startswith(r'https://store.steampowered.com/app/')
steamdb_has_store_prefix.value_counts()

steam_url
True    53981
Name: count, dtype: int64

In [14]:
# Convert the relative store paths of "game_data" ("/app/<id>/") into absolute
# URLs without a trailing slash ("https://store.steampowered.com/app/<id>"),
# i.e. the same format the "steamdb" dataset uses for its "steam_url" column.
#
# The domain is inserted by replacing the *leading* slash only. Values that
# are already absolute have no leading slash and are left untouched, so this
# cell is idempotent: running it a second time no longer produces
# "https://store.steampowered.comhttps://..." keys that would silently break
# the merge on "steam_url".
df_game_data['steam_url'] = (
    df_game_data['steam_url']
    .str.replace(r'^/', 'https://store.steampowered.com/', regex=True)
    .str.rstrip('/')  # drop the trailing slash to match the "steamdb" URLs
)

In [15]:
# Are all entries in the "game_data" dataset column "steam_url" in a consistent format?
steam_store_prefix = r'https://store.steampowered.com/app/'
df_game_data['steam_url'].str.startswith(steam_store_prefix).value_counts()

steam_url
True    67571
Name: count, dtype: int64

In [16]:
# First three "steamdb" rows (transposed), for comparing its "steam_url"
# format against the adjusted "game_data" URLs.
display(df_steamdb.iloc[:3].transpose())

Unnamed: 0,0,1,2
sid,10,20,30
steam_url,https://store.steampowered.com/app/10,https://store.steampowered.com/app/20,https://store.steampowered.com/app/30
store_promo_url,https://www.youtube.com/watch?v=oKC9SAF4JAc,,https://www.youtube.com/watch?v=j4MCo89bTWE
store_uscore,97.00,84.00,90.00
published_store,2000-11-01,1999-04-01,2003-05-01
published_meta,2000-11-08,1999-04-07,2003-05-06
published_stsp,2000-11-01,1999-04-01,2003-05-01
published_hltb,1999-06-12,1999-04-07,2000-08-04
published_igdb,1999-06-12,1999-04-07,2003-05-01
image,https://steamcdn-a.akamaihd.net/steam/apps/10/...,https://steamcdn-a.akamaihd.net/steam/apps/20/...,https://steamcdn-a.akamaihd.net/steam/apps/30/...


In [17]:
# First three "game_data" rows (transposed), to confirm "steam_url" now holds
# absolute URLs.
display(df_game_data.iloc[:3].transpose())

Unnamed: 0,0,1,2
Unnamed: 0,0,1,2
game,Pizza Tower,Resident Evil 4,The Murder of Sonic the Hedgehog
steam_url,https://store.steampowered.com/app/2231450,https://store.steampowered.com/app/2050650,https://store.steampowered.com/app/2324650
release,2023-01-26,2023-03-24,2023-03-31
peak_players,4529,168191,15543
positive_reviews,19807,61752,12643
negative_reviews,227,1616,213
total_reviews,20034,63368,12856
rating,96.39,95.75,95.54
primary_genre,Action (1),Action (1),Casual (4)


### Merge "steamdb" and "game_data_all"

In [18]:
# Inner join on the shared "steam_url" key: only games whose URL occurs in
# both datasets are kept in the merged table.
df_merge1 = df_steamdb.merge(df_game_data, how='inner', on='steam_url')

#### Inspect Merged Data

In [19]:
# Report (rows, columns) of the merged table.
print(f'{df_merge1.shape = }')

df_merge1.shape = (52560, 65)


In [20]:
# Preview the first three merged rows, transposed so that each column is
# shown as its own row.
display(df_merge1.iloc[:3].transpose())

Unnamed: 0,0,1,2
sid,360,380,400
steam_url,https://store.steampowered.com/app/360,https://store.steampowered.com/app/380,https://store.steampowered.com/app/400
store_promo_url,,https://www.youtube.com/watch?v=DL_mPw7KEU8,https://www.youtube.com/watch?v=nA9ChSA6wV4
store_uscore,76.00,95.00,98.00
published_store,2006-05-01,2006-06-01,2007-10-10
published_meta,2006-05-01,2006-06-01,2007-10-10
published_stsp,2006-05-01,2006-06-01,2007-10-10
published_hltb,2005-07-02,2006-06-01,2007-10-09
published_igdb,2006-05-01,2006-06-01,2007-10-09
image,https://steamcdn-a.akamaihd.net/steam/apps/360...,https://steamcdn-a.akamaihd.net/steam/apps/380...,https://steamcdn-a.akamaihd.net/steam/apps/400...


In [21]:
# List every column name of the merged table.
display(df_merge1.columns.tolist())

['sid',
 'steam_url',
 'store_promo_url',
 'store_uscore',
 'published_store',
 'published_meta',
 'published_stsp',
 'published_hltb',
 'published_igdb',
 'image',
 'name',
 'description',
 'full_price',
 'current_price',
 'discount',
 'platforms',
 'developers',
 'publishers',
 'languages',
 'voiceovers',
 'categories',
 'genres',
 'tags',
 'achievements',
 'gfq_url',
 'gfq_difficulty',
 'gfq_difficulty_comment',
 'gfq_rating',
 'gfq_rating_comment',
 'gfq_length',
 'gfq_length_comment',
 'stsp_owners',
 'stsp_mdntime',
 'hltb_url',
 'hltb_single',
 'hltb_complete',
 'meta_url',
 'meta_score',
 'meta_uscore',
 'grnk_score',
 'igdb_url',
 'igdb_single',
 'igdb_complete',
 'igdb_score',
 'igdb_uscore',
 'igdb_popularity',
 'Unnamed: 0',
 'game',
 'release',
 'peak_players',
 'positive_reviews',
 'negative_reviews',
 'total_reviews',
 'rating',
 'primary_genre',
 'store_genres',
 'publisher',
 'developer',
 'detected_technologies',
 'store_asset_mod_time',
 'review_percentage',
 'player

In [22]:
# Per-column non-null counts and dtypes of the merged table.
df_merge1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52560 entries, 0 to 52559
Data columns (total 65 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   sid                     52560 non-null  int64  
 1   steam_url               52560 non-null  object 
 2   store_promo_url         7353 non-null   object 
 3   store_uscore            33350 non-null  float64
 4   published_store         52470 non-null  object 
 5   published_meta          32834 non-null  object 
 6   published_stsp          19209 non-null  object 
 7   published_hltb          25798 non-null  object 
 8   published_igdb          20666 non-null  object 
 9   image                   52560 non-null  object 
 10  name                    52560 non-null  object 
 11  description             52529 non-null  object 
 12  full_price              45460 non-null  float64
 13  current_price           45460 non-null  float64
 14  discount                6465 non-null 

In [23]:
# Show the first three merged rows once more, transposed for readability.
display(df_merge1.iloc[:3].transpose())

Unnamed: 0,0,1,2
sid,360,380,400
steam_url,https://store.steampowered.com/app/360,https://store.steampowered.com/app/380,https://store.steampowered.com/app/400
store_promo_url,,https://www.youtube.com/watch?v=DL_mPw7KEU8,https://www.youtube.com/watch?v=nA9ChSA6wV4
store_uscore,76.00,95.00,98.00
published_store,2006-05-01,2006-06-01,2007-10-10
published_meta,2006-05-01,2006-06-01,2007-10-10
published_stsp,2006-05-01,2006-06-01,2007-10-10
published_hltb,2005-07-02,2006-06-01,2007-10-09
published_igdb,2006-05-01,2006-06-01,2007-10-09
image,https://steamcdn-a.akamaihd.net/steam/apps/360...,https://steamcdn-a.akamaihd.net/steam/apps/380...,https://steamcdn-a.akamaihd.net/steam/apps/400...
