# Merge Sales with Steam Dataset

## Preliminary

### Import Modules

In [1]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher

### Set Options

In [2]:
# Notebook display settings (attribute form of pd.set_option)
pd.options.display.max_rows = 70 # show up to 70 rows before truncating
pd.options.display.max_columns = 50 # show up to 50 columns before truncating
pd.options.display.float_format = '{:.2f}'.format # render floats with two decimals

In [3]:
def similar(str1, str2):
    """Return the similarity ratio of two titles as a float in [0, 1].

    Thin wrapper around ``difflib.SequenceMatcher(None, str1, str2).ratio()``:
    1.0 means the two sequences are identical, 0.0 means they share nothing.

    Missing values (``None`` or a float NaN, e.g. an empty text cell in a
    DataFrame) have no title to compare, so they score 0.0 instead of
    raising ``TypeError`` inside ``SequenceMatcher``.
    """
    for value in (str1, str2):
        # NaN is the only float that does not compare equal to itself
        if value is None or (isinstance(value, float) and value != value):
            return 0.0
    return SequenceMatcher(None, str1, str2).ratio()

## Load Data

In [4]:
# df_merge1: DataFrame restored from a pickle file
# (presumably the Steam-side result of an earlier merge step -- TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load pickles produced by this project.
df_merge1 = pd.read_pickle(r'../../data/df_merge1.pkl')
# df_sales: VGChartz sales table read from CSV (one row per title and console, judging by the columns shown below)
df_sales = pd.read_csv(r'../../data/vgchartz-2024.csv')

## Format Data

In [5]:
# Parse the date columns of both frames into pandas datetimes
df_sales['release_date'] = pd.to_datetime(df_sales['release_date'])
for date_column in ('release', 'published_meta'):
    df_merge1[date_column] = pd.to_datetime(df_merge1[date_column])

# Keep only the sales rows whose console code is exactly 'PC'
is_pc_row = df_sales['console'].eq('PC')
df_sales_pc = df_sales.loc[is_pc_row]

In [24]:
# (rows, columns) of the PC-only subset
df_sales_pc.shape

(12617, 14)

In [31]:
# distinct console codes in the full sales table; df_sales_pc keeps only the rows equal to 'PC'
df_sales['console'].unique()

array(['PS3', 'PS4', 'PS2', 'X360', 'XOne', 'PC', 'PSP', 'Wii', 'PS',
       'DS', '2600', 'GBA', 'NES', 'XB', 'PSN', 'GEN', 'PSV', 'DC', 'N64',
       'SAT', 'SNES', 'GBC', 'GC', 'NS', '3DS', 'GB', 'WiiU', 'WS', 'VC',
       'NG', 'WW', 'SCD', 'PCE', 'XBL', '3DO', 'GG', 'OSX', 'Mob', 'PCFX',
       'Series', 'All', 'iOS', '5200', 'And', 'DSiW', 'Lynx', 'Linux',
       'MS', 'ZXS', 'ACPC', 'Amig', '7800', 'DSi', 'AJ', 'WinP', 'iQue',
       'GIZ', 'VB', 'Ouya', 'NGage', 'AST', 'MSD', 'S32X', 'XS', 'PS5',
       'Int', 'CV', 'Arc', 'C64', 'FDS', 'MSX', 'OR', 'C128', 'CDi',
       'CD32', 'BRW', 'FMT', 'ApII', 'Aco', 'BBCM', 'TG16'], dtype=object)

## Examples

In [6]:
# positional row 2103 of the PC subset; its index label (21220, see output) is the lookup_index used under "Matching"
df_sales_pc.iloc[2103]

img             /games/boxart/default.jpg
title                        Kero Blaster
console                                PC
genre                              Action
publisher                    Studio Pixel
developer                    Studio Pixel
critic_score                          NaN
total_sales                           NaN
na_sales                              NaN
jp_sales                              NaN
pal_sales                             NaN
other_sales                           NaN
release_date          2014-05-11 00:00:00
last_update                    2018-08-20
Name: 21220, dtype: object

In [7]:
# PC sales rows titled exactly 'Kero Blaster' (author's note: this exact string is in df_merge1)
df_sales_pc[df_sales_pc['title'] == 'Kero Blaster']

Unnamed: 0,img,title,console,genre,publisher,developer,critic_score,total_sales,na_sales,jp_sales,pal_sales,other_sales,release_date,last_update
21220,/games/boxart/default.jpg,Kero Blaster,PC,Action,Studio Pixel,Studio Pixel,,,,,,,2014-05-11,2018-08-20


In [8]:
# PC sales rows titled exactly 'Anno 1800' (author's note: this exact string is in df_merge1)
df_sales_pc[df_sales_pc['title'] == 'Anno 1800']

Unnamed: 0,img,title,console,genre,publisher,developer,critic_score,total_sales,na_sales,jp_sales,pal_sales,other_sales,release_date,last_update
63330,/games/boxart/full_4963434AmericaFrontccc.jpg,Anno 1800,PC,Strategy,Ubisoft,Blue Byte,,,,,,,2019-04-16,2018-08-12


In [9]:
# PC sales rows titled exactly 'Anno 2070' (author's note: this exact string is NOT in df_merge1)
df_sales_pc[df_sales_pc['title'] == 'Anno 2070']

Unnamed: 0,img,title,console,genre,publisher,developer,critic_score,total_sales,na_sales,jp_sales,pal_sales,other_sales,release_date,last_update
918,/games/boxart/full_anno-2070_524PALFront.jpg,Anno 2070,PC,Strategy,Ubisoft,Blue Byte Studio,8.7,1.4,,,1.14,0.26,2011-11-17,


## Matching

In [30]:
# Try to find the df_merge1 entry that corresponds to one df_sales_pc record:
#   1. pre-filter df_merge1 to rows sharing the record's value in `match_column`,
#   2. rank those candidates by title similarity,
#   3. accept the best candidate only if the titles are close enough and the
#      two dates lie within a few days of each other.
lookup_index = 21220 # as example

lookup_name = df_sales_pc.loc[lookup_index, 'title']
lookup_publisher = df_sales_pc.loc[lookup_index, 'publisher']
lookup_developer = df_sales_pc.loc[lookup_index, 'developer']
lookup_release_date = df_sales_pc.loc[lookup_index, 'release_date']

# Column (present in both frames) used to pre-filter the candidates.
# The original kept a commented-out variant that filtered on 'publisher' instead.
match_column = 'developer'
lookup_key = df_sales_pc.loc[lookup_index, match_column]

# Acceptance thresholds for a match
min_name_similarity = 0.8 # best title similarity must be strictly above this
max_release_gap_days = 10 # dates must differ by fewer than this many days

# Default to "no match" so match_found is defined even when there are no candidates
match_found = False

print('--------------------------------------------------------------------------------------------------------------')
print(f'{lookup_name = }')
print(f'{lookup_publisher = }')
print(f'{lookup_release_date = }')
print('-------------------------------------------------------------------')

df_merge1_lookup = df_merge1[df_merge1[match_column] == lookup_key]

# Title similarity per candidate, in candidate row order.
# Built in one pass instead of np.append in a loop, which copies the array on every call.
similarity_array = np.array(
    [similar(lookup_name, compare_name) for compare_name in df_merge1_lookup['name']],
    dtype=float,
)

if df_merge1_lookup.empty:
    print(f'No matching {match_column} found in df_merge1')
    print('--------------------------------------------------------------------------------------------------------------')
else:
    # Position (not index label) of the most similar candidate; ties resolve to the first one
    best_match_index = int(similarity_array.argmax())
    best_match_ratio = similarity_array[best_match_index] # get ratio of highest similarity
    best_match_release_timedelta = abs(lookup_release_date - df_merge1_lookup.iloc[best_match_index]['published_meta'])

    print(f'{best_match_index = }')
    print(f'{best_match_ratio = }')
    print(f'{df_merge1_lookup.iloc[best_match_index]["name"] = }')
    print(f'{df_merge1_lookup.iloc[best_match_index]["published_meta"] = }')
    print(f'{best_match_release_timedelta.days = }')
    print('-------------------------------------------------------------------')

    # bool(...) keeps match_found a plain Python bool rather than a numpy bool
    match_found = bool(
        best_match_ratio > min_name_similarity
        and best_match_release_timedelta.days < max_release_gap_days
    )

    print(f'{match_found = }')

--------------------------------------------------------------------------------------------------------------
lookup_name = 'Kero Blaster'
lookup_publisher = 'Studio Pixel'
lookup_release_date = Timestamp('2014-05-11 00:00:00')
-------------------------------------------------------------------
best_match_index = 0
best_match_ratio = 1.0
df_merge1_lookup.iloc[best_match_index]["name"] = 'Kero Blaster'
df_merge1_lookup.iloc[best_match_index]["published_meta"] = Timestamp('2014-05-11 00:00:00')
best_match_release_timedelta.days = 0
-------------------------------------------------------------------
match_found = True
