# Merge Sales with Steam Dataset

## Preliminary

### Import Modules

In [None]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher

import sys
sys.path.append('../../') # add path of project root directory to sys.path so that project modules from utilities.py can be imported

from utilities import URLMerge

### Set Options

In [None]:
# Notebook display settings: show up to 70 rows and 50 columns, and render
# floats with two decimal places.
pd.options.display.max_rows = 70
pd.options.display.max_columns = 50
pd.options.display.float_format = '{:.2f}'.format

### Functions

In [None]:
def StringCompare(str1: str, str2: str, ratio: float = None):
    """
    Measure how similar two strings are, optionally against a threshold.

    The similarity is ``SequenceMatcher(None, str1, str2).ratio()``, a float
    between 0.0 (nothing in common) and 1.0 (identical).

    Parameters:
    str1 (str): The first string to compare.
    str2 (str): The second string to compare.
    ratio (float, optional): The minimum similarity required for the strings
                             to be considered similar. If None (default), the
                             raw similarity is returned instead of a verdict.

    Returns:
    float: The similarity ratio between the strings, when ratio is None.
    bool: True if the similarity ratio is greater than or equal to ratio,
          False otherwise, when ratio is given.
    """
    similarity = SequenceMatcher(None, str1, str2).ratio()

    # Identity check: only the None singleton means "no threshold". An
    # equality test would defer to the argument's own __eq__, which breaks
    # for objects such as numpy arrays.
    if ratio is None:
        return similarity

    return similarity >= ratio

In [None]:
# No threshold given, so the raw similarity ratio is returned.
StringCompare('hello', 'henlo')

In [None]:
# With a threshold of 0.7 the call returns a bool: does the similarity reach 0.7?
StringCompare('hello', 'henlo', 0.7)

In [None]:
# Same comparison against the stricter threshold 0.9.
StringCompare('hello', 'henlo', 0.9)

## Load Data

In [None]:
# Read the raw inputs from the project's data directory.
df_steamdb = pd.read_json(r'../../data/steamdb.json')
df_game_data = pd.read_csv(r'../../data/game_data_all.csv')
# URLMerge is imported from the project's utilities module; presumably it joins
# the two frames on their URL columns ('store_url' and 'link') - confirm there.
df_steam = URLMerge(df_steamdb, 'store_url', df_game_data, 'link')
# Sales figures; judging by the file name this is a VGChartz export.
df_sales = pd.read_csv(r'../../data/vgchartz-2024.csv')

In [None]:
# Preview df_steam (the result of URLMerge).
df_steam

In [None]:
# Preview the sales dataframe.
df_sales

## Format Data

### Set Datetime Types

In [None]:
# Parse the date columns of both dataframes into pandas datetimes.
df_sales['release_date'] = pd.to_datetime(df_sales['release_date'])

for steam_date_column in (
    'release',
    'published_meta',
    'published_stsp',
    'published_hltb',
    'published_igdb',
):
    df_steam[steam_date_column] = pd.to_datetime(df_steam[steam_date_column])

## Inspect Data

### Sales Data

#### Console - Unique Values

In [None]:
# Distinct values found in the 'console' column.
df_sales['console'].unique()

#### Developers - Unique Values

In [None]:
# Distinct developers, converted to a plain list.
list(df_sales['developer'].unique())

#### Columns

In [None]:
# Column names of the sales dataframe.
list(df_sales.columns)

### Steam Data

#### Columns

In [None]:
# Column names of the Steam dataframe.
list(df_steam.columns)

## Filter Data

### Drop Columns

In [None]:
# df_steam.drop([], axis=1, inplace=True)

### Drop Missing Values

In [None]:
# Remove sales rows that have no total_sales value (in place); the shape is
# printed before and after so the number of dropped rows is visible.
print(df_sales.shape)
df_sales.dropna(subset=['total_sales'], inplace=True)
print(df_sales.shape)

### Subset by Console

In [None]:
print(df_sales.shape)
# Boolean mask of rows whose console is PC, OSX or Linux. isin() tests
# membership directly instead of combining separate equality masks.
mask_pc = df_sales['console'].isin(['PC', 'OSX', 'Linux'])
# The subset itself is currently disabled, so both prints report the same
# shape; uncomment the next line to keep only these rows.
#df_sales = df_sales[mask_pc]
print(df_sales.shape)

## Examples

### Example of a Multi-Platform Game

In [None]:
# All sales rows whose title is exactly 'Battlefield 3'.
df_sales[df_sales['title'] == 'Battlefield 3']

In [None]:
# All sales rows whose title is exactly 'Sea of Thieves'.
df_sales[df_sales['title'] == 'Sea of Thieves']

### PC Games

In [None]:
# Single row at position 1000 (iloc selects by position, not by index label).
df_sales.iloc[1000]

In [None]:
# Author's note: the exact title also exists in df_merge1 (presumably the Steam data, i.e. df_steam here - confirm).
df_sales[df_sales['title'] == 'Kero Blaster']

In [None]:
# Author's note: the exact title also exists in df_merge1 (presumably the Steam data, i.e. df_steam here - confirm).
df_sales[df_sales['title'] == 'Anno 1800']

In [None]:
# Author's note: the exact title does NOT exist in df_merge1 (presumably the Steam data, i.e. df_steam here - confirm).
df_sales[df_sales['title'] == 'Anno 2070']

## Matching

In [None]:
# Try to find the Steam entry that corresponds to one sales record.
# Candidates are the Steam rows with the same developer; the candidate whose
# name is most similar to the sales title is the best match. It is accepted
# only if the names are close enough and the release dates nearly agree.
lookup_index = 21220 # as example

lookup_name = df_sales.loc[lookup_index, 'title']
lookup_publisher = df_sales.loc[lookup_index, 'publisher']
lookup_developer = df_sales.loc[lookup_index, 'developer']
lookup_release_date = df_sales.loc[lookup_index, 'release_date']

# Defined up front so both names exist even when no candidate is found.
similarity_array = np.array([])
match_found = False

print('--------------------------------------------------------------------------------------------------------------')
print(f'{lookup_name = }')
print(f'{lookup_publisher = }')
print(f'{lookup_developer = }')  # the developer is what the candidates are filtered on
print(f'{lookup_release_date = }')
print('-------------------------------------------------------------------')

# Candidate Steam rows: same developer as the sales record.
df_merge1_lookup = df_steam[df_steam['developer'] == lookup_developer]

if df_merge1_lookup.empty:
    print('No matching developer found in df_steam')
    print('--------------------------------------------------------------------------------------------------------------')
else:
    # Name similarity between the sales title and every candidate, in row order.
    similarity_array = np.array([
        StringCompare(lookup_name, compare_name)
        for compare_name in df_merge1_lookup['name']
    ])

    best_match_index = similarity_array.argsort()[-1] # position of the highest similarity
    best_match_ratio = similarity_array[best_match_index] # the highest similarity itself
    # Absolute gap between the sales release date and the candidate's published_meta date.
    best_match_release_timedelta = abs(lookup_release_date - df_merge1_lookup.iloc[best_match_index]['published_meta'])

    print(f'{best_match_index = }')
    print(f'{best_match_ratio = }')
    print(f'{df_merge1_lookup.iloc[best_match_index]["name"] = }')
    print(f'{df_merge1_lookup.iloc[best_match_index]["published_meta"] = }')
    print(f'{best_match_release_timedelta.days = }')
    print('-------------------------------------------------------------------')

    # Accept only a close name (> 0.8) released within 10 days of the sales
    # record; bool() keeps the flag a plain Python bool.
    match_found = bool(best_match_ratio > 0.8 and best_match_release_timedelta.days < 10)

    print(f'{match_found = }')