In [None]:
# Imports
from ast import literal_eval
from os import chdir, listdir
from os.path import exists

import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask.distributed import LocalCluster
from IPython.display import clear_output

In [None]:
# Instantiates a local dask cluster and connects a client to it.
# NOTE(review): LocalCluster() is called with no arguments, so worker count and
#   memory limits are left to dask's defaults for this machine.
cluster = LocalCluster()
client = cluster.get_client()
# Bare last expression so the notebook displays the client's repr.
client

In [None]:
# Moves up two directories, to the project base directory.
# Guarded so that re-running this cell does not keep climbing the directory tree:
#   if the reviews folder is already reachable from the working directory,
#   we are at the project base already and must not move again.
if not exists("data/reviews"):
    chdir("../..")

# Gets all folders in the reviews subfolder.
#   (Forward slashes are accepted on Windows as well as macOS/Linux.)
folders = listdir("data/reviews/")

In [None]:
# Collects paths to review JSONs. We only need one JSON from each folder,
#   so we use the first one and skip folders that do not contain it.
# Paths are built with forward slashes so the lookup works on any OS
#   (a backslash is an ordinary file-name character outside Windows).
reviews_dir = "data/reviews"
first_json_name = "00000000.json"

paths = [
    candidate
    for candidate in (
        f"{reviews_dir}/{folder}/{first_json_name}" for folder in listdir(reviews_dir)
    )
    if exists(candidate)
]

In [None]:
def extract_recommendations(row: pd.Series) -> pd.DataFrame:
    """Builds a one-row pandas DataFrame of recommendation stats for a single game.

    Args:
        row (pd.Series): One record of the ingested review JSONs. Must provide
            "query_summary" (the recommendation summary, either as a dict or as
            the string repr of one) and "path" (the game's Steam app id).

    Returns:
        pd.DataFrame: Single-row DataFrame indexed by "steam_appid", with one
            column per key of the summary.

    Raises:
        KeyError: If one of the expected integer columns is missing from the summary.
        ValueError: If the summary string is not a valid Python literal, or an
            integer column holds a value that cannot be converted to int.
    """

    # The summary may arrive as the string repr of a dict or as an already-parsed
    #   dict; only the string form needs parsing (literal_eval rejects a dict).
    summary = row["query_summary"]
    if isinstance(summary, str):
        summary = literal_eval(summary)

    # One key per index entry, then transposed so that one record = one game.
    df_recommendations = pd.DataFrame.from_dict(summary, orient="index").transpose()

    # Adds appid to review DataFrame
    df_recommendations["steam_appid"] = row["path"]

    # The transpose leaves mixed-type columns, so the numeric ones are cast explicitly.
    int_cols = [
        "num_reviews",
        "review_score",
        "total_positive",
        "total_negative",
        "total_reviews",
        "steam_appid",
    ]
    df_recommendations = df_recommendations.astype({col: int for col in int_cols})

    # Sets index to app id column
    return df_recommendations.set_index("steam_appid")

In [None]:
def ingest_recommendations(
    batch_paths: list, batch_index: int, batch_count: int, meta: pd.DataFrame
) -> dd.DataFrame:
    """Reads one batch of review JSONs and gathers their recommendation summaries.

    Args:
        batch_paths (list): Paths to the review JSONs making up this batch
            (some subset of all review JSONs).
        batch_index (int): Zero-based position of this batch; only used in the progress message.
        batch_count (int): Total number of batches; only used in the progress message.
        meta (pd.DataFrame): Template DataFrame handed to dask as `meta` for the row-wise apply.

    Returns:
        dd.DataFrame: Concatenation of the per-game recommendation summaries of this batch.
    """

    # Lazily loads the batch, keeping each record's source file in a "path" column
    raw_jsons = dd.read_json(batch_paths, include_path_column=True)

    # The review download function uses steam app ids as folder names, so the app id
    #   is recovered from the path: drop the trailing 14 characters (the file-name part),
    #   then strip everything up to and including the last "/".
    path_as_text = raw_jsons["path"].astype(str)
    containing_folder = path_as_text.str[:-14]
    raw_jsons["path"] = containing_folder.replace(".+/", "", regex=True)

    # Runs extract_recommendations on every record (one summary DataFrame per record),
    #   using the template so dask knows what to expect
    per_game_summaries = raw_jsons.apply(extract_recommendations, axis=1, meta=meta)

    # Materializes the per-record summaries as a plain list, then stitches them together
    summary_frames = per_game_summaries.compute().tolist()
    combined = dd.concat(summary_frames)

    # Progress reporting.
    clear_output()
    print(f"Batch {batch_index+1}/{batch_count} processed!")

    return combined

In [None]:
# Instantiates some variables for batch processing step

# Imports template DataFrame so dask knows how to interpret things
#   (forward slash so the path also resolves outside Windows)
meta = pd.read_parquet("dask_templates/recommendations_meta.parquet")

# Sets batch size. 2^13 works pretty well with 64 GB of RAM;
#   You'll probably want to experiment if you have less.
#   I'd (naively) recommend 2^12 for 32 GB RAM, 2^11 for 16 GB RAM, etc.
batch_size = 2**13

# Calculates batch count for better progress reporting.
#   Floor division, so individual batches can end up somewhat larger than batch_size.
#   Clamped to at least 1: with fewer paths than batch_size there is still one batch,
#   and the progress message must not read "1/0".
batch_count = max(1, len(paths) // batch_size)

In [None]:
# Batch processing step

# Number of batches to split the paths into. Always a whole number and at least 1:
#   np.array_split truncates a float section count with int() and raises ValueError
#   when the result is 0, which happens whenever there are fewer paths than batch_size.
n_batches = max(1, len(paths) // batch_size)

# Gets list of dask DataFrames of recommendation summaries
#   (I'm using a list comprehension to avoid having to name a bunch of DataFrames)
list_recommendation_ddfs = [
    ingest_recommendations(paths_subset.tolist(), batch_index, n_batches, meta)
    for batch_index, paths_subset in enumerate(np.array_split(paths, n_batches))
]

In [None]:
# Stacks the per-batch dask DataFrames of recommendation summaries into a single
#   dask DataFrame, then cleans it by replacing nulls with zero.
ddf_recommendations = dd.concat(list_recommendation_ddfs).fillna(0)

# Progress print
clear_output()
print("Recommendation ingest finished!\nWriting data to parquet...")

In [None]:
# Converts dask DataFrame to pandas DataFrame so that parquet is written as file (not folder)
df_recommendations = ddf_recommendations.compute()

# Writes parquet file into the data folder.
#   Forward slash on purpose: with a backslash, non-Windows systems would create a file
#   literally named "data\recommendations.parquet" in the working directory instead.
df_recommendations.to_parquet("data/recommendations.parquet")

# Progress update
clear_output()
print("Parquet writing finished!")

# Terminates dask client
client.shutdown()
print("Cluster shutdown.")