In [5]:
import asyncio
import logging
import os

import aiohttp
import pandas as pd
from tqdm.asyncio import tqdm

# Read the games table and keep only rows whose 'pct_pos_total' is greater than -1.
# NOTE(review): -1 presumably marks games without any review data — confirm against the dataset.
steam = pd.read_csv('../data/steam_games.csv').loc[lambda games: games['pct_pos_total'] > -1]

# --- CHANGES START HERE ---

async def fetch_reviews_for_app(session, app_id, name):
    """Fetch one page of recent English reviews for a single Steam app.

    The fetch is best-effort: any failure is logged as a warning and turned
    into an empty result so one bad app does not abort a bulk run.

    Args:
        session: Open HTTP session (an ``aiohttp.ClientSession`` in this
            file) whose ``get`` is used to issue the request.
        app_id: Steam application id, interpolated into the request URL.
        name: Game title, copied unchanged into every returned row.

    Returns:
        A list of dicts with the keys ``appid``, ``name``, ``review_text``
        and ``recommended``, one per review in the response. The list is
        empty when the payload does not report ``success == 1``, has no
        ``reviews`` key, or the request/parsing failed.

    Raises:
        asyncio.CancelledError: Propagated unchanged so task cancellation
            keeps working.
    """
    url = f"https://store.steampowered.com/appreviews/{app_id}?json=1&filter=recent&language=english"
    try:
        async with session.get(url) as response:
            response.raise_for_status()
            data = await response.json()
            if data.get("success") != 1 or "reviews" not in data:
                return []
            return [
                {
                    "appid": app_id,
                    "name": name,
                    "review_text": review_data.get("review", ""),
                    "recommended": review_data.get("voted_up", False),
                }
                for review_data in data["reviews"]
            ]
    except asyncio.CancelledError:
        # Before Python 3.8 CancelledError derives from Exception; never
        # swallow it, or cancelling the gather would be silently ignored.
        raise
    except Exception as exc:
        # Deliberately broad: this is the per-app boundary of a best-effort
        # bulk fetch. Log the failure so it is distinguishable from a game
        # that simply has no reviews.
        logging.getLogger(__name__).warning(
            "Skipping reviews for app %s (%s): %r", app_id, name, exc
        )
        return []

async def fetch_all_reviews(apps_to_fetch, max_concurrency=20):
    """Fetch review rows for many apps with a bounded number of in-flight requests.

    Args:
        apps_to_fetch: Sized sequence of ``(app_id, name)`` pairs.
        max_concurrency: Upper bound on requests running at the same time.
            Defaults to 20; must be at least 1.

    Returns:
        A single flat list containing the review dicts produced by
        ``fetch_reviews_for_app`` for every app.

    Raises:
        ValueError: If ``max_concurrency`` is smaller than 1.
    """
    if max_concurrency < 1:
        raise ValueError("max_concurrency must be at least 1")

    print(f"Starting to fetch reviews for {len(apps_to_fetch)} games...")

    # Without a bound, every request is issued immediately and then queues on
    # the session's connection pool. In aiohttp the request timeout is
    # already running while a request waits for a free connection, so late
    # requests can time out before they are ever sent. The semaphore makes
    # a request start only when a slot is free.
    limiter = asyncio.Semaphore(max_concurrency)

    async def fetch_with_limit(session, app_id, name):
        async with limiter:
            return await fetch_reviews_for_app(session, app_id, name)

    async with aiohttp.ClientSession() as session:
        tasks = [fetch_with_limit(session, app_id, name) for app_id, name in apps_to_fetch]

        # One list of review dicts per app.
        results_list_of_lists = await tqdm.gather(*tasks, desc="Fetching reviews")

        print("Finished fetching all reviews. Flattening results...")

        # Flatten the per-app lists into a single list of review dictionaries.
        return [review for sublist in results_list_of_lists for review in sublist]

# 1. Create a new DataFrame sorted by 'name' and then 'appid'
sorted_steam_df = steam.sort_values(by=['name', 'appid']).copy()

# 2. Get a list of (appid, name) tuples to process
apps_to_fetch = list(sorted_steam_df[['appid', 'name']].itertuples(index=False, name=None))
print(f"Processing the full set of {len(sorted_steam_df)} games.")

# 3. Run the asynchronous process to fetch the review content for all games
all_reviews_data = await fetch_all_reviews(apps_to_fetch)

# 4. Create the final DataFrame from the list of all fetched reviews
final_df = pd.DataFrame(all_reviews_data)

# 5. Define the directory and ensure it exists
output_dir = '../data'
os.makedirs(output_dir, exist_ok=True)

# 6. Define the full file path for the new reviews file
file_path = os.path.join(output_dir, 'steam_review_content.csv')

# 7. Export the final DataFrame to the new CSV file
final_df.to_csv(file_path, index=False)

# --- CHANGES END HERE ---

print(f"Final DataFrame with review content successfully saved to {file_path}")

# You can display the head of the new DataFrame to verify
print(f"\n--- First 5 rows of the new final ({len(final_df)} total reviews) DataFrame ---")
print(final_df.head())

Processing the full set of 53199 games.
Starting to fetch reviews for 53199 games...


Fetching reviews:   1%|          | 401/53199 [01:46<3:53:42,  3.77it/s] 


CancelledError: 