In [96]:
import pandas as pd
import string
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from tqdm.asyncio import tqdm
import aiohttp
import os



In [97]:
def clean_text(s) :
    new_str = s.lower()
    new_str = new_str.translate(str.maketrans('', '', string.punctuation))
    new_str = new_str.translate(str.maketrans('', '', string.digits))
    return new_str.strip()

def clean_names(df) :
    def clean_name(row):
        return clean_text(row['name'])
    new_df = df.copy()
    new_df['name'] = new_df.apply(clean_name,axis=1)
    return new_df
    

def clean_description(df):
    def clean_desc(row):
        desc = row['short_description']
        if isinstance(desc, str):
            return clean_text(desc)
        else:
            return ''
    def clean_detailed_desc(row):
        desc = row['detailed_description']
        if isinstance(desc, str):
            return clean_text(desc)
        else:
            return ''
    new_df = df.copy()
    new_df['short_description'] = new_df.apply(clean_desc, axis=1)
    new_df['detailed_description'] = new_df.apply(clean_detailed_desc, axis=1)
    return new_df
def parse_and_clean_tags(tag_string):
    """
    Parses a comma-separated string of tags and cleans each one.
    """
    if not isinstance(tag_string, str):
        return []
    
    # Split the string into individual tags by the comma
    tags = tag_string.split(',')
    
    cleaned_tags = []
    for tag in tags:
        # Now clean each individual tag
        tag = tag.lower()
        tag = tag.translate(str.maketrans('', '', string.punctuation + string.digits))
        tag = tag.strip()
        if tag: # Make sure the tag isn't empty
            cleaned_tags.append(tag)
            
    return cleaned_tags


In [None]:

steam = pd.read_csv("../data/updated_steam_games.csv")
model = SentenceTransformer('all-MiniLM-L6-v2')
steam = clean_names(steam)
steam = clean_description(steam)

#steam = steam.head(5000)
steam = steam[steam['pct_pos_total'] > -1]

all_tags = set()
for tag_string in steam['tags'].dropna():
    tags = parse_and_clean_tags(tag_string)
    all_tags.update(tags)

unique_tags_list = list(all_tags)


steam['combined_text'] = steam['short_description'] + ' ' + steam['tags'] + ' ' + steam['detailed_description']

transformed_output = model.encode(steam['combined_text'].tolist(), show_progress_bar=True)

tag_embeddings = model.encode(unique_tags_list, show_progress_bar=True)







Batches:  36%|███▌      | 602/1663 [06:07<09:46,  1.81it/s]

In [None]:
vectorizer = TfidfVectorizer()

# 2. Fit it on your reference corpus (all the tags) and transform them.
# This is now your reference matrix of embeddings.
tag_embeddings = vectorizer.fit_transform(unique_tags_list)

In [None]:
new_desc = "auto battler"
new_desc = clean_text(new_desc)
new_vector = model.encode([new_desc])



similarity_scores = cosine_similarity(new_vector,transformed_output)

top_5_indices = np.argsort(similarity_scores[0])[-5:][::-1]


# 7. Print the Top 5 Games and their Scores
print("Top 5 closest games:")
for i in top_5_indices:
  game_name = steam.iloc[i]['name']
  score = similarity_scores[0][i]
  print(f"- {game_name}: {score:.4f}")
  
closest_match_similarity = np.max(similarity_scores[0])
uniqueness_score = 1 - closest_match_similarity
print("Unique score:", uniqueness_score)

Top 5 closest games:
- autoheroes: 0.6261
- auto chess: 0.6187
- behold battle: 0.6037
- tiny battles: 0.6037
- auto riskrisk: 0.5962
Unique score: 0.37391865


In [None]:
def get_tags_for_description(description, tags_list, tag_embeddings, vectorizer, top_n=5):
    """
    Finds the most relevant tags for a given description.
    The vectorizer should already be fitted on the tag corpus.
    """
    # 1. Use the PRE-FITTED vectorizer to transform the new description.
    #    Note: We use .transform(), NOT .fit_transform()
    description_embedding = vectorizer.transform([description])

    # 2. Calculate cosine similarity between the description and all tags
    sim_scores = cosine_similarity(description_embedding, tag_embeddings)

    # 3. Get the top N scores and their indices
    top_indices = np.argsort(sim_scores[0])[-top_n:][::-1]

    # 4. Get the corresponding tags and their scores
    top_tags = [(tags_list[i], sim_scores[0][i]) for i in top_indices]
    
    return top_tags

# 3. Call the corrected function, passing in the fitted vectorizer
predicted_tags = get_tags_for_description(
    new_desc, 
    unique_tags_list, 
    tag_embeddings=tag_embeddings, 
    vectorizer=vectorizer, # Pass the fitted object in!
    top_n=3
)

print("Most similar tags for the description:")
for tag, score in predicted_tags:
    print(f"- {tag}: (Score: {score:.4f})")

Most similar tags for the description:
- auto battler: (Score: 1.0000)
- card battler: (Score: 0.4951)
- fps: (Score: 0.0000)


In [None]:
async def fetch_reviews_for_app(session, app_id, name):
    """Asynchronously fetches a page of reviews for a single app_id."""
    url = f"https://store.steampowered.com/appreviews/{app_id}?json=1&filter=recent&language=english"
    reviews_list = []
    try:
        async with session.get(url) as response:
            response.raise_for_status()
            data = await response.json()
            if data.get("success") == 1 and "reviews" in data:
                for review_data in data["reviews"]:
                    reviews_list.append({
                        "appid": app_id,
                        "name": name,
                        "review_text": review_data.get("review", ""),
                        "recommended": review_data.get("voted_up", False)
                    })
            return reviews_list
    except Exception:
        # Return an empty list on error to avoid breaking the process
        return []
async def fetch_all_reviews(apps_to_fetch):
    """Creates and runs all asynchronous tasks for fetching review content."""
    print(f"Starting to fetch reviews for {len(apps_to_fetch)} games...")
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_reviews_for_app(session, app_id, name) for app_id, name in apps_to_fetch]
        
        # This will return a list of lists
        results_list_of_lists = await tqdm.gather(*tasks, desc="Fetching reviews")
        
        print("Finished fetching all reviews. Flattening results...")
        
        # Flatten the list of lists into a single list of review dictionaries
        all_reviews = [review for sublist in results_list_of_lists for review in sublist]
        return all_reviews


games_to_fetch = []
print("Queueing up the 5 most similar games for review fetching:")

for i in top_5_indices:
  # Make sure 'appid' and 'name' are the correct column names in your 'steam' DataFrame
  game_name = steam.iloc[i]['name']
  app_id = steam.iloc[i]['appid'] 
  
  score = similarity_scores[0][i]
  print(f"- {game_name} (ID: {app_id}), Score: {score:.4f}")
  
  games_to_fetch.append((app_id, game_name))

# 2. Call your existing function to fetch the reviews for just those 5 games
#    NOTE: This must be run in an async context (e.g., Jupyter or an async function)
top_5_reviews_data = await fetch_all_reviews(games_to_fetch)

# 3. Create a new DataFrame from the results
reviews_df = pd.DataFrame(top_5_reviews_data)

# 4. Display the results
print(f"\nSuccessfully fetched {len(reviews_df)} reviews for the top 5 games.")
print(reviews_df.head())

Queueing up the 5 most similar games for review fetching:
- autoheroes (ID: 2403940), Score: 0.6261
- auto chess (ID: 1530300), Score: 0.6187
- behold battle (ID: 2338910), Score: 0.6037
- tiny battles (ID: 2759230), Score: 0.6037
- auto riskrisk (ID: 2259990), Score: 0.5962
Starting to fetch reviews for 5 games...


NameError: name 'aiohttp' is not defined