# Steam Games Success Prediction

Analyzing Steam games data to predict success based on pre-launch features.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import joblib
from datetime import datetime

## 2. Load Data

In [None]:
# Read the raw games table from the working directory and preview the first rows
games_csv_path = './games.csv'
df = pd.read_csv(games_csv_path)
df.head()

## 3. Fix Column Names

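The CSV header is misaligned: `DiscountDLC count` appears as a single column name but should be two separate columns, `Discount` and `DLC count`, so the column names are reassigned explicitly below.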

In [None]:
# Full replacement header, in file order. 'Discount' and 'DLC count' are
# listed as two separate names.
corrected_columns = [
    # Core listing info
    'Name', 'Release date', 'Estimated owners', 'Peak CCU', 'Required age',
    'Price', 'Discount', 'DLC count', 'About the game',
    # Languages, reviews and contact details
    'Supported languages', 'Full audio languages', 'Reviews',
    'Header image', 'Website', 'Support url', 'Support email',
    # Platforms
    'Windows', 'Mac', 'Linux',
    # Scores and feedback
    'Metacritic score', 'Metacritic url', 'User score',
    'Positive', 'Negative', 'Score rank', 'Achievements', 'Recommendations',
    'Notes',
    # Playtime
    'Average playtime forever', 'Average playtime two weeks',
    'Median playtime forever', 'Median playtime two weeks',
    # Attribution, taxonomy and media
    'Developers', 'Publishers', 'Categories', 'Genres', 'Tags',
    'Screenshots', 'Movies',
]
df.columns = corrected_columns

## 4. Data Cleaning

In [None]:
# Remove games with 0 owners (playtests, removed games, old data).
# The explicit .copy() makes the filtered frame an independent object, so the
# column assignments made on `df` in the following cells do not trigger
# pandas' SettingWithCopyWarning (chained-assignment on a slice).
df = df[df['Estimated owners'] != '0 - 0'].copy()
print(f"Dataset: {len(df):,} games")

In [None]:
# Parse the release date (values that cannot be parsed become NaT because of
# errors='coerce'), then derive the calendar year from the parsed dates.
parsed_release_dates = pd.to_datetime(df['Release date'], errors='coerce')
df['Release date'] = parsed_release_dates
df['Release year'] = parsed_release_dates.dt.year

In [None]:
# Convert estimated owners to numeric (using midpoint)
def convert_owners(owner_range):
    """Convert an owner range such as ``'20,000 - 50,000'`` to its midpoint.

    Args:
        owner_range: Range text of the form ``'<low> - <high>'``. Commas
            (thousands separators) are removed before parsing.

    Returns:
        The midpoint of the two bounds as a float, or ``np.nan`` when the
        value is missing, is not a string, does not split into exactly two
        parts on ``' - '``, or has a bound that is not numeric.
    """
    if not isinstance(owner_range, str):
        # Covers NaN/None as well as stray non-text cells, which have no
        # ``.replace`` method and would otherwise raise AttributeError.
        return np.nan
    parts = owner_range.replace(',', '').split(' - ')
    if len(parts) != 2:
        return np.nan
    try:
        low, high = float(parts[0]), float(parts[1])
    except ValueError:
        # A non-numeric bound is treated like any other malformed range
        # instead of aborting the whole column conversion.
        return np.nan
    return (low + high) / 2

# Midpoint of every owner range, stored next to the original text column
owner_ranges = df['Estimated owners']
df['Estimated owners numeric'] = owner_ranges.apply(convert_owners)

In [None]:
# Coerce Price to a numeric dtype; values that cannot be parsed become NaN
raw_price = df['Price']
df['Price'] = pd.to_numeric(raw_price, errors='coerce')

In [None]:
# Create support flags: 1 when the source field is present, 0 when it is missing
support_flag_sources = {
    'has_website': 'Website',
    'has_support_url': 'Support url',
    'has_support_email': 'Support email',
}
for flag_name, source_column in support_flag_sources.items():
    df[flag_name] = df[source_column].notna().astype(int)

## 5. Create Success Score

Multi-dimensional success metric combining:
- Quality (40%): Positive review ratio
- Reach (30%): Log-scaled, min-max normalized owners
- Engagement (20%): Normalized playtime
- Recency (10%): Normalized release year

In [None]:
scaler = MinMaxScaler()

# Quality: positive ratio. The +1 keeps the denominator non-zero when both
# review counts are 0.
review_total = df['Positive'] + df['Negative']
df['positive_ratio'] = df['Positive'] / (review_total + 1)

# Reach, engagement and recency: each is min-max scaled over its non-missing
# rows only, refitting the scaler every time. Owners are log1p-transformed
# first; playtime and release year are scaled as-is.
normalization_plan = [
    ('Estimated owners numeric', 'estimated_owners_norm', np.log1p),
    ('Average playtime forever', 'playtime_norm', None),
    ('Release year', 'release_year_norm', None),
]
for source_col, target_col, pre_transform in normalization_plan:
    mask = df[source_col].notna()
    present_values = df.loc[mask, source_col]
    if pre_transform is not None:
        present_values = pre_transform(present_values)
    column_vector = present_values.values.reshape(-1, 1)
    df.loc[mask, target_col] = scaler.fit_transform(column_vector).flatten()

# Combined score: computed only for rows where all four components exist
required_cols = ['positive_ratio', 'estimated_owners_norm', 'playtime_norm', 'release_year_norm']
complete_mask = df[required_cols].notna().all(axis=1)
complete_rows = df.loc[complete_mask, required_cols]

df.loc[complete_mask, 'success_score'] = (
    0.4 * complete_rows['positive_ratio']
    + 0.3 * complete_rows['estimated_owners_norm']
    + 0.2 * complete_rows['playtime_norm']
    + 0.1 * complete_rows['release_year_norm']
)

print(df['success_score'].describe())

In [None]:
# Visualize distribution: histogram with mean/median markers (left) and a
# box plot (right) of the non-missing success scores.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

valid_scores = df['success_score'].dropna()
mean_score = df['success_score'].mean()
median_score = df['success_score'].median()

hist_ax.hist(valid_scores, bins=50, edgecolor='black', alpha=0.7)
hist_ax.axvline(mean_score, color='red', linestyle='--', label=f'Mean: {mean_score:.3f}')
hist_ax.axvline(median_score, color='green', linestyle='--', label=f'Median: {median_score:.3f}')
hist_ax.set_xlabel('Success Score')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Success Score Distribution')
hist_ax.legend()
hist_ax.grid(alpha=0.3)

box_ax.boxplot(valid_scores, vert=True)
box_ax.set_ylabel('Success Score')
box_ax.set_title('Success Score Box Plot')
box_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Create Balanced Success Categories

Using percentile-based thresholds for better class distribution:
- Low: < 0.25
- Medium: 0.25 - 0.43
- High: ≥ 0.43

In [None]:
def categorize_success(score, low_threshold=0.25, high_threshold=0.43):
    """Map a success score to the label 'Low', 'Medium' or 'High'.

    Args:
        score: Numeric success score, or a missing value (None/NaN).
        low_threshold: Scores strictly below this value are 'Low'.
        high_threshold: Scores at or above this value are 'High'; scores in
            ``[low_threshold, high_threshold)`` are 'Medium'.

    Returns:
        'Low', 'Medium' or 'High', or None when ``score`` is missing.
    """
    if pd.isna(score):
        return None
    if score < low_threshold:
        return 'Low'
    if score < high_threshold:
        return 'Medium'
    return 'High'

# Label every game, then report absolute and relative class sizes
df['Success_Category_Balanced'] = df['success_score'].apply(categorize_success)
balanced_labels = df['Success_Category_Balanced']

print("Distribution:")
print(balanced_labels.value_counts())
print("\nPercentages:")
print(balanced_labels.value_counts(normalize=True) * 100)

## 7. Select Features and Clean Data

In [None]:
# Keep only relevant columns. The groups are concatenated in this order, which
# fixes the column order of df_selected.
selected_target_cols = ['success_score', 'Success_Category_Balanced']
selected_numeric_cols = [
    'Estimated owners numeric', 'Price', 'Required age',
    'DLC count', 'Achievements', 'Release year',
]
selected_text_cols = [
    'Developers', 'Publishers', 'Genres',
    'Categories', 'Tags', 'Supported languages',
]
selected_platform_cols = ['Windows', 'Mac', 'Linux']
selected_support_cols = ['has_website', 'has_support_url', 'has_support_email']

relevant_columns = (
    selected_target_cols
    + selected_numeric_cols
    + selected_text_cols
    + selected_platform_cols
    + selected_support_cols
    + ['Name']
)

df_selected = df.loc[:, relevant_columns].copy()
print(f"Shape: {df_selected.shape}")

In [None]:
# Handle missing values: text columns get the placeholder 'Unknown'
text_columns = ['Tags', 'Categories', 'Genres', 'Supported languages', 'Developers', 'Publishers']
text_placeholders = {column: 'Unknown' for column in text_columns}
df_selected = df_selected.fillna(value=text_placeholders)

# Drop rows with missing critical data
critical_columns = ['Release year', 'success_score', 'Name']
df_selected = df_selected.dropna(subset=critical_columns)
print(f"Clean dataset: {df_selected.shape}")

---
# Exploratory Data Analysis

## 8. Genre Analysis

In [None]:
# Count genres: each comma-separated genre inside a 'Genres' string is credited
# with the number of games that share that exact string.
genre_dict = {}
for genres, number in df_selected['Genres'].value_counts().items():
    if genres == 'Unknown':
        # Placeholder used for games without genre information
        continue
    for genre in genres.split(','):
        genre = genre.strip()
        genre_dict.setdefault(genre, 0)
        genre_dict[genre] += number

overall_genre_counts = (
    pd.DataFrame.from_dict(genre_dict, orient='index', columns=['Count'])
    .sort_values('Count', ascending=False)
)

print("Top 15 genres:")
print(overall_genre_counts.head(15))

# Share of all selected games carrying each genre, and the 12 most common genres
genre_fraction = overall_genre_counts / len(df_selected)
top_12_genres = genre_fraction.sort_values('Count', ascending=False).head(12).index

In [None]:
# Genre distribution by success: for each category, the fraction of its games
# that carry each genre, collected as one column per category.
genre_fraction_per_category = pd.DataFrame()

for category in ['Low', 'Medium', 'High']:
    in_category = df_selected['Success_Category_Balanced'] == category
    category_df = df_selected[in_category]
    category_count = len(category_df)

    # Same counting scheme as the overall genre count, restricted to this category
    category_genre_dict = {}
    for genres, number in category_df['Genres'].value_counts().items():
        if genres == 'Unknown':
            continue
        for genre in genres.split(','):
            genre = genre.strip()
            category_genre_dict.setdefault(genre, 0)
            category_genre_dict[genre] += number

    category_genre_counts = pd.DataFrame.from_dict(
        category_genre_dict, orient='index', columns=[category]
    )
    category_genre_fraction = category_genre_counts / category_count
    genre_fraction_per_category = pd.concat(
        [genre_fraction_per_category, category_genre_fraction], axis=1
    )

# A genre absent from a category has no row in that column: treat it as 0
genre_fraction_per_category = genre_fraction_per_category.fillna(0)
top_12_rows = genre_fraction_per_category.loc[top_12_genres]
top_12_data = top_12_rows * 100

print("Genre distribution (%) across success:")
top_12_data.style.background_gradient(cmap='Blues', axis=None).format("{:.2f}")