In [None]:
# --- 1. Import Necessary Libraries ---
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import ast
import numpy as np
from matplotlib.ticker import ScalarFormatter
from sklearn.preprocessing import MultiLabelBinarizer

# --- 2. Set a Consistent Visual Style for Plots ---
# Every seaborn/matplotlib figure drawn after this call uses the white grid theme.
sns.set_style("whitegrid")

# --- 3. Load the Dataset ---
# Read the main DataFrame from disk. A missing file is reported instead of
# raising, and 'steam_df' is still bound (to an empty frame) afterwards.
try:
    steam_df = pd.read_csv("../data/updated_steam_games.csv")
except FileNotFoundError:
    # NOTE: the empty frame has no columns, so later code that indexes a
    # column such as 'tags' will still raise; this only keeps the name defined.
    steam_df = pd.DataFrame()
    print("Error: 'updated_steam_games.csv' not found. Please ensure the file is in the correct directory.")


# --- 4. Initial Data Cleaning and Parsing (from your notebook) ---

# Helper function to safely parse the string in the 'tags' column into a dictionary
def to_dict_if_str(v):
    """Return *v* as a dictionary, parsing it first when it is a string.

    Args:
        v: A cell value, expected to be a dict or the string form of a
            dict literal (e.g. ``"{'Action': 10}"``).

    Returns:
        The dict itself when *v* is already a dict, or the dict parsed from
        the string. An empty dict is returned when *v* is blank, is neither
        a str nor a dict, cannot be parsed, or parses to something that is
        not a dict (a list, a number, ...).
    """
    if isinstance(v, dict):
        return v
    if not isinstance(v, str) or not v.strip():
        return {}
    try:
        # Strip first so surrounding whitespace/newlines never cause a
        # spurious syntax error on older interpreters.
        parsed = ast.literal_eval(v.strip())
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # literal_eval is documented to raise any of these on malformed
        # input (e.g. TypeError for an unhashable dict key).
        return {}
    # Only dictionaries are acceptable results, same as for non-string input.
    return parsed if isinstance(parsed, dict) else {}

# Parse the raw columns only if the DataFrame loaded successfully. An empty
# fallback frame has no 'tags' or 'release_date' column to work on.
if not steam_df.empty:

    # Turn each stringified tag mapping in 'tags' into a real dictionary
    steam_df['tags'] = steam_df['tags'].apply(to_dict_if_str)

    # Convert release_date to datetime objects for time-based analysis
    steam_df['release_date'] = pd.to_datetime(steam_df['release_date'])
    steam_df['release_month'] = steam_df['release_date'].dt.month

    print("Dataset loaded and initial parsing complete.")
    print(f"DataFrame contains {steam_df.shape[0]} rows and {steam_df.shape[1]} columns.")


# One-hot encode the tags: one indicator column per distinct tag.
mlb = MultiLabelBinarizer()

# Each cell of 'tags' is treated as the collection of labels for that row
# (for a dict cell, iteration yields its keys).
encoded_tags = mlb.fit_transform(steam_df['tags'])

# Wrap the indicator matrix in a DataFrame aligned to steam_df's index;
# `mlb.classes_` supplies the column name for every unique tag.
encoded_df = pd.DataFrame(
    data=encoded_tags,
    index=steam_df.index,
    columns=mlb.classes_,
)

# Place the indicator columns to the right of the original columns.
steam_encoded = pd.concat((steam_df, encoded_df), axis="columns")

# Calendar features derived from the release date
steam_encoded['year'] = steam_encoded['release_date'].dt.year
steam_encoded['month'] = steam_encoded['release_date'].dt.month
steam_encoded['dow'] = steam_encoded['release_date'].dt.dayofweek
steam_encoded['season'] = steam_encoded['release_date'].dt.quarter  # calendar quarter (1-4)

# Sale-window flags, expressed on month/day so they hold for every year and
# simply evaluate to False for a missing release date.

# Summer sale window: June 21 through July 7.
steam_encoded["during_summer_sale"] = (
    (
        (steam_encoded["release_date"].dt.month == 6)
        & (steam_encoded["release_date"].dt.day >= 21)
    )
    | (
        (steam_encoded["release_date"].dt.month == 7)
        & (steam_encoded["release_date"].dt.day <= 7)
    )
)

# Winter sale window: December 20 through January 4. The window straddles
# New Year, so a release in early January belongs to the sale that started
# the previous December; a single same-year date range cannot capture that.
steam_encoded["during_winter_sale"] = (
    (
        (steam_encoded["release_date"].dt.month == 12)
        & (steam_encoded["release_date"].dt.day >= 20)
    )
    | (
        (steam_encoded["release_date"].dt.month == 1)
        & (steam_encoded["release_date"].dt.day <= 4)
    )
)

Dataset loaded and initial parsing complete.
DataFrame contains 89618 rows and 48 columns.
    appid                             name release_date  required_age  price  \
0     730                 Counter-Strike 2   2012-08-21             0   0.00   
1  578080              PUBG: BATTLEGROUNDS   2017-12-21             0   0.00   
2     570                           Dota 2   2013-07-09             0   0.00   
3  271590        Grand Theft Auto V Legacy   2015-04-13            17   0.00   
4  359550  Tom Clancy's Rainbow Six® Siege   2015-12-01            17   3.99   

   dlc_count                               detailed_description  \
0          1  For over two decades, Counter-Strike has offer...   
1          0  LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ...   
2          2  The most-played game on Steam. Every day, mill...   
3          0  When a young street hustler, a retired bank ro...   
4          9  Edition Comparison Ultimate Edition The Tom Cl...   

                         

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report

# Relies on 'steam_encoded' and 'encoded_df' created in the previous cell.

# --- 1. Define feature and target columns ---
encoded_tag_columns = encoded_df.columns.tolist()
numeric_features = ['price']
passthrough_features = encoded_tag_columns
feature_columns = numeric_features + passthrough_features
target_column = 'is_successful'

# --- 2. Create the Target Column ---
# A game is labelled successful when its peak_ccu is at or above the 90th
# percentile of peak_ccu.
# NOTE(review): the threshold is computed over all rows, including the years
# later used for testing - confirm that this is intended.
threshold = steam_encoded['peak_ccu'].quantile(0.9)
steam_encoded[target_column] = steam_encoded['peak_ccu'] >= threshold

# The model is a classifier, so the target is the boolean label, not raw peak_ccu.
y = steam_encoded[target_column]
X = steam_encoded[feature_columns]

# --- 3. Split by Release Year ---
# Train on games released up to 2023 and evaluate on 2024 onwards.
# Rows whose 'year' is missing match neither condition and are left out.
train = steam_encoded[steam_encoded["year"] <= 2023]
test = steam_encoded[steam_encoded["year"] >= 2024]

# Keep only the declared feature columns so the target (and peak_ccu, from
# which it is derived) never travels with the model inputs.
X_train = train[feature_columns]
y_train = train[target_column]

X_test = test[feature_columns]
y_test = test[target_column]

# Quick look at the label values and the class balance overall / in training.
print(y.unique())
print(y.value_counts())
print(y_train.value_counts())


# --- 4. Build and Fit the Pipeline ---
# Numeric columns: fill missing values with the median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Route the numeric columns through the transformer above and hand the
# tag indicator columns over unchanged.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('pass', 'passthrough', passthrough_features),
])

# Preprocessing followed by a logistic-regression classifier.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
])
pipe.fit(X_train, y_train)


# --- 5. Evaluate the Model ---
# Predict on the held-out rows, then report accuracy and per-class metrics.
y_pred = pipe.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("--- Accuracy ---", f"Accuracy: {accuracy:.4f}\n", sep="\n")
print("--- Classification Report ---", classification_report(y_test, y_pred), sep="\n")

[ True False]
is_successful
False    80655
True      8963
Name: count, dtype: int64
is_successful
False    64524
True      7170
Name: count, dtype: int64
--- Accuracy ---
Accuracy: 0.9128

--- Classification Report ---
              precision    recall  f1-score   support

       False       0.92      0.99      0.95     16131
        True       0.67      0.26      0.37      1793

    accuracy                           0.91     17924
   macro avg       0.79      0.62      0.66     17924
weighted avg       0.90      0.91      0.89     17924

