# Steam Store Data Preprocessing

This notebook cleans and preprocesses the Steam store data.

In [6]:
import pandas as pd
import numpy as np
import io


In [11]:

try:
    dff = pd.read_csv('steam.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: 'steam_store_data_2024.csv' not found. Please upload the file.")

Dataset loaded successfully.


In [16]:

print("Initial Data Info:")
print(df.info())
display(df.head())

Initial Data Info:


NameError: name 'df' is not defined

In [None]:

initial_rows = len(df)
df = df.drop_duplicates(subset=['title'])
print(f"Removed {initial_rows - len(df)} duplicate rows.")

Removed 11 duplicate rows.


In [None]:

def clean_price(price_str):
    if pd.isna(price_str) or price_str == '':
        return 0.0
    if isinstance(price_str, str):
        
        clean_str = price_str.replace('$', '').strip()
        try:
            return float(clean_str)
        except ValueError:
            return 0.0
    return float(price_str)

df['price_cleaned'] = df['price'].apply(clean_price)
print("Price column cleaned.")

Price column cleaned.


In [None]:

def clean_sale_percentage(sale_str):
    if pd.isna(sale_str) or sale_str == '':
        return 0.0
    if isinstance(sale_str, str):
        
        clean_str = sale_str.replace('%', '').replace('-', '').strip()
        try:
            return float(clean_str)
        except ValueError:
            return 0.0
    return float(sale_str)

df['salePercentage_cleaned'] = df['salePercentage'].apply(clean_sale_percentage)
print("Sale Percentage column cleaned.")

Sale Percentage column cleaned.


In [None]:

review_mapping = {
    "Overwhelmingly Positive": 95,
    "Very Positive": 85,
    "Mostly Positive": 70,
    "Mixed": 50,
    "Mostly Negative": 30,
    "Very Negative": 15,
    "Overwhelmingly Negative": 5
}

def map_review(review_str):
    if pd.isna(review_str):
        return -1  
    return review_mapping.get(review_str, -1)

df['recentReviews_score'] = df['recentReviews'].apply(map_review)
df['allReviews_score'] = df['allReviews'].apply(map_review)
print("Review columns mapped to scores.")

Review columns mapped to scores.


In [None]:

print("Processed Data Info:")
print(df[['title', 'price_cleaned', 'salePercentage_cleaned', 'recentReviews_score', 'allReviews_score']].head())

df.to_csv('steam_store_data_processed.csv', index=False)
print("Saved processed data to 'steam_store_data_processed.csv'.")

Processed Data Info:
                                               title  price_cleaned  \
0                      Ori and the Will of the Wisps           9.89   
1  Flashing Lights - Police, Firefighting, Emerge...           8.49   
2                                         Thronefall           5.24   
3  DRAGON QUEST® XI S: Echoes of an Elusive Age™ ...          23.99   
4                                            UNDYING          13.99   

   salePercentage_cleaned  recentReviews_score  allReviews_score  
0                    67.0                   95                95  
1                    66.0                   85                85  
2                    25.0                   95                95  
3                    40.0                   85                85  
4                    30.0                   70                70  
Saved processed data to 'steam_store_data_processed.csv'.
