In [99]:
# import libraries
import re
import numpy as np
import pandas as pd
from dash import Dash, dcc, html, Input, Output, dash_table
import dash_bootstrap_components as dbc
import plotly.express as px
from jupyter_dash import JupyterDash

In [100]:
def remove_non_english(df):
    """Return only the rows flagged as English, without the flag column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``english`` column in which ``1`` marks an
        English-language entry.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows where ``english == 1``; the ``english``
        column is dropped because it is constant after filtering.
    """
    is_english = df['english'].eq(1)

    # .drop() returns a new object, so the caller's frame is left untouched
    return df.loc[is_english].drop(columns='english')

def remove_less_owned(df, min_owners=20000):
    """Keep only the games with at least ``min_owners`` owners.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``owners`` column.
    min_owners : int, optional
        Inclusive lower bound on ``owners``. Defaults to 20,000.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the qualifying rows, so callers can add
        columns to it without touching ``df``.
    """
    # Filter the frame that was passed in rather than the notebook-level
    # `data` global, so the function works on any frame.
    return df[df['owners'] >= min_owners].copy()

def calc_rating(row):
    """Compute the SteamDB rating (0-100) for one game.

    Implements the formula published at
    https://steamdb.info/blog/steamdb-rating/ :

        score = average - (average - 0.5) * 2 ** (-log10(total + 1))

    The plain positive ratio is pulled towards 50% and the pull weakens as
    the number of reviews grows.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'positive_ratings'`` and
        ``'negative_ratings'`` (for example a DataFrame row).

    Returns
    -------
    float
        The rating as a percentage, or NaN when the game has no reviews.
    """
    import math

    pos = row['positive_ratings']
    neg = row['negative_ratings']

    total_reviews = pos + neg

    # No reviews: the ratio is undefined. Return NaN explicitly instead of
    # dividing by zero.
    if total_reviews == 0:
        return float('nan')

    average = pos / total_reviews

    # The correction term is (average - 0.5), i.e. the distance from the
    # neutral 50% mark, not average * 0.5.
    score = average - (average - 0.5) * 2 ** (-math.log10(total_reviews + 1))

    return score * 100

def process_cat_gen_tag(df):
    """Add category/genre indicator columns and a ``top_genre`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``categories`` and ``genres`` columns holding
        ``;``-separated strings, plus a ``steamspy_tags`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) in which:

        * one 0/1 column per selected category and per selected genre has
          been appended, in sorted order of the original labels;
        * rows that belong to none of the selected genres are removed;
        * ``categories`` and ``steamspy_tags`` are dropped;
        * ``top_genre`` (appended last) holds the first listed genre, or
          ``"Other"`` when that genre is not one of the selected ones.
    """
    # Work on a copy so the new columns are not written into the caller's frame
    df = df.copy()

    # only going to use these categories (others such as 'MMO', 'Mods' or
    # the online/local multiplayer variants are deliberately left out)
    cat_cols = [
        'Multi-player',
        'Single-player'
    ]

    # split each ';'-separated string once instead of once per indicator column
    category_lists = df['categories'].apply(lambda x: x.split(';'))

    # create a new column for each category, with 1s indicating membership and 0s for non-members
    for col in sorted(cat_cols):
        col_name = re.sub(r'[\s\-\/]', '_', col.lower())
        col_name = re.sub(r'[()]', '', col_name)

        df[col_name] = category_lists.apply(lambda parts, col=col: 1 if col in parts else 0)

    # repeat for genres; only actual game genres are kept (software genres
    # such as 'Utilities' or 'Video Production' are left out on purpose)
    gen_cols = [
        'Action',
        'Adventure',
        'Casual',
        'Indie',
        'Massively Multiplayer',
        'RPG',
        'Racing',
        'Simulation',
        'Sports',
        'Strategy'
    ]

    genre_lists = df['genres'].apply(lambda x: x.split(';'))
    gen_col_names = []

    # create new columns for each genre with 1s for games of that genre
    for col in sorted(gen_cols):
        col_name = col.lower().replace('&', 'and').replace(' ', '_')
        gen_col_names.append(col_name)

        df[col_name] = genre_lists.apply(lambda parts, col=col: 1 if col in parts else 0)

    # remove "non-games": rows that match none of the selected genres
    is_game = df[gen_col_names].sum(axis=1) > 0
    df = df[is_game].copy()

    # remove redundant columns
    df = df.drop(['categories', 'steamspy_tags'], axis=1)

    # derive the top genre: the first genre listed, or "Other" if it is not
    # one of the selected genres
    first_genre = genre_lists[is_game].str[0]
    df['top_genre'] = first_genre.where(first_genre.isin(gen_cols), 'Other')

    return df

def get_scores(df):
    """Combine per-developer metrics into a single score.

    Each metric is transformed, standardised with ``scipy.stats.zscore`` and
    multiplied by its weight; the weighted terms are summed and divided by 3.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose ``num_releases``, ``sum_owners``, ``mean_rating`` and
        ``total_ratings`` columns.

    Returns
    -------
    array-like
        One score per row of ``df``, in row order.
    """
    import scipy.stats as stats

    # (weight, transformed metric) pairs, listed in the order they are summed
    weighted_metrics = (
        (.1, df.num_releases**(1/2)),            # square root of release count
        (.5*.2, np.log10(df.sum_owners)),        # log of total owners
        (.5, df.mean_rating),                    # mean rating, untransformed
        (.5*.25, np.log10(df.total_ratings)),    # log of total rating count
    )

    combined = None
    for weight, metric in weighted_metrics:
        term = weight * stats.zscore(metric)
        combined = term if combined is None else combined + term

    return combined / 3

# preprocess for exploratory analysis
def pre_process(path='Resources/Data/steam_cleaned.csv'):
    """Load the cleaned Steam dataset and prepare it for analysis.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the cleaned CSV. Defaults to the project's
        ``Resources/Data/steam_cleaned.csv``.

    Returns
    -------
    pandas.DataFrame
        English-language Windows games with:

        * ``owners`` reduced to the integer lower bound of its range;
        * ``total_ratings``, ``rating_ratio`` and ``rating`` added;
        * ``release_date`` as datetime plus a ``release_year`` column;
        * the indicator and ``top_genre`` columns added by
          ``process_cat_gen_tag``.
    """
    # read in cleaned dataset
    df = pd.read_csv(path)

    # keep english only
    df = remove_non_english(df)

    # keep windows only, and remove platforms column
    on_windows = df['platforms'].str.contains('windows')
    df = df[on_windows].drop('platforms', axis=1).copy()

    # keep lower bound of owners column ("20000-50000" -> 20000), as integer
    df['owners'] = df['owners'].str.split('-').str[0].astype(int)

    # calculate rating, along w/ simple ratio for comparison
    df['total_ratings'] = df['positive_ratings'] + df['negative_ratings']
    df['rating_ratio'] = df['positive_ratings'] / df['total_ratings']
    df['rating'] = df.apply(calc_rating, axis=1)

    # convert release_date to dt type and create separate column for release_year
    df['release_date'] = df['release_date'].astype('datetime64[ns]')
    df['release_year'] = df['release_date'].dt.year

    # process genres and categories
    df = process_cat_gen_tag(df)

    return df

# Build the processed dataset once; later cells read it through the
# notebook-level name `data`.
data = pre_process()
# Preview the first rows of the processed frame.
data.head()

Unnamed: 0,appid,name,release_date,developer,publisher,required_age,genres,achievements,positive_ratings,negative_ratings,...,adventure,casual,indie,massively_multiplayer,rpg,racing,simulation,sports,strategy,top_genre
0,10,Counter-Strike,2000-11-01,Valve,Valve,0,Action,0,124534,3339,...,0,0,0,0,0,0,0,0,0,Action
1,20,Team Fortress Classic,1999-04-01,Valve,Valve,0,Action,0,3318,633,...,0,0,0,0,0,0,0,0,0,Action
2,30,Day of Defeat,2003-05-01,Valve,Valve,0,Action,0,3416,398,...,0,0,0,0,0,0,0,0,0,Action
3,40,Deathmatch Classic,2001-06-01,Valve,Valve,0,Action,0,1273,267,...,0,0,0,0,0,0,0,0,0,Action
4,50,Half-Life: Opposing Force,1999-11-01,Gearbox Software,Valve,0,Action,0,5250,288,...,0,0,0,0,0,0,0,0,0,Action


In [101]:
# get top developers by score

# Restrict to sufficiently owned games and add a per-game counter, so that
# summing it per developer yields the number of releases.
pv_df = remove_less_owned(data).assign(num_releases=1)

# One row per developer. Aggregations are given by name: passing NumPy
# callables such as np.sum / np.mean here is deprecated in recent pandas.
pv = pv_df.pivot_table(
    index='developer',
    values=['num_releases', 'owners', 'rating', 'total_ratings', 'price'],
    aggfunc={
        'num_releases': 'sum',
        'owners': 'sum',
        'rating': 'mean',
        'total_ratings': 'sum',
        'price': 'mean',
    },
).rename(columns={
    'owners': 'sum_owners',
    'price': 'mean_price',
    'rating': 'mean_rating',
})

# Rank developers by their combined score, best first
pv['score'] = get_scores(pv)
pv = pv.sort_values(by='score', ascending=False)

# store top 10 developers to later use (the pivot index already holds each
# developer exactly once); 'All Developers' is the "no filter" choice
top_devs = ['All Developers'] + pv.head(10).index.tolist()

In [104]:
# year markers: labelled ticks shown under the range slider
year_marks = {
    year: {"label": str(year)}
    for year in (1999, 2003, 2006, 2009, 2012, 2015, 2018)
}

# main genres: the ten indicator columns that sit immediately before the
# final column of `data`. This is positional, so it depends on the column
# order of `data`.
gen_cols = data.columns[-11:-1]

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

app = JupyterDash(__name__, external_stylesheets=external_stylesheets)

# Create server variable with Flask server object for use with gunicorn
server = app.server

app.layout = html.Div([

    html.H2(children="Steam Store Analytics"),
    html.H4(children="A Visualization of Developer Data"),

    html.Label('Developer'),
    dcc.Dropdown(
        options=top_devs,
        value='All Developers',
        id='dev-dropdown',
    ),

    html.Label(
        ['Adjust slider to desired range:'],
        # style keys are camelCase (fontSize), not CSS-style hyphenated names
        style={"fontSize": "16px", "opacity": "70%"},
    ),

    html.P(),

    dcc.RangeSlider(
        id='year-slider',
        marks=year_marks,
        step=1,                # increment between selectable values
        # plain ints rather than NumPy scalars for the component properties
        min=int(data.release_year.min()),
        max=int(data.release_year.max()),
        value=[2012,2015],     # default value initially chosen
        dots=True,             # True, False - draw a dot at every step
        allowCross=False,      # True, False - manage handle crossover
        disabled=False,        # True, False - disable handle
        pushable=2,            # any number, or True with multiple handles
        updatemode='mouseup',  # 'mouseup', 'drag' - update value method
        included=True,         # True, False - highlight the selected range
        vertical=False,        # True, False - vertical, horizontal slider
        verticalHeight=900,    # height of slider (pixels) when vertical=True
        className='None',      # NOTE(review): literal class name "None" - confirm it is intended
        tooltip={'always_visible':False,  # show current slider values
                 'placement':'bottom'},
    ),

    dcc.Graph(id='bubble-chart'),

    html.Div([
        dcc.Graph(id='top-games'),

        dcc.Graph(id='top-genres')
    ])
])

@app.callback(
    Output('bubble-chart','figure'),
    Output('top-games', 'figure'),
    Output('top-genres', 'figure'),
    [Input('dev-dropdown', 'value'),
     Input('year-slider','value')])
def update_figure(selected_dev, year_range):
    """Rebuild the three dashboard figures for the current selection.

    Parameters
    ----------
    selected_dev : str
        A developer name, or ``'All Developers'`` for no developer filter.
    year_range : list of int or None
        ``[first_year, last_year]`` from the slider (both inclusive). When
        empty or missing, 2006-2008 is used.

    Returns
    -------
    tuple
        ``(bubble_chart, top_games_bar, genre_share_bar)`` figures, in the
        order of the callback outputs.
    """
    # `data` is only read here, so it can be used directly without a copy
    if selected_dev == 'All Developers':
        filter_dev = data
    else:
        filter_dev = data[data['developer'] == selected_dev]

    first_year, last_year = year_range if year_range else (2006, 2008)
    in_range = filter_dev.release_year.between(first_year, last_year)

    # The bubble chart shows paid games above the ownership threshold only.
    # NOTE(review): remove_less_owned keeps owners >= 20000 while this uses a
    # strict > 20000 - confirm which bound is intended.
    shown = in_range & (filter_dev['owners'] > 20000) & (filter_dev['price'] != 0)
    filter_year = filter_dev.loc[shown]

    fig_1 = px.scatter(filter_year, x="rating", y="total_ratings",
                     color="top_genre", size="owners", hover_name="name",
                     log_y=True, size_max=60)

    # NOTE(review): the two bar charts use the developer filter only; the
    # year range is not applied to them - confirm this is intended.
    fig_2 = px.bar(filter_dev.nlargest(10, 'total_ratings'), y='total_ratings', x='name', text_auto='.2s',
            title=f'Top 10 Most Rated Games by {selected_dev}')

    # Build an explicit two-column frame so the axis titles come from the
    # column names. The mean of a 0/1 indicator is the share of games in that
    # genre; it is scaled by 100 so the values match the "%" in the label.
    genre_share = (
        filter_dev[gen_cols]
        .mean()
        .mul(100)
        .rename_axis('Genre')
        .reset_index(name='Proportion %')
    )
    fig_3 = px.bar(genre_share,
                   x='Proportion %',
                   y='Genre',
                   orientation='h',
                   title = f'Proportion of Games Released by {selected_dev}')

    fig_1.update_layout(transition_duration=500)

    return fig_1, fig_2, fig_3

In [105]:
# Start the Dash app with debug mode on and the auto-reloader turned off.
app.run_server(debug=True, use_reloader=False)

Dash app running on http://127.0.0.1:8050/
