In [1]:
import pandas as pd
import ast
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Read the three source tables; paths are relative to the notebook's working directory.
lang_df = pd.read_csv('../data/language_vote_details.csv', low_memory=False)
df = pd.read_csv('../data/cleaned2.csv', low_memory=False)
ranks = pd.read_csv('../data/boardgames_ranks.csv', low_memory=False)

# These two columns are parsed from their literal text form back into Python objects.
for list_col in ('categories', 'mechanics'):
    df[list_col] = df[list_col].apply(ast.literal_eval)

In [None]:
# Give the tables matching column names: "@id" is the key used by the merges
# further down, and "lang" is a short name for the language-dependency column.
ranks = ranks.rename(columns={"id": "@id"})
lang_df = lang_df.rename(columns={"Language Dependency": "lang"})

df.head(1)

# **Analyses of Language Dependencies for Boardgames**

**Objective:** 

This notebook assesses the board games available through the BoardGameGeek (BGG) API to identify trends in language dependency across game categories, game mechanics, and rankings. The following sections prepare the relevant data and plot those trends.

## **Part 1: Metrics for Game Titles, Rankings, and Language Dependencies**

### **1A. Data Prep**

In [None]:
# Data prep: build one table with the columns id, name, year, rank and language.

merge_df = df[["@id", "name", "year"]].merge(
    ranks[["@id", "rank"]], on="@id", how="inner"
)
games_df = merge_df.merge(lang_df[["@id", "lang"]], on="@id", how="inner")

# Rows with any missing value are removed from an independent copy;
# games_df itself stays untouched for later use.
games = games_df.dropna().copy()
display(games.head(2))

In [None]:
# Bin release years by decade. Bins are closed on the left (right=False),
# so 1970 is counted in "1970s" rather than "1960s".
decade_starts = list(range(1960, 2030, 10))
bins = decade_starts + [2030]
labels = [f"{start}s" for start in decade_starts]
games["decade"] = pd.cut(games["year"], bins=bins, labels=labels, right=False)


# Show the table plus some ranking info.
# Despite its name, no_rank counts the rows whose rank is different from 0.
no_rank = games["rank"].ne(0).sum()
display(games.head())
print(f"Number of games that have rankings: {no_rank}")
print(f"Lowest game ranking is: {games['rank'].max()}")

In [None]:
# add columns for grouped game ranks (e.g. games ranked 1 - 100, games ranked 101-500, etc)

# Labels of the rank groups, best ranks first. Also used as column names.
grouped_rankings = ["1 - 100", "101-500","501 - 1000", "1001 - 2500", "2501 - 4999", "5000+"]

# Inclusive (lowest, highest) rank of each group, position by position with
# grouped_rankings. None means the group has no upper limit.
_RANK_GROUP_BOUNDS = [
    (1, 100),
    (101, 500),
    (501, 1000),
    (1001, 2500),
    (2501, 4999),
    (5000, None),
]


def rank_groups(value, label):
    """Flag whether a rank belongs to the rank group named ``label``.

    Parameters
    ----------
    value : int, float or str
        The rank of one game. An empty string or a missing value (NaN, None,
        pd.NA) means "no rank available".
    label : str
        One of the entries of ``grouped_rankings``.

    Returns
    -------
    str or pd.NA
        The string "True" when the rank lies inside the group, otherwise
        ``pd.NA``. ``pd.NA`` is also returned for an unknown label.

    Raises
    ------
    ValueError
        If ``value`` is a non-empty string that ``int()`` cannot parse.

    Notes
    -----
    * A rank of exactly 5000 belongs to "5000+". Previously it fell between
      "2501 - 4999" and the strict ``> 5000`` test and matched no group.
    * Ranks below 1 (the notebook treats rank 0 as "not ranked") are not
      reported as part of "1 - 100" any more.
    """
    # Check for missing values first: comparing pd.NA with "" is not a plain bool.
    if pd.isna(value) or value == "":
        return pd.NA
    value = int(value)

    if label not in grouped_rankings:
        return pd.NA

    lowest, highest = _RANK_GROUP_BOUNDS[grouped_rankings.index(label)]
    inside = value >= lowest and (highest is None or value <= highest)
    return "True" if inside else pd.NA


# One column per rank group: each game's rank is passed to rank_groups
# together with the group label, and the result is stored under that label.
for label in grouped_rankings:
    games[label] = games["rank"].apply(rank_groups, args=(label,))


games

In [None]:
# Convert every column to the best available dtype that supports pd.NA
# (the rank-group columns added above are filled with "True" / pd.NA).
games = games.convert_dtypes()

In [None]:
# Table of ranked games only: every row whose rank equals 0 is dropped.

drop_zero = games["rank"].eq(0)
games_rankings = games.drop(index=games.loc[drop_zero].index)


### **1B. Pivots and Plots**

In [None]:
# Pivot to look for a relation between language dependency and game popularity:
# for every language-dependency level, count the flagged games in each rank group.

rankings_vs_lang = games_rankings.pivot_table(
    index="lang",
    values=list(grouped_rankings),
    aggfunc="count",
)

# Put the columns in rank-group order (best ranks first).
rankings_vs_lang = rankings_vs_lang[grouped_rankings]

# Helper that turns a column of counts into shares of the column total.

def convert_to_percent(col):
    """Return ``col`` divided by the sum of its own values.

    Despite the name, the result is a fraction of the column total
    (the values add up to 1); it is not scaled to 0-100.
    """
    return col / col.sum()

# Applied column by column, so every rank group is rescaled on its own.
rankings_vs_lang_perc = rankings_vs_lang.apply(convert_to_percent, axis=0)

display(rankings_vs_lang_perc)

In [None]:
# Pull one row per language-dependency level out of the share table.
# NOTE(review): the rows are picked by position, which presumably relies on the
# order of the 'lang' index of rankings_vs_lang_perc -- confirm that each
# position really matches the legend label it is plotted under.

perc_extensively = rankings_vs_lang_perc.iloc[0].tolist()
perc_fully = rankings_vs_lang_perc.iloc[1].tolist()
perc_moderately = rankings_vs_lang_perc.iloc[2].tolist()
perc_not_dependent = rankings_vs_lang_perc.iloc[3].tolist()
perc_somewhat = rankings_vs_lang_perc.iloc[4].tolist()

bar_width = 0.1

# note chatgpt assisted code for ticks and bar positioning.
x_positions = list(range(len(grouped_rankings)))

# (shift in bar widths, bar heights, legend entry) for every bar series;
# the shifts place the five bars side by side around each tick.
bar_series = [
    (-2, perc_fully, "05 - Fully Language Dependent"),
    (-1, perc_extensively, '04 - Extensively Language Dependent'),
    (0, perc_moderately, '03 - Moderately Language Dependent'),
    (1, perc_somewhat, '02 - Somewhat Language Dependent'),
    (2, perc_not_dependent, '01 - Not Language Dependent'),
]

# Draw the grouped bar chart.
fig, ax = plt.subplots(figsize=(10, 6))

for shift, heights, legend_label in bar_series:
    shifted_positions = [pos + shift * bar_width for pos in x_positions]
    ax.bar(shifted_positions, heights, bar_width, label=legend_label)

# Labels, title and a legend placed outside the plot area.
ax.set(
    xlabel='Game Rankings',
    ylabel='Percentage',
    title="Language Dependencies of Top Games",
)
ax.legend(loc="lower left", bbox_to_anchor=(1, 0))

ax.set_xticks(x_positions)
ax.set_xticklabels(grouped_rankings)

# Show the plot
plt.show()

In [None]:
# Language dependency by release decade.
lang_decades = games[["lang", "decade"]].copy()
lang_options = lang_decades["lang"].unique()

# Number of games for every (language dependency, decade) combination.
lang_vs_dec = lang_decades.pivot_table(
    index="lang",
    columns="decade",
    aggfunc="size",
)

# Share of each decade's games per language-dependency level, rounded to 2 decimals.
lang_vs_dec_perc = lang_vs_dec.apply(convert_to_percent).round(2)
display(lang_vs_dec_perc)
# The plot uses the raw counts, not the rounded shares shown above.
lang_vs_dec.plot()

In [None]:
# Number of games per language-dependency level, with each level's share of all games.

lang_vsgames = lang_decades.pivot_table(columns="lang", aggfunc="size")

# Turn the counts into a two-column table before adding the share column.
lang_games_df = lang_vsgames.reset_index(name="no of games")

total_games = lang_games_df["no of games"].sum()

lang_games_df["percent"] = lang_games_df["no of games"].div(total_games).round(2)

display(lang_games_df)

In [None]:
#plot lang dependency vs categories

In [None]:
# Attach the categories and mechanics of each game to its language dependency.
cat_merge_df = games_df[["@id", "name", "year", "lang"]].merge(
    df[["@id", "categories", "mechanics"]],
    on="@id",
    how="inner",
)

In [None]:
def metric_by_year(data, metric):
    """Return one row per (lang, single ``metric`` entry) pair.

    Only the 'lang' column and the column named by ``metric`` are kept.
    List-like cells of ``metric`` are exploded into one row per element and
    the result gets a fresh 0..n-1 index. ``data`` itself is not modified.
    Note that, despite the function name, no year column is involved.
    """
    exploded = data[['lang', metric]].explode(metric)
    return exploded.reset_index(drop=True)
    
# Long-format tables: one row per (language dependency, category) and
# one row per (language dependency, mechanic).
lang_cats, lang_mechs = (
    metric_by_year(cat_merge_df, list_column)
    for list_column in ('categories', 'mechanics')
)
display(lang_cats.head())
lang_mechs.head()


In [None]:
# Number of games per category (rows) and language-dependency level (columns).
lang_vscats = lang_cats.pivot_table(
    index="categories",
    columns="lang",
    aggfunc="size",
)

display(lang_vscats)
# Share of every category within each language-dependency level, rounded to 2 decimals.
lang_vscats_perc = lang_vscats.apply(convert_to_percent).round(2)
lang_vscats_perc

In [None]:
# Number of games per mechanic (rows) and language-dependency level (columns).
lang_vsmech = lang_mechs.pivot_table(
    index="mechanics",
    columns="lang",
    aggfunc="size",
)

display(lang_vsmech)
# Share of every mechanic within each language-dependency level, rounded to 2 decimals.
lang_vsmech_perc = lang_vsmech.apply(convert_to_percent).round(2)
lang_vsmech_perc