In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import bokeh as bk
import os

In [None]:
# Load the flags dataset.
# The data file is read without a header row, so the column names are supplied
# explicitly below.

flags_dir: str = "../datasets/flags/"
flags_path: str = os.path.abspath(os.path.join(flags_dir, "flag.data"))

# Integer code -> readable label, used to decode the "religion" column.
religions: dict = {
    0: "Catholic",
    1: "Other Christian",
    2: "Muslim",
    3: "Buddhist",
    4: "Hindu",
    5: "Ethnic",
    6: "Marxist",
    7: "Others",
}

# Integer code -> readable label, used to decode the "language" column.
languages: dict = {
    1: "English",
    2: "Spanish",
    3: "French",
    4: "German",
    5: "Slavic",
    6: "Other Indo-European",
    7: "Chinese",
    8: "Arabic",
    9: "Japanese/Turkish/Finnish/Magyar",
    10: "Others",
}

# Column names in file order, written as one whitespace-separated string and
# split into a list of 30 names.
columns: list = (
    "name landmass zone area population language religion "
    "bars stripes colours "
    "red green blue gold white black orange "
    "mainhue "
    "circles crosses saltires quarters sunstars "
    "crescent triangle icon animate text "
    "topleft botright"
).split()

# Comma is read_csv's default separator, so it is not spelled out here.
flags_raw_df = pd.read_csv(flags_path, names=columns, header=None)
flags_raw_df

In [None]:
# Bar chart of the most populous countries, coloured by religion/ideology.

plot_n: int = 20

# Build the plotting frame without touching the raw data: sort_values returns a
# new frame ordered by descending population, and assign swaps the numeric
# religion codes for their labels.
plot_df = flags_raw_df.sort_values("population", axis=0, ascending=False)
plot_df = plot_df.assign(religion=plot_df["religion"].replace(religions))

pop_rel_fig, pop_rel_ax = plt.subplots()

# dodge=False with width=1.0 draws one full-height bar per country instead of
# splitting each row into one slot per hue level.
sns.barplot(
    plot_df.head(n=plot_n),
    x="population",
    y="name",
    hue="religion",
    width=1.0,
    dodge=False,
    ax=pop_rel_ax,
)
pop_rel_ax.set_title(f"{plot_n} Most Populous Countries and Their Associated Religion/Ideology")
pop_rel_ax.set_ylabel("Country")
pop_rel_ax.set_xlabel("Population [Millions, rounded]")
pop_rel_ax.get_legend().set_title("Religion/Ideology")



In [None]:

# Heatmap: for every religion/ideology, the fraction of its flags that have
# each main hue (each row of the table sums to 1).
col_rel_fig, col_rel_ax = plt.subplots()

# Hex colour per hue name. Not referenced anywhere in this cell.
colors = {
    "red": "#a00",
    "orange": "#f80",
    "white": "#aaa",
    "green": "#0a0",
    "black": "#000",
    "blue": "#00a",
    "gold": "#ff0",
}

# Number of flags for every (religion, mainhue) pair that occurs in the data.
res = plot_df[["religion", "mainhue"]].groupby(["religion", "mainhue"]).size()

# Pivot to a religion x hue count table (absent pairs count as 0), then divide
# each row by its total to turn counts into per-religion fractions.
hue_counts = res.unstack(fill_value=0)
col_rel_df = hue_counts.div(hue_counts.sum(axis=1), axis=0)

sns.heatmap(col_rel_df, cmap="viridis", linewidths=0.5, ax=col_rel_ax)
col_rel_ax.set_ylabel("Religion/Ideology")
col_rel_ax.set_xlabel("Main Hue")
col_rel_ax.set_title("Fraction of Flags With Each Main Hue by Religion/Ideology")

In [None]:
# Decode the numeric "language" and "religion" codes into readable labels.
#
# assign() builds a NEW frame, so flags_raw_df keeps its original integer codes.
# A plain `named_df = flags_raw_df` would only alias the raw frame, and the
# chained `df[col].replace(..., inplace=True)` form is deprecated in pandas 2.2
# and has no effect on the frame once Copy-on-Write is enabled.
named_df = flags_raw_df.assign(
    language=flags_raw_df["language"].replace(languages),
    religion=flags_raw_df["religion"].replace(religions),
)

In [None]:
# One-hot encode the categorical columns: every listed column is dropped and
# replaced by indicator columns named "<column>_<value>".
to_one_hot: list = ["mainhue", "landmass", "religion", "zone", "language"]

# One call handles all listed columns; the indicator groups are appended after
# the untouched columns in the same order as the list, each prefixed with its
# own column name.
one_hot_df = pd.get_dummies(data=named_df, columns=to_one_hot, prefix=to_one_hot)


In [None]:
# Scale the numeric feature columns by their column maximum.
to_normalize: list = ["area", "population", "bars", "stripes", "colours", "circles", "crosses", "saltires", "quarters", "sunstars"]

# Work on a copy: a plain `normalized_df = one_hot_df` would only alias the
# frame, so the scaling below would silently overwrite one_hot_df as well.
normalized_df = one_hot_df.copy()

# Per-column maxima. A maximum of 0 is swapped for 1 so that such a column is
# left as-is instead of turning into NaN through 0/0.
col_max = normalized_df[to_normalize].max()
col_max = col_max.where(col_max != 0, 1)

# DataFrame.div aligns the maxima with the column labels, dividing every
# column by its own maximum.
normalized_df[to_normalize] = normalized_df[to_normalize].div(col_max)

normalized_df