# Example 1 - Beginner Friendly
This data analysis is a practice exercise with `Python` and its data libraries.

[Digimon DB KAGGLE](https://www.kaggle.com/datasets/rtatman/digidb)

This dataset contains information on Digimon from “Digimon Story: Cyber Sleuth”, released for PlayStation Vita in 2015 and PlayStation 4 in 2016.

**Content:**

This database contains three files: a list of all the digimon that can be captured or fought in Cyber Sleuth, all the moves which Digimon can perform, and all the Support Skills. (Support Skills are a passive, stackable, team-wide buff. Each species of Digimon is associated with a single Support Skill.)

* Which set of moves will get the best ratio of attack power to SP spent?
* Which team of 3 digimon have the highest attack? Defense?
* What’s the tradeoff between HP and SP?
* Are some types over- or under-represented?
* Both the moves and support skills have short text descriptions. Can an NLP analysis reveal underlying clusters of moves?
* Are different types and attributes evenly represented across stages?

## 1. Dependency and Format

Python 3.10.11


In [None]:
# Classic Libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from skimpy import clean_columns

# Advanced Visualization Libraries
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from wordcloud import WordCloud

In [None]:
# Display float with 2 decimal values
pd.options.display.float_format = "{:.2f}".format

#### Helpers

In [None]:
# Checking if I can change the types to more efficient memory usage
import numpy as np

np.iinfo(np.int16)

In [None]:
from IPython.display import display


def create_var(df, var_name):
    """Bind ``df`` to a module-level global named ``var_name`` and return it.

    No copy is made: the global and the returned value are the very same
    object that was passed in. Handy for keeping a named handle on a
    DataFrame while debugging.
    """
    globals().update({var_name: df})
    return df

## 2. Get and Inspect Data

In [None]:
db = pd.read_csv("data/db_digimonlist.csv")
db_move = pd.read_csv("data/db_movelist.csv")
db_support = pd.read_csv("data/db_supportlist.csv")

In [None]:
# Sample data
db.sample(5)

In [None]:
def check_data(df):
    """Print the shape of ``df`` followed by a per-column overview.

    The overview has one row per column of ``df`` and reports its dtype, the
    number of missing values and the number of distinct values. Nothing is
    returned; the report only goes to standard output.
    """
    n_rows, n_cols = df.shape
    overview = pd.concat(
        {
            "type": df.dtypes,
            "missing_values": df.isnull().sum(),
            "uniques": df.nunique(),
        },
        axis=1,
    )
    # Move the column labels out of the index into a regular "cols_name" column.
    overview = overview.reset_index(names="cols_name")

    print(f"Dimension of DataFrame: ({n_rows}, {n_cols})\n")
    print(f"DataFrame INFO:\n{overview}\n")


check_data(db)

In [None]:
# Setting the right index
db.set_index("Number", inplace=True)
db.index.name = None
db.head()

In [None]:
cols = db.columns.to_list()
cols

In [None]:
cols = db.columns.to_list()
cols

In [None]:
# Get an overview of the data
db.info()

In [None]:
# Generate descriptive statistics
db.describe().T

In [None]:
# Store every stat column listed below as int16.
db = db[cols].astype(
    dict.fromkeys(
        [
            "Memory",
            "Equip Slots",
            "Lv 50 HP",
            "Lv50 SP",
            "Lv50 Atk",
            "Lv50 Def",
            "Lv50 Int",
            "Lv50 Spd",
        ],
        "int16",
    )
)

In [None]:
db.info()

In [None]:
# Check missing values
db.isna().sum()

In [None]:
# Check for missing  values in %
db.isna().mean().mul(100)
# mean() of the boolean mask is the fraction of missing values; mul(100) turns it into a percentage

In [None]:
# Display rows with missing values
db[db.isnull().any(axis="columns")]

In [None]:
# Top 5 rows by "Lv50 Atk"; ties are broken by "Lv50 Def"
db.nlargest(5, columns=["Lv50 Atk", "Lv50 Def"])

In [None]:
# Most Features (columns) are showing a difference between mean and median
# This indicates that the data is skewed
hist = db.hist(bins=15, figsize=(16, 10))

* As we can see here, there are **outliers in ATK**, and fewer than 10 Digimon have very high stats.

### HeatMaps with PANDAS and SNS

In [None]:
# Check correlation of the features (columns)
def df_corr(df):
    """Draw a lower-triangle heatmap of the pairwise correlations in ``df``.

    Only numeric columns take part (``numeric_only=True``). The diagonal and
    the upper triangle are masked out, so each pair of columns is shown once.

    Args:
        df: DataFrame whose numeric columns are correlated.
    """
    plt.figure(figsize=(8, 5))
    corr = df.corr(numeric_only=True)
    # Build the mask from the matrix *shape*, not from its values:
    # np.triu(corr) would leave an upper-triangle cell unmasked whenever its
    # correlation is exactly 0, because 0.0 is falsy when used as a mask.
    hide_upper = np.triu(np.ones_like(corr, dtype=bool))
    # Plot HeatMap with SNS
    sns.heatmap(
        corr,
        annot=True,
        mask=hide_upper,
        cmap="RdBu",
        vmin=-1,
        vmax=1,
    )


df_corr(db)

In [None]:
# Plot pairwise relationships
sns.pairplot(db)

In [None]:
# Show outliers
db["Lv50 Atk"].nlargest()

In [None]:
def detect_outliers(df, num_var):
    """Show four box plots of ``df[num_var]``, one per ``boxpoints`` mode.

    The same column is drawn four times so the variants can be compared:
    all points, whiskers only, suspected outliers and whiskers plus outliers.
    The figure is displayed with ``fig.show()``; nothing is returned.

    Args:
        df: DataFrame holding the column to inspect.
        num_var: Name of the numeric column; also used as the figure title
            and the y-axis label.
    """
    values = df[num_var]

    traces = [
        go.Box(
            y=values,
            name="All Points",
            jitter=0.3,
            pointpos=-1.8,
            boxpoints="all",
            marker=dict(color="#a2b9bc"),
            line=dict(color="#6b5b95"),
        ),
        go.Box(
            y=values,
            name="Only Whiskers",
            boxpoints=False,
            marker=dict(color="#b2ad7f"),
            line=dict(color="#feb236"),
        ),
        go.Box(
            y=values,
            name="Suspected Outliers",
            boxpoints="suspectedoutliers",
            marker=dict(
                color="#b5e7a0",
                outliercolor="#878f99",
                line=dict(outliercolor="#d64161", outlierwidth=2),
            ),
            line=dict(color="#86af49"),
        ),
        go.Box(
            y=values,
            name="Whiskers and Outliers",
            boxpoints="outliers",
            marker=dict(color="#6b5b95"),
            line=dict(color="#ff7b25"),
        ),
    ]

    # A single layout: the earlier "<name> Outliers" layout was overwritten
    # before it was ever used, so it has been dropped.
    layout = go.Layout(
        title={
            "text": num_var,
            "y": 0.9,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        barmode="overlay",
        # The y axis carries the values of the column, not a count.
        yaxis=dict(title=num_var),
        template="plotly_dark",
    )

    fig = go.Figure(data=traces, layout=layout)
    fig.show()


detect_outliers(db, "Lv50 Atk")

In [None]:
# Set upper limit for outliers
q_hi = db["Lv50 Atk"].quantile(0.999)
q_hi

In [None]:
# Plot pairwise relationships w/o outliers
sns.pairplot(db.query("`Lv50 Atk` < @q_hi"))

## 3. Clean up the DataFrame

### 3.1 Using pandas to clean the data with method chaining

In [None]:
# Chain PANDAS method
(
    db.query("`Lv50 Atk` > 100")  # Keep only rows whose "Lv50 Atk" exceeds 100
    .filter(
        regex="^D", axis="columns"
    )  # Keep only the columns whose name starts with "D"
    .rename(columns={"Digimon": "Digimon_Name"})  # Rename "Digimon" to "Digimon_Name"
)

In [None]:
db.head(3)

In [None]:
# Function to CLEAN
def clean_df(df, views_threshold=100):
    """Return a cleaned version of the Digimon table.

    Steps, in order:
      1. drop every row that has a missing value;
      2. pass the frame through ``clean_columns`` (imported at the top);
      3. rename the ``lv_50_*`` stat columns to ``hp``, ``sp``, ``atk``,
         ``def``, ``int`` and ``spd``;
      4. keep only rows whose ``atk`` is greater than ``views_threshold``;
      5. add ``atk_sp_ratio``, computed as ``sp / atk * 100`` and rounded to
         two decimals.
    """
    short_names = {
        "lv_50_hp": "hp",
        "lv_50_sp": "sp",
        "lv_50_atk": "atk",
        "lv_50_def": "def",
        "lv_50_int": "int",
        "lv_50_spd": "spd",
    }
    tidy = clean_columns(df.dropna()).rename(columns=short_names)
    # "@views_threshold" is looked up among this function's local variables.
    strong = tidy.query("atk > @views_threshold")
    sp_per_atk = ((strong.sp / strong.atk) * 100).round(2)
    return strong.assign(atk_sp_ratio=sp_per_atk)


# end def

df = clean_df(db)
df.head(3)

## 4. Plot data

In [None]:
def display_topn_cat_val(df, feature):
    """Show a bar chart of how often each value of ``df[feature]`` occurs.

    The figure is displayed with ``fig.show()``; nothing is returned.

    Args:
        df: DataFrame holding the column to summarise.
        feature: Name of the column to count. Its title-cased form is used
            both as the figure title and as the x-axis label.
    """
    counts = df[feature].value_counts()
    label = feature.title()
    # No title is passed to px.bar: update_layout below sets the final one.
    fig = px.bar(
        y=counts.values,
        x=counts.index,
        text=counts.values,
    )
    fig.update_layout(
        title={
            "text": label,
            "y": 0.9,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        barmode="overlay",
        yaxis=dict(title="Count"),
        # Label the axis after the plotted column rather than a fixed "Stage",
        # so the chart stays correct for any feature.
        xaxis=dict(title=label),
        template="plotly_dark",
    )
    fig.show()


display_topn_cat_val(df, "stage")

In [None]:
# ax = (df
#  .stage
#  .value_counts()
#  .plot(kind='bar',figsize=(12,6),rot=0, title='Amount of Digimon by Stage', xlabel='Stage', ylabel='Amount')
#  )
# ax.bar_label(ax.containers[0])

# plt.tight_layout()

In [None]:
def summary_count(df, cols):
    """Show the value counts of ``df[cols]`` as a bar chart and a pie chart.

    Despite its plural name, ``cols`` is a single column label. The left
    panel draws the absolute counts as bars, the right panel the same counts
    as percentage slices. The title-cased column name is the figure title.
    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    palette = [
        "#a2b9bc",
        "#6b5b95",
        "#b2ad7f",
        "#feb236",
        "#b5e7a0",
        "#878f99",
        "#d64161",
        "#86af49",
        "#ff7b25",
    ]

    # Count once and reuse the result for both panels.
    counts = df[cols].value_counts()
    bar_labels = [str(label) for label in counts.index]
    bar_heights = counts.values.tolist()

    fig = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=("Countplot", "Percentages"),
        specs=[[{"type": "xy"}, {"type": "domain"}]],
    )

    bars = go.Bar(
        x=bar_labels,
        y=bar_heights,
        text=bar_heights,
        textposition="auto",
        showlegend=False,
        marker=dict(color=palette, line=dict(color="black", width=2)),
    )
    fig.add_trace(bars, row=1, col=1)

    slices = go.Pie(
        labels=counts.keys(),
        values=counts.values,
        hoverinfo="label",
        textinfo="percent",
        textfont_size=20,
        textposition="auto",
        marker=dict(colors=palette, line=dict(color="black", width=2)),
    )
    fig.add_trace(slices, row=1, col=2)

    fig.update_layout(
        title={
            "text": cols.title(),
            "y": 0.9,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        template="plotly_dark",
    )

    fig.show()


for i in ["stage", "type"]:
    summary_count(df, i)

#### Which team of 3 digimon have the highest attack? Defense?


In [None]:
def plot_column(df, column, top_values=10, color="#d33682"):
    """Build a two-row figure of horizontal bars for the top rows of two columns.

    For each name in ``column`` the ``top_values`` rows with the largest
    value are drawn as horizontal bars in their own subplot, together with
    dotted vertical lines at the mean and the median of those plotted rows.

    Args:
        df: DataFrame with the columns named in ``column`` plus ``"digimon"``
            (bar labels) and ``"type"`` (bar text and hover text).
        column: Sequence of two column names. The title, the subplot grid
            and the axis settings are all laid out for exactly two entries.
        top_values: Number of largest rows plotted per column.
        color: Fill colour of the bars.

    Returns:
        The assembled plotly figure. It is not displayed here.
    """
    labels = [name.replace("_", " ").title() for name in column]
    title = f"<b>{labels[0]} and {labels[1]} - Top {top_values} Digimons</b>"
    fig = make_subplots(
        rows=2,
        cols=1,
        subplot_titles=(labels[0], labels[1]),
        shared_xaxes=False,
        shared_yaxes=False,
        vertical_spacing=0.1,
    )
    for row, col in enumerate(column, start=1):
        top = df.nlargest(top_values, columns=col).sort_values(
            by=col, ascending=True
        )
        # Statistics describe only the plotted top rows, not the whole column.
        mean = top[col].mean()
        median = top[col].median()

        # Bar chart
        fig.add_trace(
            go.Bar(
                x=top[col],
                y=top["digimon"],
                orientation="h",
                marker_color=color,
                text=top["type"],
                opacity=1,
                hovertemplate="<b> Digimon:%{y} </br><b> "
                f"{col.title()}"
                ":%{x} <br> Type:%{text}",
                name="",
            ),
            row=row,
            col=1,
        )

        # Vertical "MEAN" line
        # NOTE(review): templateitemname receives the mean value, exactly as
        # in the original call - confirm this is intended.
        fig.add_shape(
            type="line",
            x0=mean,
            y0=0,
            x1=mean,
            y1=top_values - 0.5,
            line=dict(color="purple", width=3, dash="dot"),
            row=row,
            col=1,
            visible=True,
            templateitemname=mean,
            name="mean",
        )
        # Text annotation for the MEAN value
        fig.add_annotation(
            x=mean,
            y=top_values,
            text="Mean",
            hovertext=f"Mean: {mean:.2f}",
            align="center",
            row=row,
            col=1,
            textangle=50,
            font=dict(size=10, color="gray"),
            xshift=-10,
        )

        # Vertical "MEDIAN" line
        fig.add_shape(
            type="line",
            x0=median,
            y0=0,
            x1=median,
            y1=top_values - 0.5,
            line=dict(color="salmon", width=3, dash="dot"),
            row=row,
            col=1,
        )
        # Text annotation for the MEDIAN value
        fig.add_annotation(
            x=median,
            xref="x",
            y=top_values,
            yref="y",
            text="Median",
            hovertext=f"Median: {median:.2f}",
            align="center",
            row=row,
            col=1,
            textangle=50,
            font=dict(size=10, color="gray"),
            opacity=1,
        )

    fig.update_layout(
        title_text=title,
        uirevision=dict(editable=False),
        bargap=0,
        yaxis1=dict(
            showgrid=False,
            title=None,
            showticklabels=True,
            visible=True,
            zeroline=False,
        ),
        yaxis2=dict(
            showgrid=False,
            title=None,
            showticklabels=True,
        ),
        xaxis1=dict(
            showgrid=False,
            title=None,
            showticklabels=True,
            visible=True,
            zeroline=False,
        ),
        xaxis2=dict(
            showgrid=False,
            title=None,
            showticklabels=True,
            visible=True,
            zeroline=False,
        ),
        margin=dict(l=0, r=0, t=50, b=15),
        showlegend=False,
        hoverlabel=dict(
            bgcolor="#ff8fc5",
            font_family="Times New Roman",
        ),
        template="plotly_white",
        # presumably merged into the annotations added above to hide their
        # arrows - TODO confirm against the rendered figure
        annotations=[
            dict(
                showarrow=False,
            )
        ],
    )
    return fig


fig = plot_column(df=df, column=["atk", "def"], top_values=3)
fig.show()

### Are some types over- or under-represented?


In [None]:
df[["type", "stage"]].value_counts()

In [None]:
# df = data.groupby(['pulse', 'diet']).count()['time']

df_group = df.groupby(["stage", "type"])["digimon"].count()
df_group

In [None]:
df_group = df_group.reset_index()
df_group

In [None]:
fig = px.bar(
    df_group,
    x="stage",
    y="digimon",
    title="Stage Of Digimon and Amount of each",
    color="type",
    text="type",
    template="plotly_white",
)
fig.update_layout(
    barmode="group",
    plot_bgcolor="rgba(0,0,0,0)",
    yaxis=(dict(showgrid=False, title=None)),
    xaxis=(dict(showgrid=False, title=None, categoryorder="total descending")),
    legend=(dict(title={"text": "Type"})),
)

fig.update_traces(
    texttemplate="%{value}",
    textposition="outside",
    hovertemplate="<br>".join(
        ["Stage: %{x}", "Number of Digimons: %{y}", "Type: %{text}"]
    ),
)

fig.show()
# print("plotly express hovertemplate:", fig.layout)
# print("plotly express hovertemplate:", fig.data[0].hovertemplate)
#

### What’s the tradeoff between HP and SP?


In [None]:
df[["hp", "sp"]]

* Both the moves and support skills have short text descriptions. Can an NLP analysis reveal underlying clusters of moves?
* Are different types and attributes evenly represented across stages?

In [None]:
df_pie = df.groupby("attribute").stage.value_counts(ascending=False)

In [None]:
# OPTIONAL
fig.write_html("digimon-chart.html")

* Both the moves and support skills have short text descriptions. Can an NLP analysis reveal underlying clusters of moves?
* Are different types and attributes evenly represented across stages?

In [None]:
db_move.columns.tolist()

In [None]:
db_move.columns.str.lower()

In [None]:
# Function to CLEAN
def clean_df(df, views_threshold=100):
    """Clean the move table and add a power-per-SP efficiency column.

    Steps, in order: drop rows with missing values, pass the frame through
    ``clean_columns`` (imported at the top), lower-case the column labels,
    keep the moves whose ``power`` is greater than ``views_threshold`` and
    add ``power_sp_ratio``.

    Args:
        df: Move table. After the label clean-up it must expose the columns
            ``power`` and ``sp_cost``.
        views_threshold: Exclusive lower bound on ``power``.

    Returns:
        A new DataFrame with the extra column ``power_sp_ratio``: attack
        power obtained per SP point spent, rounded to two decimals. Higher is
        better, so ``nlargest`` on this column lists the most SP-efficient
        moves. A move whose ``sp_cost`` is 0 gets ``inf``.
    """
    return (
        df.dropna()
        .pipe(clean_columns)
        .rename(columns=str.lower)
        .query("power > @views_threshold")
        .assign(
            # Power divided by SP cost, not the reverse: with sp_cost / power
            # the largest values would mark the *least* efficient moves.
            power_sp_ratio=lambda df_: (df_.power / df_.sp_cost).round(2)
        )
    )


# end def

df_move = clean_df(db_move)
df_move.head(3)

### Which set of moves will get the best ratio of attack power to SP spent?


In [None]:
df_move.nlargest(3, columns="power_sp_ratio")

In [None]:
db_move[["Move", "SP Cost", "Power"]]

In [None]:
# How many moves share each row's power/SP ratio. transform("size") returns a
# Series aligned with df_move's own index; the plain groupby(...).size()
# result is indexed by the ratio values and would be misaligned on assignment.
df_move["sp_power"] = df_move.groupby("power_sp_ratio")["power_sp_ratio"].transform(
    "size"
)
threshold = 200

# Count the frequency of each category.
# The column was added to df_move above; db_move has no "sp_power" column.
counts = df_move["sp_power"].value_counts(ascending=False)
# Identify rare categories
rare_categories = counts[counts <= threshold]
rare_categories

In [None]:
df_move

In [None]:
db_move_sp_power = db_move.groupby(["Move", "SP Cost", "Power"]).size()

# Radar chart of how many moves fall into each power tier.
# The previous traces read ratio_2021 / ratio_2022 / ratio_2023, which are
# not defined in this notebook, so the cell could not run.
categories = ["Weak", "Medium", "Strong"]

# Split the observed "Power" range into one equal-width bin per tier.
power_tier = pd.cut(db_move["Power"], bins=len(categories), labels=categories)
# reindex keeps the tiers in Weak -> Medium -> Strong order.
tier_counts = power_tier.value_counts().reindex(categories)

fig = go.Figure()
fig.add_trace(
    go.Scatterpolar(
        r=tier_counts.to_numpy(),
        theta=categories,
        fill="toself",
        name="Moves per power tier",
    )
)
fig.update_layout(
    title={
        "text": "Moves by Power Tier",
        "y": 0.9,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    },
    template="plotly_dark",
)
fig.show()