<a href="https://colab.research.google.com/github/theresaskruzna/riiid_knowledge_tracing/blob/main/07_Functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# System

## Check memory usage

In [None]:
def memory_usage(df):
    """
    Report the total memory footprint of a DataFrame as a human-readable string.

    Parameters:
    - df: Pandas DataFrame whose memory is measured (object columns are inspected deeply)

    Returns:
    - String such as "12.34 MB" (mebibytes, two decimal places)
    """
    bytes_per_mb = 1024 ** 2
    total_bytes = df.memory_usage(deep=True).sum()  # per-column usage (index included), summed
    megabytes = total_bytes / bytes_per_mb
    return f"{megabytes:.2f} MB"

# Visualisations

## Histogram

In [None]:
def plot_histogram(data, column, bins=None, hue=None, title=None):
    """
    Creates a simple histogram (with a KDE curve on top) for a specified column.

    Parameters:
    - data: Pandas DataFrame
    - column: Column name to plot
    - bins: Optional bin specification (number of bins, rule name or bin edges);
      None falls back to seaborn's "auto" bin selection
    - hue: Optional categorical column for grouping
    - title: Optional plot title
    """
    # seaborn's own default for `bins` is the string "auto"; forwarding None
    # instead is rejected when the bin edges are computed, so translate it here.
    if bins is None:
        bins = "auto"

    plt.figure(figsize=(8, 5)) # Create a 'canvas' to draw the plot onto
    sns.histplot(data=data, x=column, bins=bins, hue=hue, kde=True) # create histogram
    # df, column, bin rule, split into categories if provided, line on top of the histogram bars

    # Custom labels and title
    plt.xlabel(column) # horizontal line label
    plt.ylabel("Count") # vertical line label
    if title:
        plt.title(title) # add title to plot if provided in function call

    plt.show() # display plot

# Example usage:
# plot_histogram(df, 'column_name', bins=20, hue='category_column', title='Histogram')

# additional parameters:
# color='blue', xlabel=None, ylabel="Count", figsize=(10, 6), kde=False
"""
    Parameters:
    - color: Optional color for the histogram bars.
    - xlabel: Optional x-axis label (default uses column name).
    - ylabel: Y-axis label (default="Count").
    - figsize: Tuple defining the figure size (default=(8,5)).
    - kde: Boolean to include KDE curve overlay (default=True).
    """

## KDE plot

In [None]:
def plot_kde(data, column, hue=None, title=None):
    """
    Draw a filled kernel-density estimate for one DataFrame column and show it.

    Parameters:
    - data: Pandas DataFrame holding the values
    - column: Name of the column whose density is plotted on the x-axis
    - hue: Optional column used to draw one density curve per group
    - title: Optional plot title (skipped when None or empty)
    """
    plt.figure(figsize=(8, 5))  # fixed-size canvas for the plot

    # seaborn draws on the current axes and hands them back for further styling
    axes = sns.kdeplot(data=data, x=column, hue=hue, fill=True)

    # Axis labels: column name below, "Density" on the side
    axes.set_xlabel(column)
    axes.set_ylabel("Density")
    if title:
        axes.set_title(title)

    plt.show()  # render the figure

# Example usage:
# plot_kde(df, 'column_name', hue='category_column', title='KDE Plot')

# additional parameters:
# bw_adjust=1, color=None, xlabel=None, ylabel="Density", figsize=(8, 5)
"""
    Parameters:
    - bw_adjust: Bandwidth adjustment factor (default=1).
    - color: Optional color for the KDE curve.
    - xlabel: Optional x-axis label.
    - ylabel: Y-axis label (default="Density").
    - figsize: Tuple defining the figure size (default=(8,5)).
    """

## Bar plot

In [None]:
def plot_bar(data, x, y=None, title=None, hue=None, color=None,
             xlabel=None, ylabel=None, figsize=(8, 5)):
    """
    Creates a customizable bar plot.

    Draws a value-based bar plot (seaborn barplot) when `y` is given and a
    count plot (seaborn countplot) otherwise, then shows the figure.

    Parameters:
    - data: Pandas DataFrame containing the data.
    - x: Column name for the x-axis categories.
    - y: Optional column name for the y-axis values (if None, counts are used).
    - title: Optional title for the plot.
    - hue: Optional column for grouping (creates grouped bars).
    - color: Optional color for the bars.
    - xlabel: Optional x-axis label (default uses x column name).
    - ylabel: Optional y-axis label (default uses y column name or "Count").
    - figsize: Tuple defining the figure size (default=(8,5)).
    """
    plt.figure(figsize=figsize) # Create a 'canvas' to draw the plot onto

    # Create either a value plot or count plot.
    # Compare against None (not truthiness) so a column named 0 still counts as "given".
    has_values = y is not None
    if has_values:
        sns.barplot(data=data, x=x, y=y, hue=hue, color=color)
    else:
        sns.countplot(data=data, x=x, hue=hue, color=color)

    # Custom labels and title; explicit labels win over the column-name defaults
    plt.xlabel(xlabel if xlabel is not None else x) # horizontal line label
    if ylabel is None:
        ylabel = y if has_values else "Count"
    plt.ylabel(ylabel) # vertical line label
    if title:
        plt.title(title) # add title to plot if provided in function call

    # Rotate x-axis labels to prevent overlap when there are many categories
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout() # adjusts the plot padding so labels are not cut off

    plt.show() # display plot

# Example usage:
# plot_bar(df, x='category_column')                       # Simple count plot
# plot_bar(df, x='category_column', y='value_column')     # Value-based bar plot
# plot_bar(df, x='category_column', hue='group_column')   # Grouped bar plot

# additional parameters:
# hue=None, color=None, xlabel=None, ylabel=None, figsize=(8, 5)
    """
    Parameters:
    - hue: Optional column for grouping (creates grouped bars).
    - color: Optional color for the bars.
    - xlabel: Optional x-axis label (default uses x column name).
    - ylabel: Optional y-axis label (default uses y column name or "Count").
    - figsize: Tuple defining the figure size (default=(8,5)).
    """

## Box plot

In [None]:
def plot_boxplot(data, x=None, y=None, hue=None, color=None, title=None, horizontal=False):
    """
    Creates a customizable box plot.

    Parameters:
    - data: Pandas DataFrame containing the data.
    - x: Column name drawn on the x-axis (categorical for vertical plots,
      numeric for horizontal plots).
    - y: Column name drawn on the y-axis (numeric for vertical plots,
      categorical for horizontal plots).
    - hue: Optional column for additional grouping.
    - color: Optional color for the boxes.
    - title: Optional title for the plot.
    - horizontal: Boolean to plot horizontally (default is vertical).
    """
    # Set orientation
    orient = 'h' if horizontal else 'v'

    plt.figure(figsize=(8, 5)) # Create a 'canvas' to draw the plot onto
    # `color` is forwarded so the documented parameter actually takes effect
    sns.boxplot(data=data, x=x, y=y, hue=hue, color=color, orient=orient) # Create the boxplot

    # Custom labels and title.
    # seaborn always draws `x` on the x-axis and `y` on the y-axis; `orient`
    # only selects which of them is the grouping axis, so the labels must not
    # be swapped for horizontal plots.
    if x is not None:
        plt.xlabel(x)
    if y is not None:
        plt.ylabel(y)
    if title:
        plt.title(title) # add title to plot if provided in function call

    # Handle label rotation for vertical plots with many categories
    if orient == 'v' and x is not None:
        plt.xticks(rotation=45, ha='right')

    plt.tight_layout() # adjusts the plot padding so labels are not cut off
    plt.show() # display plot

# Example usage:
# plot_boxplot(df, x='category', y='value')                  # Basic vertical boxplot
# plot_boxplot(df, x='value', y='category', horizontal=True) # Horizontal boxplot
# plot_boxplot(df, x='category', y='value', hue='group')     # Grouped boxplot

    # additional parameters
    # xlabel=None, ylabel=None, figsize=(8, 5), orient='v', showfliers=True, palette=None
    """
    Parameters:
    - color: Optional color for the boxes.
    - xlabel: Optional x-axis label.
    - ylabel: Optional y-axis label.
    - figsize: Tuple defining the figure size (default=(8,5)).
    - orient: Orientation of the plot ('v' for vertical, 'h' for horizontal).
    - showfliers: Whether to show outlier points (default=True).
    - palette: Color palette for groups when hue is specified.
    """

## Scatter plot

In [None]:
def plot_scatterplot(data, x, y, hue=None, title=None):
    """
    Draw a scatter plot of two DataFrame columns and show it.

    Parameters:
    - data: Pandas DataFrame containing the data.
    - x: Column name plotted along the x-axis.
    - y: Column name plotted along the y-axis.
    - hue: Optional column whose values determine point color.
    - title: Optional title for the plot (skipped when None or empty).
    """
    plt.figure(figsize=(8, 5))  # fixed-size canvas for the plot

    # seaborn draws on the current axes and hands them back for further styling
    axes = sns.scatterplot(data=data, x=x, y=y, hue=hue)

    # Label both axes with their column names
    axes.set_xlabel(x)
    axes.set_ylabel(y)
    if title:
        axes.set_title(title)

    plt.tight_layout()  # keep labels inside the figure bounds
    plt.show()  # render the figure

# Example usage:
# plot_scatterplot(df, x='age', y='salary')                      # Basic scatter plot
# plot_scatterplot(df, x='age', y='salary', hue='department')    # Colored by group
# plot_scatterplot(df, x='age', y='salary', size='experience')   # Sized by variable (needs the additional `size` parameter listed below)

    # additional parameters
    # size=None, style=None, color=None, alpha=0.7, xlabel=None, ylabel=None, figsize=(8, 6), legend='auto', palette=None
    """
    Parameters:
    - size: Optional column for point size variation.
    - style: Optional column for point style variation.
    - color: Optional fixed color for all points (ignored if hue is specified).
    - alpha: Transparency level of points (0 to 1).
    - xlabel: Optional x-axis label (default uses x column name).
    - ylabel: Optional y-axis label (default uses y column name).
    - figsize: Tuple defining the figure size (default=(8,6)).
    - legend: Legend position ('auto', 'brief', 'full', or False).
    - palette: Color palette for groups when hue is specified.
    """

## Heat map

In [None]:
def plot_heatmap(data, annot=True, cmap="viridis", title=None, xlabel=None, ylabel=None):
    """
    Draw a heatmap of 2D data, show it, and return the seaborn result.

    Parameters:
    - data: DataFrame or 2D array to visualize as heatmap.
    - annot: Whether to write the value into each cell (default=True).
    - cmap: Colormap name or object (default="viridis").
    - title: Optional title for the plot.
    - xlabel: Optional x-axis label.
    - ylabel: Optional y-axis label.

    Returns:
    - The object returned by sns.heatmap, for further customization.
    """
    plt.figure(figsize=(8, 5))  # fixed-size canvas for the plot

    heatmap = sns.heatmap(data, annot=annot, cmap=cmap)

    # Only override labels/title that were actually supplied
    if xlabel:
        heatmap.set_xlabel(xlabel)
    if ylabel:
        heatmap.set_ylabel(ylabel)
    if title:
        heatmap.set_title(title)

    plt.tight_layout()  # keep labels inside the figure bounds
    plt.show()  # render the figure

    return heatmap

# Example usage:
# plot_heatmap(df.corr())                             # Correlation heatmap
# plot_heatmap(pivot_table, cmap="YlOrRd", fmt=".0f") # Custom colored heatmap (needs the additional `fmt` parameter listed below)
# mask = np.triu(np.ones_like(df.corr(), dtype=bool)) # Upper triangle mask
# plot_heatmap(df.corr(), mask=mask, cmap="coolwarm") # Masked heatmap (needs the additional `mask` parameter listed below)

    # additional parameters
    # center=None, robust=False, mask=None, figsize=(10, 8), vmin=None, vmax=None, fmt=".2f", linewidths=0, cbar=True
    """
    Parameters:
    - center: Value to center the colormap at (useful for diverging data).
    - robust: If True, compute colormap bounds from robust percentiles.
    - mask: Boolean array or DataFrame to mask cells (True = hidden).
    - figsize: Tuple defining the figure size (default=(10,8)).
    - vmin, vmax: Limits for the colormap.
    - fmt: String formatting for annotations.
    - linewidths: Width of lines between cells.
    - cbar: Whether to draw a colorbar.
    """