
<p style="border:2px solid black"> </p>
<span style="font-family:Lucida Bright;">
<p style="margin-bottom:0.8cm"></p>
<center>
<font size="6"><b>Understanding Music Listening Habits</b></font>
<p style="margin-bottom:-0.1cm"></p>
<font size="6"><b>Using Large-scale Smartphone Data</b>  </font>

<p style="margin-bottom:0.5cm"></p>
<font size="3"><b>Wojciech Mazurkiewicz, DTU, 14 May 2021</b></font>
<p style="margin-bottom:1cm"></p>
<font size="5"><b>Initialization</b></font>
<br>
<font size="3"><b></b></font>
</center>
<p style="margin-bottom:0.4cm"></p>
<p style="border:2px solid black"> </p>

    

# Imports
<p style="border:2px solid black"> </p>


The imports are defined in [toolbox/imports.py](toolbox/imports.py)

In [1]:
from toolbox.imports import *

# Configuration
<p style="border:2px solid black"> </p>


## Notebook options


In [3]:
# Decide which output is shown below the cells.
# NOTE(review): "none" presumably turns off the automatic echo of
# expression values, so results have to be shown explicitly (e.g. with
# display/print) — confirm against the IPython documentation.
InteractiveShell.ast_node_interactivity = "none"

## Matplotlib options


In [4]:
# Show matplotlib plots inline.
# NOTE: the line below is an IPython magic command, so this cell only
# runs inside IPython/Jupyter and is not valid in a plain Python script.
%matplotlib inline

## Pandas options


In [5]:
# Define the format in which the numbers will be shown in
# the pandas dataframes: a comma as thousands separator and
# two digits after the decimal point.
pd.options.display.float_format = '{:,.2f}'.format

# Decide how to handle the "SettingWithCopyWarning" warning
pd.options.mode.chained_assignment = 'warn'  # set to None to disable

# At multirow, top-align (False) or center-align (True)
# NOTE(review): the pandas options documentation appears to describe this
# flag as switching multirow output for MultiIndex rows in to_latex on or
# off, and the option may not exist in newer pandas releases — confirm.
pd.options.display.latex.multirow = False

# Set the maximum number of rows and columns to show when
# displaying a Pandas dataframe.
pd.options.display.max_rows = 150
pd.options.display.max_columns = 200

## PySpark options

In [None]:
# Define the AVRO jar path
# NOTE: this is a hard-coded absolute path, so the jar has to exist at
# this exact location on the machine that runs the notebook.
avro_jar_path = '/data/work/shared/tools/spark-avro_2.12-3.0.0.jar'

# Add the avro jar path to the environment variables. The resulting
# value is '--jars <avro_jar_path> pyspark-shell'.
# NOTE(review): presumably this has to run before the Spark session is
# created in order to take effect — confirm.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    f'--jars {avro_jar_path}'
    f' pyspark-shell'
)

## Warnings


In [6]:
# Decide how to handle warnings.
# Both filters below silence a whole warning category for the rest of
# the session: generic UserWarning and pandas' PerformanceWarning.
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.filterwarnings(action='ignore', category=pd.errors.PerformanceWarning)

# Paths
<p style="border:2px solid black"> </p>


In [7]:
# Project root.
# NOTE: hard-coded absolute Windows path (raw string, so the backslashes
# are kept literally); it has to be adjusted when the notebook is run
# from a different machine or directory layout.
path_root = Path(
    r'C:\GDrive\DTU\Kurser\Social_Data_Analysis_and_Visualization_02806\final_project')

# Resources root: the 'resources' directory directly below the project root.
path_resources = path_root / 'resources'

# Function definitions
<p style="border:2px solid black"> </p>


## General functions


In [8]:
# A function that returns a dict of object attributes.
def get_obj_attributes(obj):
    """Collect the non-callable, non-dunder members of ``obj``.

    Every name reported by ``dir(obj)`` that does not start with a
    double underscore and whose value is not callable is mapped to
    that value.
    """
    attributes = {}
    for name in dir(obj):
        # Dunder members (``__class__``, ``__doc__``, ...) are left out.
        if name.startswith('__'):
            continue

        value = getattr(obj, name)
        if not callable(value):
            attributes[name] = value

    return attributes


# A function that returns a dict of object methods.
def get_obj_methods(obj):
    """Collect the callable, non-dunder members of ``obj``.

    Every name reported by ``dir(obj)`` that does not start with a
    double underscore and whose value is callable is mapped to that
    value.
    """
    methods = {}
    for name in dir(obj):
        # Dunder members (``__init__``, ``__repr__``, ...) are left out.
        if name.startswith('__'):
            continue

        member = getattr(obj, name)
        if callable(member):
            methods[name] = member

    return methods


# A function that prints a string in markdown format.
def printmd(string):
    """Wrap ``string`` in a ``Markdown`` object and pass it to ``display``."""
    rendered = Markdown(string)
    display(rendered)


# A function that returns the unique values of an iterable.
def unique(list_):
    """Return the distinct elements of ``list_`` as a list.

    The elements pass through a ``set``, so they have to be hashable
    and the order of the returned list is not guaranteed.
    """
    distinct_values = set(list_)
    return [*distinct_values]

## Plotting functions


In [9]:
# A function that applies default formatting to an axes.
def format_axes(axes: plt.Axes,
                keep_box=False):
    """Apply the default look to ``axes``.

    The face colour is set to white. Unless ``keep_box`` is true, the
    top and right spines are coloured white as well.
    """
    if not keep_box:
        for side in ('top', 'right'):
            axes.spines[side].set_color('white')

    axes.set_facecolor("white")


# A function that applies default formatting to annotation
# of an axes.
def format_axes_annotation(axes: plt.Axes):
    """Set the font sizes of the axis labels (14) and the title (16)."""
    for axis_label in (axes.xaxis.label, axes.yaxis.label):
        axis_label.set_fontsize(14)

    axes.title.set_fontsize(16)


# A function for creating common x-label for the figure.
def figure_x_label(figure: plt.Figure,
                   label: str,
                   y_position=0.04,
                   font_size=16):
    """Write ``label`` on ``figure``, horizontally centred at x = 0.5.

    ``y_position`` is the y coordinate handed to ``figure.text`` and
    ``font_size`` the size entry of the font dictionary.
    """
    font_settings = {'size': font_size}
    figure.text(0.5, y_position, label,
                fontdict=font_settings,
                ha='center')


# A function for creating common y-label for the figure.
def figure_y_label(figure: plt.Figure,
                   label: str,
                   x_position=0.04,
                   font_size=16):
    """Write ``label`` on ``figure`` as vertical text centred at y = 0.5.

    ``x_position`` is the x coordinate handed to ``figure.text`` and
    ``font_size`` the size entry of the font dictionary.
    """
    font_settings = {'size': font_size}
    figure.text(x_position, 0.5, label,
                fontdict=font_settings,
                rotation='vertical',
                va='center')


# A function that draws a horizontal line across the entire axes.
def draw_threshold(value: float,
                   axes: plt.Axes,
                   linewidth=1,
                   linestyle='-',
                   color=None,
                   title=None):
    """Draw a horizontal line at ``y = value`` spanning the current x-range.

    Parameters
    ----------
    value
        The y coordinate of the line.
    axes
        The axes to draw on. Its x- and y-limits are read when the
        function is called, so the data should be plotted beforehand.
    linewidth, linestyle, color
        Passed on to ``axes.plot`` for the threshold line.
    title
        Optional text written slightly above the left end of the line.
        Nothing is written when it is ``None``.
    """
    # Get axes limits and ranges.
    x_min, x_max = axes.get_xlim()
    x_range = x_max - x_min
    y_min, y_max = axes.get_ylim()
    y_range = y_max - y_min

    # Plot the threshold line using the caller's line settings (these
    # used to be ignored in favour of hard-coded values).
    axes.plot([x_min, x_max], [value, value],
              linewidth=linewidth,
              linestyle=linestyle,
              color=color)

    # Write a title above the threshold line, offset by 1 % of the
    # x-range and 2 % of the y-range.
    if title is not None:
        axes.text(x_min + 0.01 * x_range,
                  value + 0.02 * y_range,
                  title)




## Dataframe functions


In [10]:
# A function that gets column names of a dataframe.
def get_df_columns(df, exclude=None):
    """Return the column names of ``df`` as a list, minus ``exclude``.

    ``exclude`` may be ``None`` (nothing is excluded), a list or tuple
    of names, or a single name.
    """
    # Normalise 'exclude' to a collection of names.
    if exclude is None:
        excluded = []
    elif isinstance(exclude, (list, tuple)):
        excluded = exclude
    else:
        excluded = [exclude]

    remaining = []
    for name in df.columns.to_list():
        if name not in excluded:
            remaining.append(name)
    return remaining


# A function that gets row names of a dataframe.
def get_df_rows(df, exclude=None):
    """Return the index labels of ``df`` as a list, minus ``exclude``.

    ``exclude`` may be ``None`` (nothing is excluded), a list or tuple
    of labels, or a single label.
    """
    # A single label is wrapped in a list; None means "exclude nothing".
    if exclude is None:
        skipped = []
    elif isinstance(exclude, (list, tuple)):
        skipped = exclude
    else:
        skipped = [exclude]

    # Keep every row label that has not been excluded.
    return list(filter(lambda label: label not in skipped,
                       df.index.to_list()))


# A function that resets the names of indices
def reset_df_index_names(df):
    """Return ``df`` with every row and column index level named ``''``.

    The renaming goes through ``rename_axis``, first for the columns
    and then for the rows.
    """
    # One empty name per level of the respective index.
    blank_column_names = [''] * df.columns.nlevels
    blank_row_names = [''] * df.index.nlevels

    renamed = df.rename_axis(blank_column_names, axis="columns")
    return renamed.rename_axis(blank_row_names, axis="rows")


# A function for balancing a dataframe so that the number of rows
# containing each value present in the designated column will be the same.
def balance_dataframe(df: pd.DataFrame, column_name):
    """Down-sample ``df`` so every value of ``column_name`` is equally frequent.

    For each distinct (non-missing) value in ``df[column_name]`` a random
    sample of rows is drawn whose size equals the count of the least
    frequent value. The samples are concatenated in order of first
    appearance of the values, keeping the original index labels.

    Parameters
    ----------
    df
        The dataframe to balance.
    column_name
        The column whose values should end up equally frequent.

    Returns
    -------
    pd.DataFrame
        The balanced dataframe. An empty ``pd.DataFrame()`` is returned
        when the column holds no non-missing values.
    """
    # Get the number of rows for the least frequent value of the chosen
    # column (value_counts leaves out missing values).
    lowest_frequency = df[column_name].value_counts().min()

    # For each value in the column, randomly choose the number of samples
    # that corresponds to the least frequent value in the column. Missing
    # values are skipped: they are not counted above and an equality
    # comparison would never select their rows.
    samples = [
        df.loc[df[column_name] == value].sample(lowest_frequency)
        for value in df[column_name].dropna().unique()
    ]

    # pd.concat cannot handle an empty list of frames.
    if not samples:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; a single concat also
    # avoids re-copying the accumulated frame for every value.
    return pd.concat(samples)



# A function that sorts the columns in alphabetical order
# and puts the user-chosen columns first
def df_sort_columns(df: pd.DataFrame,
                    first_columns=None):
    """Return ``df`` with sorted columns, led by ``first_columns``.

    The column names are sorted in ascending order. Each name in
    ``first_columns`` that exists in the dataframe is then moved to the
    front, so the leading columns follow the order of ``first_columns``.
    Names that are not columns of ``df`` are ignored.
    """
    # No preferred columns means plain sorted order.
    leading_columns = [] if first_columns is None else first_columns

    ordered = sorted(df.columns)

    # Walk the preferred columns back to front so that the first one
    # ends up at position 0.
    for name in reversed(leading_columns):
        if name in ordered:
            ordered.insert(0, ordered.pop(ordered.index(name)))

    return df[ordered]


# A function that creates a new column representing data in 'value_columns'
# for every unique value in 'category_columns'.
def df_create_column_for_each_unique_value(df,
                                           category_columns,
                                           value_columns,
                                           aggfunc='first'):
    """Spread the values of ``category_columns`` out into separate columns.

    The dataframe is grouped by every column except the last value
    column, aggregated with ``aggfunc`` and the category levels (plus
    all value columns but the last) are unstacked into the columns.

    Parameters
    ----------
    df
        The dataframe to reshape.
    category_columns
        A column name, or a list/tuple of names, whose unique values
        become new columns.
    value_columns
        A column name, or a list/tuple of names, holding the data.
    aggfunc
        Aggregation applied to each group; defaults to ``'first'``.

    Returns
    -------
    The reshaped dataframe with a reset row index and with every column
    index level named ``''``.
    """
    # Always represent category and value columns as a list or tuple.
    if not isinstance(category_columns, (list, tuple)):
        category_columns = [category_columns]

    if not isinstance(value_columns, (list, tuple)):
        value_columns = [value_columns]

    # Create a column order for grouping so that all the value columns
    # come last and category columns second last. We leave out 1
    # value column for the result. Both inputs are converted to lists so
    # that a tuple can be combined with a list.
    cat_and_value_columns = list(category_columns) + list(value_columns)
    column_order = (
        get_df_columns(df, exclude=cat_and_value_columns)
        + cat_and_value_columns[:-1]
    )

    # The innermost grouping levels, addressed by negative position,
    # are the ones that get moved into the columns.
    levels_to_unstack = list(range(-len(cat_and_value_columns) + 1, 0))

    # Create columns from unique values by grouping and unstacking.
    # The requested aggregation is applied (it used to be ignored in
    # favour of a hard-coded 'first').
    df = (
        df
        .groupby(column_order)
        .agg(aggfunc)
        .unstack(levels_to_unstack)
        .reset_index()
    )

    # Delete the names of the index levels
    df = df.rename_axis(['' for level in range(df.columns.nlevels)],
                        axis="columns")
    return df

# OLD IMPLEMENTATION
#     return (
#         df
#         .pivot_table(values=value_columns,
#                      index=get_df_columns(df,
#                                           exclude=category_columns + value_columns),
#                      columns=category_columns,
#                      aggfunc='first')
#         .reset_index()
#         .rename_axis(('', ''), axis="columns")
#     )


# A function that flattens the multiindex of a dataframe.
def flatten_multiindex(df, axis='columns'):
    """Replace a multi-level index of ``df`` by a flat one, in place.

    Every tuple (or list) label is turned into a single string by
    joining its non-empty levels with ``', '``. Labels that are not
    tuples or lists are kept unchanged.

    Parameters
    ----------
    df
        The dataframe to modify. Its index is reassigned in place.
    axis
        ``1`` or ``'columns'`` to flatten the column index, ``0`` or
        ``'rows'`` to flatten the row index.

    Returns
    -------
    The same dataframe object that was passed in.

    Raises
    ------
    ValueError
        If ``axis`` is none of the accepted values.
    """
    # Get the desired index
    if axis in [1, 'columns']:
        flatten_columns = True
    elif axis in [0, 'rows']:
        flatten_columns = False
    else:
        raise ValueError(f'Invalid axis: "{axis}".')

    index = df.columns if flatten_columns else df.index

    # Join all the levels except the empty ones with a ', '. Joining the
    # filtered levels avoids a stray leading separator when the first
    # level is empty, e.g. ('', 'a') gives 'a' rather than ', a'.
    flat_index = [
        ', '.join(level for level in element if level)
        if isinstance(element, (tuple, list))
        else element
        for element in index.values
    ]

    # Assign the index to the dataframe
    if flatten_columns:
        df.columns = flat_index
    else:
        df.index = flat_index

    return df


# A function that creates an empty multiindex of a given depth.
def create_empty_multiindex(n_levels, names=None):
    """Build a ``pd.MultiIndex`` with ``n_levels`` levels and no entries.

    ``names`` is handed to ``pd.MultiIndex.from_arrays`` as the level
    names.
    """
    # One empty array per requested level.
    empty_levels = [[] for _ in range(n_levels)]
    return pd.MultiIndex.from_arrays(empty_levels, names=names)