<p style="border:2px solid black"> </p>
<span style="font-family:Lucida Bright;">
<p style="margin-bottom:0.8cm"></p>
<center>
<font size="6"><b>Understanding Music Listening Habits</b></font>
<p style="margin-bottom:-0.1cm"></p>
<font size="6"><b>Using Large-scale Smartphone Data</b>  </font>
    
<p style="margin-bottom:0.5cm"></p>
<font size="3"><b>Wojciech Mazurkiewicz, DTU, 14 May 2021</b></font>
<p style="margin-bottom:1cm"></p>
<font size="5"><b>Initial Statistics</b></font>
<br>
<font size="3"><b></b></font>
</center>
<p style="margin-bottom:0.4cm"></p>
<p style="border:2px solid black"> </p>


# Initialization
<hr style="border:2px solid black"></hr>


Please note that the pregenerated cell outputs will not display correctly unless the notebook is **trusted**.

The initialization procedure is defined in the notebook [Initialization](initialization.ipynb).

In [None]:
# Run the shared initialization notebook before anything else.
# NOTE(review): names used below without a visible import (e.g. `t`, `f`,
# `Config`, `printmd`, `pd`, `sns`, `plt`) are presumably defined there — confirm.
%run initialization.ipynb

# Create a Spark session
<hr style="border:2px solid black"></hr>


In [None]:
#%% Initialize a Spark session.
# `t.spark.create_session` is a project helper; the string argument is
# presumably the Spark application name — TODO confirm.
spark = t.spark.create_session('Music_Activity')

# Enable Apache Arrow for Spark -> pandas conversion; this speeds up the
# `toPandas()` calls used throughout the notebook.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Load clean data
<hr style="border:2px solid black"></hr>


In [None]:
# Location of the cleaned music data.
data_path = Config.Path.music_data_clean_root

# Read the parquet files and discard the "user_id_prefix" column
# in a single chain.
df_full = (
    t.load_data_from_files(data_path, spark, method='parquet')
    .drop('user_id_prefix')
)

# Flag controlling whether intermediate tables are displayed
# by the cells of this notebook.
display_middle_results = True

# Preview the first ten rows.
if display_middle_results:
    display(df_full.limit(10).toPandas())

# Turn on caching if desired
<hr style="border:2px solid black"></hr>


In [None]:
# Mark the full dataframe for caching so that the repeated actions below
# can reuse it. Comment this line out to turn caching off.
df_full.cache()

# Get the row count of the full database
<hr style="border:2px solid black"></hr>


In [None]:
# Count the number of rows in the full dataframe (this triggers a Spark job).
n_rows = df_full.count()

# Render the result as markdown, with a thousands separator.
# (The message previously contained the typos "the the full databse".)
printmd(f'Number of rows in the full database: **{n_rows:,}**')

# Data about the *activities*
<hr style="border:2px solid black"></hr>


## Activity ID

### Number of undefined entries

In [None]:
# Count the number of undefined entries in the `activity_id` column.
# NOTE(review): what counts as "undefined", and how verbose='markdown'
# reports it, is decided by the project helper — confirm there.
t.spark.count_undefined_entries(df_full, 'activity_id', verbose='markdown')

## Activity start time

In [None]:
# Count the number of undefined entries in the `activity_start_time` column.
# NOTE(review): what counts as "undefined", and how verbose='markdown'
# reports it, is decided by the project helper — confirm there.
t.spark.count_undefined_entries(df_full, 'activity_start_time', verbose='markdown')

In [None]:
# Number of rows whose `activity_start_time` equals one of the listed
# placeholder strings.
display(
    df_full
    .filter(f.col('activity_start_time').isin('', '<unknown>', '�'))
    .count()
)

## Activity duration

# Data about the *users*
<hr style="border:2px solid black"></hr>


# Data about the *playback devices*
<hr style="border:2px solid black"></hr>


# Data about the *audio tracks*
<hr style="border:2px solid black"></hr>


## Distinct values of `track_player`

In [None]:
# Distinct values of the `track_player` column.
# NOTE(review): the original read from `df`, which is not assigned anywhere
# in this notebook before this cell, so a fresh kernel raised NameError.
# `df_full` (loaded above) is used instead — confirm that
# initialization.ipynb does not define a different `df`.
df_distinct_track_player = (
    df_full
    .select('track_player')
    .distinct()
)

n_distinct_track_players = df_distinct_track_player.count()
print(f'The number of distinct track players is: {n_distinct_track_players}')

# Display the table. Jupyter only renders a bare expression when it is the
# last top-level statement of a cell, so inside the `if` the preview has to
# be passed to display() explicitly.
if display_middle_results:
    display(df_distinct_track_player.limit(100).toPandas().head(5))

## Types of `track_uri`

In [None]:
# Distinct "types" of `track_uri`, where the type is the first 20 characters
# of the URI (or the whole URI when it is shorter than that).
# NOTE(review): the original read from `df`, which is not assigned anywhere
# in this notebook before this cell; `df_full` is used instead — confirm that
# initialization.ipynb does not define a different `df`.
df_types_of_track_uri = (
    df_full
    .withColumn('track_uri_type',
                f.when(f.length('track_uri') >= 20,
                       f.substring('track_uri', 1, 20))
                .otherwise(f.col('track_uri')))
    .select('track_uri_type')
    .distinct()
)

n_types_of_track_uri = df_types_of_track_uri.count()
print(f'The number of distinct track uri types is: {n_types_of_track_uri}')

In [None]:
# Display the table. A bare expression nested inside `if` is not rendered by
# Jupyter, so the preview is passed to display() explicitly.
if display_middle_results:
    display(df_types_of_track_uri.limit(100).toPandas().head(5))

In [None]:
# One example `track_uri` for every distinct URI prefix, where the prefix is
# the part of the URI before the first ':'.
# NOTE(review): the original read from `df`, which is not assigned anywhere
# in this notebook before this cell; `df_full` is used instead — confirm that
# initialization.ipynb does not define a different `df`.
df_track_uri_prefix = (
    df_full
    .where(f.col('track_uri').isNotNull())
    .withColumn('track_uri_prefix',
                f.substring_index('track_uri', ':', 1))
    .select(['track_uri_prefix', 'track_uri'])
    .dropDuplicates(['track_uri_prefix'])
)

# The message previously said "types", although prefixes are counted here.
n_uri_prefix = df_track_uri_prefix.count()
print(f'The number of distinct track uri prefixes is: {n_uri_prefix}')

# Display the table. A bare expression nested inside `if` is not rendered by
# Jupyter, so the preview is passed to display() explicitly.
if display_middle_results:
    display(df_track_uri_prefix.limit(100).toPandas().head(5))

In [None]:
# Pull at most ten (prefix, example URI) rows into a pandas dataframe.
dfp_track_uri_prefix = df_track_uri_prefix.limit(10).toPandas()

## Types of `track_id`

In [None]:
# One example `track_id` for every distinct "type", where the type is the
# first 7 characters of the ID (or the whole ID when it is shorter).
# NOTE(review): the original read from `df`, which is not assigned anywhere
# in this notebook before this cell; `df_full` is used instead — confirm that
# initialization.ipynb does not define a different `df`.
df_types_of_track_id = (
    df_full
    .withColumn('track_id_type',
                f.when(f.length('track_id') >= 7,
                       f.substring('track_id', 1, 7))
                .otherwise(f.col('track_id')))
    .select(['track_id_type', 'track_id'])
    .dropDuplicates(['track_id_type'])
)

# Count the number of track id types.
# (The message previously said "track uri types" for this `track_id` count.)
n_types_of_track_id = df_types_of_track_id.count()
print(f'The number of distinct track id types is: {n_types_of_track_id}')

In [None]:
# Display the dataframe. A bare expression nested inside `if` is not rendered
# by Jupyter, so the preview is passed to display() explicitly.
if display_middle_results:
    display(df_types_of_track_id.limit(100).toPandas().head(5))

## Co-occurrence of `track_id` and `track_uri`

In [None]:
# Count the rows in which both `track_uri` and `track_id` are non-null.
# NOTE(review): the original read from `df`, which is not assigned anywhere
# in this notebook before this cell; `df_full` is used instead — confirm that
# initialization.ipynb does not define a different `df`.
n_track_id_and_uri_nonempty = (
    df_full
    .where((f.col('track_uri').isNotNull())
           & (f.col('track_id').isNotNull()))
    .count()
)

# The message previously named the columns "tracks_id" / "tracks_uri",
# which do not match the columns that are actually tested.
print(f'Number of rows where both "track_id" and '
      f'"track_uri" are non-null: {n_track_id_and_uri_nonempty}.')

## Rows where `track_uri` is a local path and the platform is not Walkman

In [None]:
# Rows whose `track_uri` starts with the 37-character local-media prefix
# while `track_player` is something other than 'Walkman'.
# NOTE(review): the original read from `df`, which is not assigned anywhere
# in this notebook before this cell; `df_full` is used instead — confirm that
# initialization.ipynb does not define a different `df`.
df_a = (
    df_full
    .where(f.substring('track_uri', 1, 37) == 'content://media/external/audio/media/')
    .where(f.col('track_player') != 'Walkman')
)

# Display the table. A bare expression nested inside `if` is not rendered by
# Jupyter, so the preview is passed to display() explicitly.
if display_middle_results:
    display(df_a.limit(100).toPandas().head(5))


In [None]:
# Rows whose `track_uri` starts with the 37-character local-media prefix
# while `track_player` is something other than 'Walkman'.
# NOTE(review): this filter is identical to the one that builds `df_a` in the
# preceding cell — one of the two cells is probably redundant.
# NOTE(review): the original read from `df`, which is not assigned anywhere
# in this notebook before this cell; `df_full` is used instead — confirm that
# initialization.ipynb does not define a different `df`.
df_b = (
    df_full
    .where(f.substring('track_uri', 1, 37) == 'content://media/external/audio/media/')
    .where(f.col('track_player') != 'Walkman')
)

# Display the table. The original previewed `df_a` here instead of the
# `df_b` built in this cell, and the bare expression inside `if` was never
# rendered by Jupyter.
if display_middle_results:
    display(df_b.limit(100).toPandas().head(5))

In [None]:
# Local-media rows not played on 'Walkman', with an extra `track_id_prefix`
# column.
#
# The original `f.when(...)` call passed only the condition. PySpark's
# `when(condition, value)` requires both arguments, so the cell raised a
# TypeError.
# NOTE(review): the value used below (the part of `track_id` before the first
# ':') is a presumed intent — TODO confirm. Because the filter that follows
# keeps only rows with a non-null `track_uri`, the new column is NULL for
# every remaining row whichever value is chosen.
# NOTE(review): the original read from `df`, which is not assigned anywhere
# in this notebook before this cell; `df_full` is used instead.
df_b = (
    df_full
    .withColumn('track_id_prefix',
                f.when(f.col('track_uri').isNull(),
                       f.substring_index('track_id', ':', 1)))
    .where(f.substring('track_uri', 1, 37) == 'content://media/external/audio/media/')
    .where(f.col('track_player') != 'Walkman')
)

# Display the table. The original previewed `df_a` here instead of the
# `df_b` built in this cell, and the bare expression inside `if` was never
# rendered by Jupyter.
if display_middle_results:
    display(df_b.limit(100).toPandas().head(5))

In [None]:
# Working dataframe with the two track identifier columns renamed.
# The original was `df = df.withColumnRenamed(...)`: it rebound `df` from
# itself, and `df` is not assigned anywhere earlier in this notebook, so a
# fresh kernel raised NameError. Deriving it from `df_full` gives the cell an
# explicit input and makes it safe to re-run.
# NOTE(review): confirm that initialization.ipynb does not define a
# different `df` that was meant to be used here.
df = (
    df_full
    .withColumnRenamed('track_id', 'track_spotify_uri')
    .withColumnRenamed('track_uri', 'track_sony_path')
)

## Count the rows

In [None]:
# Total number of rows of the working dataframe; it is used below as the
# denominator when computing percentages.
n_rows_total = df.count()

print(f'The total number of rows: {n_rows_total:,.0f}.')

# Statistics
<hr style="border:2px solid black"></hr>


## Visualize the amount of undefined cells in each column

### As table

Count the undefined cells in each column:

In [None]:
## COUNT THE UNDEFINED CELLS

# Count the NULL cells of every column in one aggregation. The original
# launched a separate `count()` job per column, i.e. one full pass over the
# data for each column.
# `count` only counts non-null inputs, and `when` without `otherwise` yields
# NULL when the condition is false, so each aggregate is the number of NULLs.
null_counts = (
    df
    .select([f.count(f.when(f.col(column_name).isNull(), 1)).alias(column_name)
             for column_name in df.columns])
    .first()
    .asDict()
)

# Number and percentage of undefined cells, keyed by column name and kept
# in the column order of the dataframe.
n_undefined = {column_name: null_counts[column_name]
               for column_name in df.columns}
pct_undefined = {
    # Guard against an empty dataframe, which would otherwise divide by zero.
    column_name: (count / n_rows_total * 100) if n_rows_total else 0.0
    for column_name, count in n_undefined.items()
}


## DISPLAY RESULTS AS PANDAS TABLE

# Define the column names.
property_names = ['Number of cells', 'Percent of total']

# Create a dict representing the data (values pre-formatted as strings).
data = {column_name: [f'{n_undefined[column_name]:,.0f}',
                      f'{pct_undefined[column_name]:.2f}']
        for column_name in n_undefined.keys()}

# Create the pandas dataframe for displaying: one row per column of `df`.
dfp_undefined = pd.DataFrame.from_dict(data,
                                       orient='index',
                                       columns=property_names)

# Display the dataframe
display(dfp_undefined)

In [None]:
# Persist the table of undefined cells via the project helper.
# NOTE(review): output location and format are decided by `t.save_table`;
# the string is presumably the base name of the output file — confirm.
t.save_table(dfp_undefined, 'undefined_cells_in_full_database')

### As bargraph

Create a bar graph representing the percentage of undefined cells in each column:

In [None]:
# Create a barplot showing the percentage of undefined values in each column.
axes = sns.barplot(x=list(pct_undefined.keys()),
                   y=list(pct_undefined.values()))

# Rotate the tick labels for the x-axes.
labels = axes.set_xticklabels(axes.get_xticklabels(),
                              rotation=45,
                              ha='right',
                              va='top',
                              fontsize=14
                             )

# Annotate the graph.
title = axes.set_title('Percentage of undefined values by column')
# x_label = axes.set_xlabel('Column name')
y_label = axes.set_ylabel('Percentage of undefined values')

# Apply general formatting.
w.format_figure(axes.figure)

# Save the figure
# t.save_plot('percentage_of_undefined_cells_in_full_database')
plt.savefig(Config.Path.report_images_root / 'percentage_of_undefined_cells_in_full_database.eps')

## Visualize activities over time

In [None]:
# Compute dataframe showing the number of activities and the total duration
# of these activities per day. Order by activity date.
df_activity_stats = (
    df
    .withColumn('activity_date', f.to_date('activity_start_time'))
    .groupBy('activity_date')
    .agg(f.count('activity_id').alias('n_activities'),
         f.sum('activity_duration').alias('activity_duration_sum'))
    .orderBy(f.asc('activity_date'))
)

# Show the top rows
df_activity_stats.limit(100).toPandas().head(10)

In [None]:
def get_column(df, column_name):
    """Return the values of one column of a Spark dataframe as a Python list.

    Args:
        df: Spark dataframe to read from.
        column_name: Name of the column to collect.

    Returns:
        A list with one entry per row, collected to the driver.
    """
    return df.select(column_name).rdd.flatMap(lambda x: x).collect()


# Collect the three series with ONE Spark job. Calling `get_column` three
# times re-ran the whole per-day aggregation for every column; collecting
# the rows once also guarantees that the three lists are aligned row by row.
activity_rows = (
    df_activity_stats
    .select('activity_date', 'n_activities', 'activity_duration_sum')
    .collect()
)

date = [row['activity_date'] for row in activity_rows]
n_activities = [row['n_activities'] for row in activity_rows]
duration_sum = [row['activity_duration_sum'] for row in activity_rows]

In [None]:
# Create a figure and axes for the plot (project helper).
figure, axes_left = w.empty_figure()

# Create a twin axes so that two different y-scales share one x-axis.
axes_right = axes_left.twinx()

# Plot the data: daily activity counts on the left axis (in millions) ...
plot_left = axes_left.plot(date, np.array(n_activities) / 1e6,
                           color='red',
                           label='Number of daily activities')

# ... and the summed duration on the right axis (in millions of hours).
# NOTE(review): the 3600 divisor assumes `activity_duration` is stored in
# seconds — TODO confirm.
plot_right = axes_right.plot(date, np.array(duration_sum) / (3600 * 1e6),
                             color='blue',
                             label='Sum of duration of daily activities')

# Set titles and labels.
title = axes_left.set_title('Daily activity')
xlabel = axes_left.set_xlabel('')
ylabel_left = axes_left.set_ylabel('Number of activities [millions]')
ylabel_right = axes_right.set_ylabel('Time spent on activities [millions of hours]')

# Set tickmarks: one major tick per year, on 1 January.
axes_left.xaxis.set_major_locator(mdates.YearLocator(1, month=1, day=1))
x_lim = axes_left.set_xlim((datetime.datetime(2014, 1, 1), datetime.datetime(2020, 1, 1)))
plot = plt.setp(axes_left.get_xticklabels(),
                rotation=45,
                ha="right",
                va='top',
                rotation_mode="anchor",
                fontsize=14)

# Set limits (both y-axes use the same numeric range).
y_lim_left = axes_left.set_ylim((0, 5))
y_lim_right = axes_right.set_ylim((0, 5))

# Set layout
axes = figure.axes
for ax in axes:
    ax.spines['top'].set_color('white')
    ax.set_facecolor("white")
    ax.xaxis.label.set_fontsize(18)
    ax.yaxis.label.set_fontsize(18)
    ax.title.set_fontsize(20)

# Color each y-label like the line it belongs to.
axes_left.yaxis.label.set_color('red')
axes_right.yaxis.label.set_color('blue')

# Set grid: draw it for the left axes only.
axes_left.xaxis.grid(which="both", linewidth=0.5)
axes_left.yaxis.grid(which="both", linewidth=0.5)
# `grid(None)` without keyword arguments TOGGLES the grid, so the result
# depended on the active style; `grid(False)` always turns it off.
axes_right.grid(False)

# Add a single legend covering the lines of both axes.
all_plots = plot_left + plot_right
all_labels = [plot.get_label() for plot in all_plots]
legend = axes_left.legend(all_plots, all_labels, loc='best')

# Save the figure into the report image directory as EPS.
figure.savefig(Config.Path.report_images_root / 'daily_activity_vs_time.eps')

# Save the dataframe
<hr style="border:2px solid black"></hr>

In [None]:
# Save the start time for timing.
start_time = time.time()

# Save the dataframe with partitions defined by the first
# two letters of the user ID.
(df
 .withColumn('user_id_prefix', f.col('user_id').substr(0,2))
 .write.mode('overwrite')
 .partitionBy('user_id_prefix')
 .parquet(str(Config.Path.project_data_root / 'df_clean_new'))
)

# Print the execution time.
print(f'Execution time: {time.time() - start_time:.5f} s.')