<hr style="border:2px solid black">

# Initialization

In [1]:
# autotime reports the wall-clock time of every executed cell.
%load_ext autotime
%matplotlib inline

# NOTE(review): this star import is the only visible source of the names used
# throughout the notebook (spark, Config, t, f, pd, np, sns, plt, w, tqdm, ...);
# presumably they are all defined or re-exported by toolbox.initialize -- confirm.
# Judging by the cell output, importing it also creates the Spark session and
# loads the raw data.
from toolbox.initialize import *

# Enable Arrow for the Spark -> pandas conversion; the notebook calls
# toPandas() for its table previews.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

Creating a Spark session.
	Execution time: 110.96129 s.
Getting the data paths.
	Number of folders: 62
	Number of files: 275621
Loading data from path: "/data/work/src/musicactivity".
	Execution time: 10.01333 s.
Loading data from path: "/data/work/shared/s001284/Music_Project/resources/data/df_sample_raw_1E6.parquet".
	Execution time: 1.49188 s.


<hr style="border:2px solid black">

# Cleaning

## Load dataframe

In [2]:
# Load the cleaned dataframe (parquet) from the project data directory.
df_path = Config.Path.project_data_root / 'df_clean'
df = t.load_data_from_files(df_path, spark, method='parquet')

# Drop the column "user_id_prefix". It is only needed as the partitioning
# key on disk and is re-created when the dataframe is saved at the end of
# this notebook.
df = df.drop('user_id_prefix')

# Set the flag that determines whether the intermediate results
# will be shown.
display_middle_results = True

# Display the first rows of the table. The call to display() is explicit
# because a bare expression nested inside an `if` body is not a top-level
# cell expression and is therefore not rendered under IPython's default
# `ast_node_interactivity` setting.
if display_middle_results:
    display(df.limit(100).toPandas().head(5))

Loading data from path: "/data/work/shared/s001284/Music_Project/resources/data/df_clean".
	Execution time: 0.86490 s.


Unnamed: 0,user_id,activity_id,activity_start_time,activity_duration,device_id,device_name,track_artist,track_title,track_album,track_player,track_start_time,track_duration,track_id,track_uri
0,b64429b5-59a1-4398-9276-6dc400815f8e,41aaa6e7-2c79-4c18-822f-77ec9dfa56a3-2016-07-09,2016-07-09 04:14:19.157,5056,771c4550615a05cd,SOV32,UNISON SQUARE GARDEN,シャンデリア・ワルツ,UNISON SQUARE GARDEN,Walkman,2016-07-09 04:48:46.629,348,,content://media/external/audio/media/102343
1,b6206ded-5249-43ae-96fe-6cede3baca09,35183207-c20f-4032-82bc-21c92b3f06cb-2015-10-01,2015-10-01 14:43:32.389,1305,263729c6c47127d4,D6603,Maluma,Borro Cassette,Borro Cassette,Spotify,2015-10-01 14:43:32.389,216,spotify:track:2ocDZVhBiGCaaZRNW3fmSd,
2,b64429b5-59a1-4398-9276-6dc400815f8e,41aaa6e7-2c79-4c18-822f-77ec9dfa56a3-2016-07-09,2016-07-09 04:14:19.157,5056,771c4550615a05cd,SOV32,UNISON SQUARE GARDEN,シュガーソングとビターステップ,UNISON SQUARE GARDEN,Walkman,2016-07-09 04:54:34.187,257,,content://media/external/audio/media/102344
3,b6206ded-5249-43ae-96fe-6cede3baca09,35183207-c20f-4032-82bc-21c92b3f06cb-2015-10-01,2015-10-01 14:43:32.389,1305,263729c6c47127d4,D6603,Reykon,Imaginándote (feat. Daddy Yankee),Imaginándote (feat. Daddy Yankee),Spotify,2015-10-01 14:47:08.233,224,spotify:track:3AQ1zVx39FvB4yH0bjGgPm,
4,b64429b5-59a1-4398-9276-6dc400815f8e,41aaa6e7-2c79-4c18-822f-77ec9dfa56a3-2016-07-09,2016-07-09 04:14:19.157,5056,771c4550615a05cd,SOV32,UNISON SQUARE GARDEN,流星のスコール,UNISON SQUARE GARDEN,Walkman,2016-07-09 04:59:12.033,307,,content://media/external/audio/media/102337


time: 49.5 s


## Count the rows

In [None]:
# Count all rows of the dataframe; the total is reused below as the
# denominator for the per-column percentages of undefined cells.
n_rows_total = df.count()

# Report the total with thousands separators.
print(f'The total number of rows: {n_rows_total:,.0f}.')

## Visualize the amount of undefined cells in each column

Count the undefined cells in each column:

In [None]:
## COUNT THE UNDEFINED CELLS

# Count the NULL cells of all columns in ONE Spark job instead of running a
# separate full scan per column. `when(...)` without `otherwise` evaluates to
# NULL for rows that do not match and `count` skips NULLs, so every aggregate
# is the number of NULL cells in the corresponding column.
null_counts = (
    df
    .select([f.count(f.when(f.col(column_name).isNull(), 1)).alias(column_name)
             for column_name in df.columns])
    .first()
)

# Store the absolute number of undefined cells per column.
n_undefined = {column_name: null_counts[column_name]
               for column_name in df.columns}

# Store the share of undefined cells per column in percent. An empty
# dataframe yields 0 % instead of raising a ZeroDivisionError.
pct_undefined = {column_name: (n_cells / n_rows_total * 100 if n_rows_total else 0.0)
                 for column_name, n_cells in n_undefined.items()}


## DISPLAY RESULTS AS PANDAS TABLE

# Define the column names.
property_names = ['Number of cells', 'Percent of total']

# Create a dict representing the data (values pre-formatted as strings).
data = {column_name: [f'{n_undefined[column_name]:,.0f}',
                      f'{pct_undefined[column_name]:.2f}']
        for column_name in n_undefined.keys()}

# Create the pandas dataframe for displaying: one row per column of `df`.
dfp_undefined = pd.DataFrame.from_dict(data,
                                       orient='index',
                                       columns=property_names)

# Display the dataframe
display(dfp_undefined)



Create a bar graph representing the percentage of undefined cells in each column:

In [None]:
# Create a barplot showing the percentage of undefined values in each column.
axes = sns.barplot(x=list(pct_undefined.keys()),
                   y=list(pct_undefined.values()))

# Rotate the tick labels of the x-axis so the column names do not overlap.
labels = axes.set_xticklabels(axes.get_xticklabels(),
                              rotation=45,
                              ha='right',
                              va='top',
                              fontsize=14
                             )

# Annotate the graph.
title = axes.set_title('Percentage of undefined values by column')
# x_label = axes.set_xlabel('Column name')
y_label = axes.set_ylabel('Percentage of undefined values')

# Apply general formatting.
w.format_figure(axes.figure)

# Save the figure into the report image directory. The target has to be a
# single path: savefig() takes only the file name positionally, so the
# directory and the file name are joined first.
# NOTE(review): assumes Config.Path.report_images_root is a pathlib.Path like
# Config.Path.project_data_root -- confirm. Without an explicit suffix
# matplotlib appends its default output format to the file name.
axes.figure.savefig(Config.Path.report_images_root / 'percentage_of_undefined_cells_in_full')

## Visualize activities over time

In [None]:
# The dataframe holds one row per played track, so an activity is repeated on
# every row of a track played during it (the preview in the "Load dataframe"
# section shows identical activity_id / activity_duration values on several
# rows). Reduce the data to one row per activity first; counting rows or
# summing durations on the track-level data would weight every activity by
# its number of tracks.
# NOTE(review): assumes activity_id identifies an activity uniquely and that
# activity_start_time / activity_duration are constant within it -- confirm.
df_activities = (
    df
    .select('activity_id', 'activity_start_time', 'activity_duration')
    .dropDuplicates(['activity_id'])
)

# Compute dataframe showing the number of activities and the total duration
# of these activities per day. Order by activity date.
df_activity_stats = (
    df_activities
    .withColumn('activity_date', f.to_date('activity_start_time'))
    .groupBy('activity_date')
    .agg(f.count('activity_id').alias('n_activities'),
         f.sum('activity_duration').alias('activity_duration_sum'))
    .orderBy(f.asc('activity_date'))
)

# Show the top rows
df_activity_stats.limit(100).toPandas().head(10)

In [None]:
# # Histogram
# buckets = [int(10*1.2**i) for i in range(0,30)]
# price_histogram = df_listings_rome.select('price').rdd.flatMap(lambda x: x).histogram(buckets)

def get_column(df, column_name):
    """Collect a single column of a Spark dataframe as a Python list.

    Args:
        df: The Spark dataframe to read from.
        column_name: Name of the column to collect.

    Returns:
        A list with the values of the column, in the row order in which
        the dataframe is collected.
    """
    # Read from the dataframe that was passed in rather than from a
    # notebook-level global, so the helper works for any dataframe.
    return [row[0] for row in df.select(column_name).collect()]

# Pull the three columns of the daily statistics to the driver as plain
# Python lists; they are the x values and the two y series of the plot below.
date = get_column(df_activity_stats, 'activity_date')
n_activities = get_column(df_activity_stats, 'n_activities')
duration_sum = get_column(df_activity_stats, 'activity_duration_sum')

In [None]:
# Create a figure and axes for the plot.
figure, axes_left = empty_figure()

# Create a twin axes so two different y-scales can share the same x-axis.
axes_right = axes_left.twinx()

# Plot the number of activities (in millions) against the left y-axis.
plot_left = axes_left.plot(date, np.array(n_activities) / 1e6,
                           color='red',
                           label='Number of daily activities')

# Plot the summed duration against the right y-axis. Dividing by 3600 * 1e6
# converts seconds to millions of hours.
# NOTE(review): assumes activity_duration is stored in seconds -- confirm.
plot_right = axes_right.plot(date, np.array(duration_sum) / (3600 * 1e6),
                             color='blue',
                             label='Sum of duration of daily activities')


# Set titles and labels.
title = axes_left.set_title('Daily activity')
xlabel = axes_left.set_xlabel('')
ylabel_left = axes_left.set_ylabel('Number of activities [millions]')
ylabel_right = axes_right.set_ylabel('Time spent on activities [millions of hours]')

# Set tickmarks: one major tick at January 1st of every year.
axes_left.xaxis.set_major_locator(mdates.YearLocator(1, month=1, day=1))
x_lim = axes_left.set_xlim((datetime.datetime(2014, 1, 1), datetime.datetime(2020, 1, 1)))
plot = plt.setp(axes_left.get_xticklabels(),
                rotation=45,
                ha="right",
                va='top',
                rotation_mode="anchor",
                fontsize=14)

# Set limits
y_lim_left = axes_left.set_ylim((0, 5))
y_lim_right = axes_right.set_ylim((0, 5))

# Set layout
axes = figure.axes
for ax in axes:
    ax.spines['top'].set_color('white')
    ax.set_facecolor("white")
    ax.xaxis.label.set_fontsize(18)
    ax.yaxis.label.set_fontsize(18)
    ax.title.set_fontsize(20)

# Set grid: grid lines on the left axes only. The grid of the twin axes is
# switched off explicitly; grid(None) would merely toggle its current state.
axes_left.xaxis.grid(which="both", linewidth=0.5)
axes_left.yaxis.grid(which="both", linewidth=0.5)
axes_right.grid(False)

# Add one combined legend for the lines of both axes.
all_plots = plot_left + plot_right
all_labels = [line.get_label() for line in all_plots]
legend = axes_left.legend(all_plots, all_labels, loc=0)

# Save the figure into the report image directory.
# NOTE(review): the original call was left unfinished; the file name
# 'daily_activity' is chosen here -- rename if the report expects another one.
figure.savefig(Config.Path.report_images_root / 'daily_activity')

<hr style="border:2px solid black">

# Save the dataframe

In [None]:
# Remember when the export started so its duration can be reported.
start_time = time.time()

# Destination directory of the partitioned parquet dataset.
output_path = str(Config.Path.project_data_root / 'df_clean_new')

# Derive the partition key from the first two characters of the user ID.
df_with_prefix = df.withColumn('user_id_prefix', f.col('user_id').substr(0,2))

# Write the dataframe partitioned by that key, replacing any existing
# output at the destination.
partitioned_writer = df_with_prefix.write.mode('overwrite').partitionBy('user_id_prefix')
partitioned_writer.parquet(output_path)

# Print the execution time.
print(f'Execution time: {time.time() - start_time:.5f} s.')