<hr style="border:2px solid black">

# Initialization

In [None]:
%load_ext autotime
%matplotlib inline

from toolbox.initialize import *


# spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

<hr style="border:2px solid black">

# Cleaning

## Load dataframe

In [None]:
# Load the cleaned dataset from its parquet files.
df_path = Config.Path.project_data_root / 'df_clean'

# Spark DataFrames are immutable: `drop` returns a new DataFrame, so its
# result has to be kept. 'user_id_prefix' is derived again from 'user_id'
# when the dataframe is written back at the end of the notebook.
df = (
    t.load_data_from_files(df_path, spark, method='parquet')
    .drop('user_id_prefix')
)

# The cells below refer to the loaded data as `df_clean`; bind that name here
# so the notebook runs top to bottom on a fresh kernel.
# NOTE(review): this assumes no further cleaning step is meant to sit between
# `df` and `df_clean` -- confirm.
df_clean = df

display_middle_results = True

## Count the rows

In [None]:
# Count once and keep the result: the total is reused as the denominator of
# the per-column percentages of undefined cells.
n_rows_total = df.count()
print('The total number of rows: {:,.0f}.'.format(n_rows_total))

## Visualize the number of undefined cells in each column

In [None]:
# Count the undefined (null) cells of every column in one aggregation, so the
# data is scanned once instead of once per column. `when` without `otherwise`
# yields null for non-matching rows, and `count` skips nulls, so each
# aggregate is the number of null cells in that column.
null_counts = (
    df
    .select([
        f.count(f.when(f.col(column_name).isNull(), 1)).alias(column_name)
        for column_name in df.columns
    ])
    .first()
    .asDict()
)

# Initialize the dicts where the numbers will be stored.
n_undefined = dict()
pct_undefined = dict()

for column_name in df.columns:
    n_undefined[column_name] = null_counts[column_name]
    # Guard against an empty dataframe, for which the ratio is undefined.
    pct_undefined[column_name] = (
        n_undefined[column_name] / n_rows_total * 100 if n_rows_total else 0.0
    )
    print(f'Number of undefined cells in column "{column_name}": '
          f'{n_undefined[column_name]} ({pct_undefined[column_name]:.2f} %)')


# Create a barplot showing the percentage of undefined values in each column.
axes = sns.barplot(x=list(pct_undefined.keys()), y=list(pct_undefined.values()))

# Rotate the tick labels for the x-axes. The return value is assigned only to
# keep it out of the cell output.
labels = axes.set_xticklabels(axes.get_xticklabels(),
                              rotation=45,
                              ha='right',
                              va='top')

# Annotate the graph.
title = axes.set_title('Percentage of undefined values by column')
y_label = axes.set_ylabel('Percentage of undefined values')

# Apply general formatting
w.format_figure(axes.figure)

## Investigate activities over time

In [None]:
# Save the start time for timing.
start_time = time.time()

# Count the activities per distinct start time and order the result
# chronologically.
df_activity_over_time = (
    df_clean
    .groupBy('activity_start_time')
    .count()
    .orderBy(f.asc('activity_start_time'))
)

# Show the top rows. Only the last expression of a cell is rendered
# automatically, so the preview is displayed explicitly; otherwise the Spark
# job would run and its result would be thrown away.
display(df_activity_over_time.limit(10).toPandas())

# Show the execution time.
print(f'Execution time: {time.time() - start_time:.5f} s.')

In [None]:
# Save the start time for timing.
start_time = time.time()

# Aggregate per calendar day: the number of non-null activity IDs and the
# summed activity duration, ordered chronologically.
df_activity_stats = (
    df_clean
    .withColumn('activity_date', f.to_date('activity_start_time'))
    .groupBy('activity_date')
    .agg(f.count('activity_id').alias('n_activities'),
         f.sum('activity_duration').alias('activity_duration_sum'))
    .orderBy(f.asc('activity_date'))
)

# Show the top rows. Only the last expression of a cell is rendered
# automatically, so the preview is displayed explicitly; otherwise the Spark
# job would run and its result would be thrown away.
display(df_activity_stats.limit(10).toPandas())

# Show the execution time.
print(f'Execution time: {time.time() - start_time:.5f} s.')

In [None]:
import matplotlib.dates as mdates
import matplotlib.pyplot as plt

In [None]:
# Collect the per-day statistics into a pandas DataFrame; the pandas-based
# plotting cells of this notebook read from this copy.

df_activity_stats_pd = df_activity_stats.toPandas()


In [None]:
import datetime


In [None]:
# Save the start time for timing. Without this reset the message at the end
# of the cell reports the time elapsed since an earlier cell set `start_time`.
start_time = time.time()

# Create a figure and axes for the plot.
figure, axes = w.empty_figure()

# Plot the daily number of activities on the created axes.
df_activity_stats_pd.plot(
    kind='line',
    color=sns.color_palette(),
    legend=False,
    ax=axes,
    x='activity_date',
    y=["n_activities"]
 )

# Format the axes: one major tick per year (1 January), a fixed date window
# and rotated tick labels.
axes.grid()
axes.set_title('Number of activities over time')
axes.set_xlabel('')
axes.set_ylabel('Number of activities')
axes.xaxis.set_major_locator(mdates.YearLocator(1, month=1, day=1))
axes.set_xlim((datetime.datetime(2014, 1, 1), datetime.datetime(2020, 1, 1)))
plot = plt.setp(axes.get_xticklabels(),
                rotation=45,
                ha="right",
                va='top',
                rotation_mode="anchor")

# Show the execution time.
print(f'Execution time: {time.time() - start_time:.5f} s.')

In [None]:
# Save the start time for timing. Without this reset the message at the end
# of the cell reports the time elapsed since an earlier cell set `start_time`.
start_time = time.time()

# Create a figure and axes for the plot.
figure, axes = w.empty_figure()

# Plot the daily sum of activity durations on the created axes.
df_activity_stats_pd.plot(
    kind='line',
    color=sns.color_palette(),
    legend=False,
    ax=axes,
    x='activity_date',
    y=["activity_duration_sum"]
 )

# Format the axes: one major tick per year (1 January), a fixed date window
# and rotated tick labels.
axes.grid()
axes.set_title('Time spent on activities over time')
axes.set_xlabel('')
axes.set_ylabel('Sum of duration of activities')
axes.xaxis.set_major_locator(mdates.YearLocator(1, month=1, day=1))
axes.set_xlim((datetime.datetime(2014, 1, 1), datetime.datetime(2020, 1, 1)))
plot = plt.setp(axes.get_xticklabels(),
                rotation=45,
                ha="right",
                va='top',
                rotation_mode="anchor")

# Show the execution time.
print(f'Execution time: {time.time() - start_time:.5f} s.')

In [None]:
def get_column(df, column_name):
    """Return the values of one column of a dataframe as a Python list.

    Args:
        df: Dataframe to read from; it must support
            ``select(column_name).collect()`` returning row-like sequences.
        column_name: Name of the column to extract.

    Returns:
        A list with one value per row, in the order the rows are collected.
    """
    # Read from the dataframe that was passed in, not from a notebook global,
    # so the helper works for any dataframe.
    return [row[0] for row in df.select(column_name).collect()]

# Pull the three plotted columns out of the daily statistics as plain lists,
# in the same order as the names on the left-hand side.
date, n_activities, duration_sum = (
    get_column(df_activity_stats, name)
    for name in ('activity_date', 'n_activities', 'activity_duration_sum')
)

In [None]:
# Create a figure and axes for the plot.
figure, axes = w.empty_figure()

# Plot the collected columns directly with matplotlib. The return values are
# assigned only to keep their reprs out of the cell output.
lines = axes.plot(date, n_activities)

# Annotate the graph so the figure can be read on its own.
title = axes.set_title('Number of activities over time')
y_label = axes.set_ylabel('Number of activities')

<hr style="border:2px solid black">

# Save the dataframe

In [None]:
# Save the start time for timing.
start_time = time.time()

# Destination directory of the partitioned copy.
output_path = str(Config.Path.project_data_root / 'df_clean_new')

# Derive the partition key from the user ID.
# NOTE(review): Spark's `substr` is 1-based; a start position of 0 appears to
# be treated like 1, giving the first two characters -- confirm.
df_partitioned = df_clean.withColumn('user_id_prefix',
                                     f.col('user_id').substr(0,2))

# Overwrite any previous output, writing one partition per prefix value.
writer = df_partitioned.write.mode('overwrite').partitionBy('user_id_prefix')
writer.parquet(output_path)

# Print the execution time.
print(f'Execution time: {time.time() - start_time:.5f} s.')