In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cbook as cb
import matplotlib.cm as cm
from matplotlib.ticker import FuncFormatter
from tabulate import tabulate
import seaborn as sns
from sklearn.linear_model import LinearRegression
import matplotlib.dates as mdates
import calendar
import matplotlib.ticker as ticker
from scipy import stats

In [None]:
# Reset matplotlib to its built-in defaults, then render all text in Arial.
mpl.rcParams.update(mpl.rcParamsDefault)
mpl.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial'],
})

# Shared styling constants read by the plotting cells of this notebook.
dpi, alpha = 500, 0.7                              # savefig resolution, bar opacity
font_title, font_label, font_chart = 18, 16, 14    # font sizes: title, axis label, ticks/legend
weight_label = weight_title = 'bold'
pad_label = 10                                     # distance between axis label and axis

In [None]:
# Tick-label function for count axes; it is wrapped in a FuncFormatter right below.
def millions_formatter(x, pos):
    """Return the tick label for ``x``, expressing large values in millions.

    Parameters
    ----------
    x : float
        Tick value handed over by matplotlib.
    pos : int or None
        Tick position; unused, but required by the FuncFormatter call signature.

    Returns
    -------
    str
        ``x / 1e6`` without trailing zeros when ``x >= 1e6``
        (``2_500_000 -> '2.5'``, ``3_000_000 -> '3'``); otherwise ``x`` rounded
        to an integer.
    """
    if x >= 1e6:
        # ':g' keeps fractional millions; int() would truncate 2.5e6 to '2' and
        # make neighbouring ticks indistinguishable.
        return f'{x / 1e6:g}'
    # NOTE(review): values below one million are printed unscaled, so an axis that
    # spans both ranges mixes units - confirm this is intended.
    return f'{x:.0f}'
# Wrap the function so it can be handed to Axis.set_major_formatter.
formatter = ticker.FuncFormatter(millions_formatter)

In [None]:
# Input file; `csv` stays a module-level name because the plotting cells derive
# their output file names from it.
csv = "combined_data_MP_NE_mappedUCOtoMTC_Coord_dT_fE_mergedClusterInside_cC.csv"
df = pd.read_csv(filepath_or_buffer=csv)
df

In [None]:
# Rows whose start or end identifier is shorter than MIN_ID_LENGTH characters.
# `.str.len()` replaces `.apply(len)`: a missing identifier then yields NaN, which
# compares as False, instead of raising "object of type 'float' has no len()".
MIN_ID_LENGTH = 11
start_is_short = df['StartId'].str.len() < MIN_ID_LENGTH
end_is_short = df['EndId'].str.len() < MIN_ID_LENGTH
filtered_df = df[start_is_short | end_is_short]
filtered_df

In [None]:
# Number of columns that contain at least one missing value.
df.isnull().any(axis=0).sum()

In [None]:
# Show the dtype of every column of the loaded table.
df.dtypes

# Correlation of the columns with count

In [None]:
# Aggregate the corrected counts per daytime: total and mean of each group.
grouped_data = df.groupby('daytime')['count_corrected'].agg(['sum', 'mean'])

# Scatter plot of the per-daytime totals only; the means are drawn by the next cell.
plt.figure(figsize=(12, 8))
plt.scatter(grouped_data.index, grouped_data['sum'], marker='o', label='Sum of count')

# Title and y-label name what is actually drawn (the sum), not "Sum and Mean of x".
plt.xlabel('Daytime', fontsize=font_label)
plt.ylabel('Sum of count', fontsize=font_label)
plt.title('Sum of count per Daytime', fontsize=font_title)
plt.legend(fontsize=font_chart)

plt.show()

In [None]:
# Scatter plot of the mean corrected count per daytime (square, orange markers).
# Reads `grouped_data`, which the preceding cell computes.
plt.scatter(grouped_data.index, grouped_data['mean'], marker='s', color='orange', label='Mean of count')

# Title and y-label name what is actually drawn (the mean), not "Sum and Mean of x".
plt.xlabel('Daytime', fontsize=font_label)
plt.ylabel('Mean of count', fontsize=font_label)
plt.title('Mean of count per Daytime', fontsize=font_title)
plt.legend(fontsize=font_chart)

plt.show()

In [None]:
# Pairwise Pearson correlation of the numeric columns.
# numeric_only=True is required on pandas >= 2.0, where the default (False) raises
# as soon as the frame holds string columns such as the trip identifiers.
df.corr(numeric_only=True)

In [None]:
# True if the corrected count column has at least one missing entry.
contains_nan = df['count_corrected'].isna().any()
contains_nan

In [None]:
# Replace the raw 'count' column by the corrected one, keeping the name 'count'.
# The guard makes the cell safe to re-run: without it a second execution would drop
# the already-renamed (corrected) 'count' column and leave the frame without counts.
if 'count_corrected' in df.columns:
    df = df.drop(columns=['count']).rename(columns={'count_corrected': 'count'})
df.columns

In [None]:
# True if the count column has at least one missing entry.
contains_nan = df['count'].isna().any()
contains_nan

In [None]:
# How often each distinct count value occurs, ordered by the count value itself.
value_counts = df['count'].value_counts().sort_index().tolist()
value_counts

In [None]:
# Distinct count values in ascending order.
count_values = np.sort(df["count"].unique())
count_values

In [None]:
# Frequency table of the count column: one row per distinct count value with the
# number of rows carrying it and the trips those rows represent (value * occurrences).
# Values and frequencies come from a single value_counts() result, so they cannot get
# out of step: unique() keeps NaN while value_counts() drops it, and separately built
# arrays could also be stale kernel state.
count_frequency = df["count"].value_counts().sort_index()
df_count = pd.DataFrame({
    "count": count_frequency.index.to_numpy(),
    "countOccurrences": count_frequency.to_numpy(),
})
df_count["totalCount"] = df_count["count"] * df_count["countOccurrences"]
df_count

In [None]:
# Count the integers 1..240 and print the total.
j = len(range(1, 241, 1))
print(j)

In [None]:
def _axis_in_millions(value, pos):
    """Tick label for a y-axis whose label is given in millions (one decimal)."""
    return f'{value / 1e6:.1f}'


# Left panel: plain histogram - number of rows per count value.
plt.subplot(1, 2, 1)
plt.hist(df["count"], bins=50)  # 50 bins ("erweitert" = extended); the original note lists 63 bins for "Laura"
plt.xlabel('Value of Count')
plt.ylabel('Number of Occurrences [million]')
# Scale the ticks explicitly. Merely hiding matplotlib's automatic offset text only
# produces "millions" when the automatic offset happens to be 1e6.
plt.gca().yaxis.set_major_formatter(_axis_in_millions)
plt.title('Histogram of count')

# Right panel: every row weighted by its own count, i.e. the bar heights are sums of count.
plt.subplot(1, 2, 2)
plt.hist(df["count"], bins=50, weights=df["count"])
plt.xlabel('count')
plt.ylabel('Frequency [million]')
plt.gca().yaxis.set_major_formatter(_axis_in_millions)
plt.title('Histogram of count [adjusted]')

plt.tight_layout()
plt.show()

In [None]:
whis_val = 1.5  # whisker length passed to both box plots and to boxplot_stats


def _boxplot_stats_with_upper_percentiles(values, whis):
    """Return cb.boxplot_stats(values) with the 90th/95th/99th percentiles added.

    The percentiles are stored in the first (only) stats dict under the keys
    '.90', '.95' and '.99'.
    """
    summary = cb.boxplot_stats(values, whis=whis)
    for percent in (90, 95, 99):
        summary[0][f".{percent}"] = np.percentile(values, percent)
    return summary


# Left panel: box plot of the count column as it is.
plt.subplot(1, 2, 1)
data = df["count"]
plt.boxplot(data, whis=whis_val)
stats_plt1 = _boxplot_stats_with_upper_percentiles(data, whis_val)
plt.xlabel('count')
plt.ylabel('Values')
plt.title('Box Plot of count')

# Weight every row by its own count: each value v is repeated v times.
duplicated_values = np.repeat(df["count"], df["count"])

# Right panel: box plot of the count-weighted values.
plt.subplot(1, 2, 2)
plt.boxplot(duplicated_values, whis=whis_val)
stats_plt2 = _boxplot_stats_with_upper_percentiles(duplicated_values, whis_val)
plt.xlabel('count')
plt.ylabel('Values')
plt.title('Box Plot of count [adjusted]')

plt.tight_layout()
plt.show()

In [None]:
# Print the box-plot statistics of both panels as grid tables.
# The 'fliers' entry is removed before printing; pop(..., None) keeps the cell
# re-runnable once the key is already gone (a bare pop raises KeyError on re-run).
head = "keys"  # use the dict keys as column headers
for stats_rows in (stats_plt1, stats_plt2):
    stats_rows[0].pop('fliers', None)
    print(tabulate(stats_rows, headers=head, tablefmt="grid"))

In [None]:
# Summary statistics of the count column, rounded to two decimals.
description = df['count'].describe().round(2)
description

In [None]:
# Total of the count column for each weekday number.
df.groupby('weekday_number')['count'].sum()

# Count of Trips per Weekday

In [None]:
# Total trip count per weekday.
total_counts_per_weekday = df.groupby('weekday_number')['count'].sum()

# Weekday numbers in x-axis order.
weekday_numbers = sorted(df['weekday_number'].unique())

plt.figure(figsize=(10, 8))

bars = plt.bar(weekday_numbers, total_counts_per_weekday, alpha=alpha)
plt.xlabel('Weekday', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Count of Trips [million]', fontsize=font_label, weight=weight_label, labelpad=pad_label)

# Replace the weekday numbers on the x-axis by weekday names.
weekday_mapping = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday/Holiday'}
plt.xticks(weekday_numbers, [weekday_mapping[day] for day in weekday_numbers], fontsize=font_chart, ha='center', va='top')
plt.yticks(fontsize=font_chart)

# Annotate every bar with its total in millions (three decimals).
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height, str(round(int(height) / 1e6, 3)), ha='center', va='bottom', fontsize=font_chart)

plt.title('Trip Count per Weekday', fontsize=font_title, weight=weight_title)
plt.ylim(0, max(total_counts_per_weekday) * 1.10)  # 10 % headroom for the bar annotations

# Build the tick formatter here rather than reading the notebook-level `formatter`
# variable, so the axis does not depend on whatever that name currently holds in the
# kernel when this cell is re-run.
plt.gca().yaxis.set_major_formatter(ticker.FuncFormatter(millions_formatter))

# NOTE(review): absolute, user-specific output directory - the cell fails on any
# machine where it does not exist.
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Count_of_Trips_per_Weekday.png', dpi=dpi)

plt.show()


## Count of Trips per Weekday (Holiday true or false | absolute)

In [None]:
# Trip totals per (school-holiday flag, weekday); rows: isSchoolHoliday, columns: weekday_number.
total_counts_per_weekday_schoolHoliday = df.groupby(['isSchoolHoliday', 'weekday_number'])['count'].sum().unstack()

# Weekday numbers in x-axis order.
weekday_numbers = sorted(df['weekday_number'].unique())

plt.figure(figsize=(12, 8))

# Two bars per weekday: school period left of the tick, school holidays right of it.
plt.bar([day - 0.2 for day in weekday_numbers], total_counts_per_weekday_schoolHoliday.loc[False], width=0.4, alpha=alpha, label='School')
plt.bar([day + 0.2 for day in weekday_numbers], total_counts_per_weekday_schoolHoliday.loc[True], width=0.4, alpha=alpha, label='School Holidays')

plt.xlabel('Weekday', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Count of Trips [million]', fontsize=font_label, weight=weight_label, labelpad=pad_label)

# Replace the weekday numbers on the x-axis by weekday names.
weekday_mapping = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday/Holiday'}
plt.xticks(weekday_numbers, [weekday_mapping[day] for day in weekday_numbers], fontsize=font_chart)
plt.yticks(fontsize=font_chart)

# Annotate every bar of both series with its total in millions (three decimals).
for bar in plt.gca().patches:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height, str(round(int(height) / 1e6, 3)), ha='center', va='bottom', fontsize=font_chart)

plt.legend(fontsize=font_chart)

# Build the tick formatter here rather than reading the notebook-level `formatter`
# variable, so the axis does not depend on whatever that name currently holds in the
# kernel when this cell is re-run.
plt.gca().yaxis.set_major_formatter(ticker.FuncFormatter(millions_formatter))
plt.ylim(0, 8.5e6)  # NOTE(review): fixed upper limit - presumably tuned to this data set

plt.title('Trip Count during School and School Holidays', fontsize=font_title, weight=weight_title)

# NOTE(review): absolute, user-specific output directory.
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Count_of_Trips_per_Weekday_isSchoolHoliday.png', dpi=dpi)

plt.show()


In [None]:
# Trip totals per (school-holiday flag, weekday) and per school-holiday group.
total_counts_per_weekday_schoolHoliday = (
    df.groupby(['isSchoolHoliday', 'weekday_number'])['count'].sum().unstack()
)
total_counts_per_week = df.groupby(['isSchoolHoliday'])['count'].sum()

# Share of each weekday within its school-holiday group, in percent.
percentage_counts_per_weekday_schoolHoliday = (
    total_counts_per_weekday_schoolHoliday.div(total_counts_per_week.values, axis=0) * 100
)

weekday_numbers = sorted(df['weekday_number'].unique())
weekday_mapping = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday/Holiday'}

plt.figure(figsize=(12, 8))

# Two bars per weekday, shifted to either side of the tick: school period, then holidays.
for holiday_flag, shift, legend_text in ((False, -0.2, 'School'), (True, 0.2, 'School Holidays')):
    plt.bar(
        [day + shift for day in weekday_numbers],
        percentage_counts_per_weekday_schoolHoliday.loc[holiday_flag],
        width=0.4,
        alpha=alpha,
        label=legend_text,
    )

plt.xlabel('Weekday', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Percentage of Count [%]', fontsize=font_label, weight=weight_label, labelpad=pad_label)

# Weekday names instead of numbers on the x-axis.
plt.xticks(weekday_numbers, [weekday_mapping[day] for day in weekday_numbers], fontsize=font_chart)
plt.yticks(fontsize=font_chart)

# Hide the offset text matplotlib may place above the y-axis.
plt.gca().get_yaxis().get_offset_text().set_visible(False)

# Write the percentage (two decimals) above every bar of both series.
for patch in plt.gca().patches:
    bar_top = patch.get_height()
    plt.text(patch.get_x() + patch.get_width() / 2, bar_top, f"{bar_top:.2f}", ha='center', va='bottom', fontsize=font_chart)

plt.legend(fontsize=font_chart)

plt.title('Percentage of Trip Count during School and School Holidays', fontsize=font_title, weight=weight_title)
plt.ylim(0, 18.5)

# Write the figure to disk, then display it.
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Percentage_of_Total_Count_of_Trips_per_Weekday_isSchoolHoliday.png', dpi=dpi)

plt.show()

In [None]:
# Trip totals per (school-holiday flag, weekday) and per school-holiday group.
total_counts_per_weekday_schoolHoliday = df.groupby(['isSchoolHoliday', 'weekday_number'])['count'].sum().unstack()
total_counts_per_week = df.groupby(['isSchoolHoliday'])['count'].sum()

# Share of each weekday within its school-holiday group, in percent.
percentage_counts_per_weekday_schoolHoliday = total_counts_per_weekday_schoolHoliday.div(total_counts_per_week.values, axis=0) * 100

# Relative change of each weekday's share when going from school periods (False) to
# school holidays (True), in percent: (holiday / school - 1) * 100.
# The 1 has to be subtracted from the ratio *before* scaling to percent; subtracting
# it afterwards shifts every bar down by one percentage point.
relative_difference_per_weekday = (
    percentage_counts_per_weekday_schoolHoliday.loc[True]
    / percentage_counts_per_weekday_schoolHoliday.loc[False]
    - 1
) * 100

# Weekday numbers in x-axis order.
weekday_numbers = sorted(df['weekday_number'].unique())

plt.figure(figsize=(10, 6))

# The label states the same direction as the title (school -> school holidays).
plt.bar(weekday_numbers, relative_difference_per_weekday, width=0.8, alpha=alpha, label='Change from School to School Holidays')

plt.xlabel('Weekday', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Relative Difference in Count [%]', fontsize=font_label, weight=weight_label, labelpad=pad_label)

# Replace the weekday numbers on the x-axis by weekday names.
weekday_mapping = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday/Holiday'}
plt.xticks(weekday_numbers, [weekday_mapping[day] for day in weekday_numbers], fontsize=font_chart)
plt.yticks(fontsize=font_chart)

# Hide the offset text matplotlib may place above the y-axis.
plt.gca().get_yaxis().get_offset_text().set_visible(False)

# Annotate every bar; the text goes above positive bars and below negative ones so
# that it never overlaps the bar itself.
for bar in plt.gca().patches:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height, f"{height:.2f}", ha='center',
             va='bottom' if height >= 0 else 'top', fontsize=font_chart)

plt.title('Change of Count Distribution per Weekday from School to School Holidays', fontsize=font_title, weight=weight_title)
plt.ylim(-18, 18)

# NOTE(review): absolute, user-specific output directory.
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__ChangeOfCountDistribution_perWeekday_isSchoolHoliday.png', dpi=dpi)

plt.show()

In [None]:
# Trip totals per (school-holiday flag, weekday) and per school-holiday group.
total_counts_per_weekday_schoolHoliday = df.groupby(['isSchoolHoliday', 'weekday_number'])['count'].sum().unstack()
total_counts_per_week = df.groupby(['isSchoolHoliday'])['count'].sum()

# Share of each weekday within its school-holiday group, in percent.
percentage_counts_per_weekday_schoolHoliday = total_counts_per_weekday_schoolHoliday.div(total_counts_per_week.values, axis=0) * 100

# Relative change of each weekday's share when going from school periods (False) to
# school holidays (True), in percent: (holiday / school - 1) * 100.
# The 1 has to be subtracted from the ratio *before* scaling to percent; subtracting
# it afterwards shifts every bar down by one percentage point.
relative_difference_per_weekday = (
    percentage_counts_per_weekday_schoolHoliday.loc[True]
    / percentage_counts_per_weekday_schoolHoliday.loc[False]
    - 1
) * 100

# Weekday numbers in x-axis order.
weekday_numbers = sorted(df['weekday_number'].unique())

plt.figure(figsize=(10, 6))

# The label states the same direction as the title (school -> school holidays).
plt.bar(weekday_numbers, relative_difference_per_weekday, width=0.8, alpha=alpha, label='Change from School to School Holidays')

plt.xlabel('Weekday', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Relative Difference in Count [%]', fontsize=font_label, weight=weight_label, labelpad=pad_label)

# Replace the weekday numbers on the x-axis by weekday names.
weekday_mapping = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday/Holiday'}
plt.xticks(weekday_numbers, [weekday_mapping[day] for day in weekday_numbers], fontsize=font_chart)
plt.yticks(fontsize=font_chart)

# Hide the offset text matplotlib may place above the y-axis.
plt.gca().get_yaxis().get_offset_text().set_visible(False)

# Annotate every bar; the text goes above positive bars and below negative ones so
# that it never overlaps the bar itself.
for bar in plt.gca().patches:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height, f"{height:.2f}", ha='center',
             va='bottom' if height >= 0 else 'top', fontsize=font_chart)

plt.title('Change of Count Distribution per Weekday from School to School Holidays', fontsize=font_title, weight=weight_title)

# Two-sample t-test per weekday: per-row counts during school holidays vs. school periods.
# NOTE(review): this compares the mean count per row, whereas the chart shows the
# weekday's share of the group total - confirm this is the intended hypothesis.
# NOTE(review): ttest_ind defaults to equal variances; with groups of very different
# size Welch's test (equal_var=False) may be more appropriate.
p_values = []
for weekday_number in weekday_numbers:
    isSchoolHoliday_true = df[(df['isSchoolHoliday'] == True) & (df['weekday_number'] == weekday_number)]['count']
    isSchoolHoliday_false = df[(df['isSchoolHoliday'] == False) & (df['weekday_number'] == weekday_number)]['count']
    _, p_value = stats.ttest_ind(isSchoolHoliday_true, isSchoolHoliday_false)
    p_values.append(p_value)

for weekday, p_value in zip(weekday_numbers, p_values):
    print(f"Weekday: {weekday_mapping[weekday]}, p-value: {p_value}")

# The figure is only displayed, not saved, by this cell.
plt.show()

### Average Count per Trip

In [None]:
# Mean count per row for every (school-holiday flag, weekday) pair;
# rows: isSchoolHoliday, columns: weekday_number.
average_counts_per_weekday_schoolHoliday = (
    df.groupby(['isSchoolHoliday', 'weekday_number'])['count'].mean().unstack()
)

weekday_numbers = sorted(df['weekday_number'].unique())
weekday_mapping = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday/Holiday'}

plt.figure(figsize=(10, 6))

# Two bars per weekday, shifted to either side of the tick: school period, then holidays.
for holiday_flag, shift, legend_text in ((False, -0.2, 'No School Holidays'), (True, 0.2, 'School Holidays')):
    plt.bar(
        [day + shift for day in weekday_numbers],
        average_counts_per_weekday_schoolHoliday.loc[holiday_flag],
        width=0.4,
        alpha=alpha,
        label=legend_text,
    )

plt.xlabel('Weekday', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Average Count per Trip', fontsize=font_label, weight=weight_label, labelpad=pad_label)

# Weekday names instead of numbers on the x-axis.
plt.xticks(weekday_numbers, [weekday_mapping[day] for day in weekday_numbers], fontsize=font_chart)
plt.yticks(fontsize=font_chart)

# Hide the offset text matplotlib may place above the y-axis.
plt.gca().get_yaxis().get_offset_text().set_visible(False)

# Write the average (two decimals) above every bar of both series.
for patch in plt.gca().patches:
    bar_top = patch.get_height()
    plt.text(patch.get_x() + patch.get_width() / 2, bar_top, str(round(bar_top, 2)), ha='center', va='bottom', fontsize=font_chart)

plt.legend(fontsize=font_chart, loc='lower right')

plt.title('Comparison of Average Count per Trip during School and School Holidays', fontsize=font_title, weight=weight_title)

# Write the figure to disk, then display it.
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Average_Count_per_Trip_per_Weekday_isSchoolHoliday.png', dpi=dpi)

plt.show()


## Count of Trips per Weekday per Daytime | isSchoolHoliday = False

In [None]:
# Rows outside school holidays, totalled per (weekday, daytime).
filtered_df = df[df['isSchoolHoliday'].eq(False)]
total_counts_per_weekday_daytime = filtered_df.groupby(['weekday_number', 'daytime'])['count'].sum()

# Axis order: weekdays ascending, daytime categories sorted (both taken from the full table).
weekday_numbers = sorted(df['weekday_number'].unique())
daytime_categories = sorted(df['daytime'].unique())
weekday_mapping = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday/Holiday'}

color_palette = plt.cm.tab20c.colors
bar_width = 0.04
# Distance from a weekday tick to the centre of the outermost bar of its group.
half_group_width = bar_width * (len(daytime_categories) - 1) / 2

plt.figure(figsize=(20, 10))

# One bar series per daytime category, fanned out symmetrically around each weekday tick.
for i, daytime_category in enumerate(daytime_categories):
    slot = i - (len(daytime_categories) - 1) / 2
    bars = plt.bar(
        [day + bar_width * slot for day in weekday_numbers],
        total_counts_per_weekday_daytime.xs(daytime_category, level='daytime'),
        alpha=alpha,
        width=bar_width,
        label=daytime_category,
        color=color_palette[i % len(color_palette)],  # cycle through the palette
    )

plt.xlabel('Weekday', fontsize=font_label+6, weight=weight_label, labelpad=pad_label)
plt.ylabel('Number of Trips', fontsize=font_label+6, weight=weight_label, labelpad=pad_label)

# Weekday names on the x-axis; thousands separators on the y-axis.
plt.xticks(weekday_numbers, [weekday_mapping[day] for day in weekday_numbers], fontsize=font_chart+6)
plt.yticks(fontsize=font_chart+6)
plt.gca().get_yaxis().set_major_formatter(lambda x, p: '{:,}'.format(int(x)))
plt.gca().get_yaxis().get_offset_text().set_visible(False)

# Start the x-axis at the first bar; the extra 7 % on the right leaves room beside Sunday.
plt.xlim(weekday_numbers[0] - half_group_width, (weekday_numbers[-1] + half_group_width) * 1.07)
plt.ylim(0, 1e6)

plt.legend(fontsize=font_chart+2, title='Daytime', loc='upper right')

plt.title('Number of Trips per Daytime and Weekday during School', fontsize=font_title+6, weight=weight_title)
plt.tight_layout()

# Write the figure to disk, then display it.
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Count_of_Trips_per_Weekday_per_Daytime_isSchoolHoliday_False.png', dpi=dpi)

plt.show()

In [None]:
# Rows outside school holidays, totalled per (weekday, daytime).
filtered_df = df[df['isSchoolHoliday'].eq(False)]
total_counts_per_weekday_daytime = filtered_df.groupby(['weekday_number', 'daytime'])['count'].sum()

# Divide every total by its weekday's total, so the daytime shares of one weekday add up to 1.
weekday_totals = total_counts_per_weekday_daytime.groupby('weekday_number').transform('sum')
normalized_counts_per_weekday_daytime = total_counts_per_weekday_daytime / weekday_totals

# Axis order: weekdays ascending, daytime categories sorted (both taken from the full table).
weekday_numbers = sorted(df['weekday_number'].unique())
daytime_categories = sorted(df['daytime'].unique())
weekday_mapping = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday/Holiday'}

color_palette = plt.cm.tab20c.colors
bar_width = 0.04
# Distance from a weekday tick to the centre of the outermost bar of its group.
half_group_width = bar_width * (len(daytime_categories) - 1) / 2

plt.figure(figsize=(20, 10))

# One bar series per daytime category, fanned out symmetrically around each weekday tick.
for i, daytime_category in enumerate(daytime_categories):
    slot = i - (len(daytime_categories) - 1) / 2
    bars = plt.bar(
        [day + bar_width * slot for day in weekday_numbers],
        normalized_counts_per_weekday_daytime.xs(daytime_category, level='daytime'),
        alpha=alpha,
        width=bar_width,
        label=daytime_category,
        color=color_palette[i % len(color_palette)],  # cycle through the palette
    )

plt.xlabel('Weekday', fontsize=font_label+6, weight='bold', labelpad=pad_label)
plt.ylabel('Normalized Count of Trips', fontsize=font_label+6, weight='bold', labelpad=pad_label)

# Weekday names on the x-axis; whole percentages on the y-axis.
plt.xticks(weekday_numbers, [weekday_mapping[day] for day in weekday_numbers], fontsize=font_chart+6)
plt.yticks(fontsize=font_chart+6)
plt.gca().yaxis.set_major_formatter(lambda x, pos: '{:.0%}'.format(x))

# Start the x-axis at the first bar; the extra 7 % on the right leaves room beside Sunday.
plt.xlim(weekday_numbers[0] - half_group_width, (weekday_numbers[-1] + half_group_width) * 1.07)

plt.legend(fontsize=font_chart+2, title='Daytime', loc='upper right')
plt.tight_layout()

# Write the figure to disk, then display it.
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Normalized_Count_of_Trips_per_Weekday_per_Daytime_isSchoolHoliday_False.png', dpi=dpi)

plt.show()

## Count of Trips per Weekday per Daytime | isSchoolHoliday = True

In [None]:
# Rows during school holidays, totalled per (weekday, daytime).
filtered_df = df[df['isSchoolHoliday'].eq(True)]
total_counts_per_weekday_daytime = filtered_df.groupby(['weekday_number', 'daytime'])['count'].sum()

# Axis order: weekdays ascending, daytime categories sorted (both taken from the full table).
weekday_numbers = sorted(df['weekday_number'].unique())
daytime_categories = sorted(df['daytime'].unique())
weekday_mapping = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday/Holiday'}

color_palette = plt.cm.tab20c.colors
bar_width = 0.04
# Distance from a weekday tick to the centre of the outermost bar of its group.
half_group_width = bar_width * (len(daytime_categories) - 1) / 2

plt.figure(figsize=(20, 10))

# One bar series per daytime category, fanned out symmetrically around each weekday tick.
for i, daytime_category in enumerate(daytime_categories):
    slot = i - (len(daytime_categories) - 1) / 2
    bars = plt.bar(
        [day + bar_width * slot for day in weekday_numbers],
        total_counts_per_weekday_daytime.xs(daytime_category, level='daytime'),
        alpha=alpha,
        width=bar_width,
        label=daytime_category,
        color=color_palette[i % len(color_palette)],  # cycle through the palette
    )

plt.xlabel('Weekday', fontsize=font_label+6, weight=weight_label, labelpad=pad_label)
plt.ylabel('Count of Trips', fontsize=font_label+6, weight=weight_label, labelpad=pad_label)

# Weekday names on the x-axis; thousands separators on the y-axis.
plt.xticks(weekday_numbers, [weekday_mapping[day] for day in weekday_numbers], fontsize=font_chart+6)
plt.yticks(fontsize=font_chart+6)
plt.gca().get_yaxis().set_major_formatter(lambda x, p: '{:,}'.format(int(x)))
plt.gca().get_yaxis().get_offset_text().set_visible(False)

# Start the x-axis at the first bar; the extra 7 % on the right leaves room beside Sunday.
plt.xlim(weekday_numbers[0] - half_group_width, (weekday_numbers[-1] + half_group_width) * 1.07)
plt.ylim(0, 1.8e5)

plt.legend(fontsize=font_chart+2, title='Daytime', loc='upper right')

plt.title('Count of Trips per Daytime and Weekday during School Holidays', fontsize=font_title+6, weight=weight_title)
plt.tight_layout()

# Write the figure to disk, then display it.
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Count_of_Trips_per_Weekday_per_Daytime_isSchoolHoliday_True.png', dpi=dpi)

plt.show()

In [None]:
# Rows during school holidays, totalled per (weekday, daytime).
filtered_df = df[df['isSchoolHoliday'] == True]
total_counts_per_weekday_daytime = filtered_df.groupby(['weekday_number', 'daytime'])['count'].sum()

# Divide every total by its weekday's total, so the daytime shares of one weekday add up to 1.
normalized_counts_per_weekday_daytime = total_counts_per_weekday_daytime / total_counts_per_weekday_daytime.groupby('weekday_number').transform('sum')

# Axis order: weekdays ascending, daytime categories sorted (both taken from the full table).
weekday_numbers = sorted(df['weekday_number'].unique())
daytime_categories = sorted(df['daytime'].unique())

color_palette = plt.cm.tab20c.colors

plt.figure(figsize=(20, 10))

bar_width = 0.04

# One bar series per daytime category, fanned out symmetrically around each weekday tick.
for i, daytime_category in enumerate(daytime_categories):
    bars = plt.bar(
        [day + bar_width * (i - (len(daytime_categories) - 1) / 2) for day in weekday_numbers],
        normalized_counts_per_weekday_daytime.xs(daytime_category, level='daytime'),
        alpha=alpha,
        width=bar_width,
        label=daytime_category,
        color=color_palette[i % len(color_palette)],  # cycle through the palette
    )

# Font sizes (+6 for labels and ticks, +2 for the legend) and tight_layout() follow
# the other 20x10 daytime figures of this notebook, so the four saved charts share
# one look.
plt.xlabel('Weekday', fontsize=font_label+6, weight='bold', labelpad=pad_label)
plt.ylabel('Normalized Count of Trips', fontsize=font_label+6, weight='bold', labelpad=pad_label)

# Replace the weekday numbers on the x-axis by weekday names.
weekday_mapping = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday/Holiday'}
plt.xticks(weekday_numbers, [weekday_mapping[day] for day in weekday_numbers], fontsize=font_chart+6)

plt.yticks(fontsize=font_chart+6)
plt.gca().yaxis.set_major_formatter('{:.0%}'.format)  # y-axis ticks as whole percentages

# Start the x-axis at the first bar; the extra 7 % on the right leaves room beside Sunday.
plt.xlim(weekday_numbers[0] - bar_width * (len(daytime_categories) - 1) / 2, (weekday_numbers[-1] + bar_width * (len(daytime_categories) - 1) / 2) * 1.07)

plt.legend(fontsize=font_chart+2, title='Daytime', loc='upper right')
plt.tight_layout()

# NOTE(review): absolute, user-specific output directory.
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Normalized_Count_of_Trips_per_Weekday_per_Daytime_isSchoolHoliday_True.png', dpi=dpi)

plt.show()

In [None]:
def _daytime_totals_and_shares(frame):
    """Return (totals, shares) of 'count' per (weekday_number, daytime) for ``frame``.

    ``shares`` is ``totals`` divided by the total of the same weekday.
    """
    totals = frame.groupby(['weekday_number', 'daytime'])['count'].sum()
    return totals, totals / totals.groupby('weekday_number').transform('sum')


# School holidays (isSchoolHoliday == True).
filtered_df_school_holiday = df[df['isSchoolHoliday'].eq(True)]
(total_counts_per_weekday_daytime_school_holiday,
 normalized_counts_per_weekday_daytime_school_holiday) = _daytime_totals_and_shares(filtered_df_school_holiday)

# School periods (isSchoolHoliday == False).
filtered_df_non_school_holiday = df[df['isSchoolHoliday'].eq(False)]
(total_counts_per_weekday_daytime_non_school_holiday,
 normalized_counts_per_weekday_daytime_non_school_holiday) = _daytime_totals_and_shares(filtered_df_non_school_holiday)

# Change of each daytime share when going from school periods to school holidays.
difference = normalized_counts_per_weekday_daytime_school_holiday - normalized_counts_per_weekday_daytime_non_school_holiday

# Axis order: weekdays ascending, daytime categories sorted (both taken from the full table).
weekday_numbers = sorted(df['weekday_number'].unique())
daytime_categories = sorted(df['daytime'].unique())
weekday_mapping = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday/Holiday'}

color_palette = plt.cm.tab20c.colors
bar_width = 0.04

plt.figure(figsize=(20, 10))

# One bar series per daytime category; shares are scaled by 100 to percentage points.
for i, daytime_category in enumerate(daytime_categories):
    slot = i - (len(daytime_categories) - 1) / 2
    bars = plt.bar(
        [day + bar_width * slot for day in weekday_numbers],
        difference.xs(daytime_category, level='daytime') * 100,
        alpha=alpha,
        width=bar_width,
        label=daytime_category,
        color=color_palette[i % len(color_palette)],  # cycle through the palette
    )

plt.xlabel('Weekday', fontsize=font_label, weight='bold', labelpad=pad_label)
plt.ylabel('Change in Percentage Points', fontsize=font_label, weight='bold', labelpad=pad_label)

# Weekday names instead of numbers on the x-axis.
plt.xticks(weekday_numbers, [weekday_mapping[day] for day in weekday_numbers], fontsize=font_chart)
plt.yticks(fontsize=font_chart)

# Define a function to format the y-axis ticks with "+" or "-"
def format_y_ticks(value, pos):
    """Format a y-axis tick as a signed number for a 'change' axis.

    Positive values get an explicit '+' prefix, negative values keep their
    '-' sign and zero is shown as a plain '0'. Fractional ticks keep their
    fraction (e.g. '+0.5') instead of being truncated to an integer, so
    neighbouring ticks can never collapse onto the same label.

    Args:
        value: Tick position in data units.
        pos: Tick index supplied by matplotlib's FuncFormatter; unused.

    Returns:
        The tick label as a string.
    """
    if value == 0:
        return '0'
    if float(value).is_integer():
        # Whole numbers are printed in full, without exponent notation.
        return f'{int(value):+d}'
    # 'g' drops floating-point noise and trailing zeros; '+' forces the sign.
    return f'{value:+g}'

# Apply the custom formatter to the y-axis ticks.
# It is passed directly instead of being assigned to the notebook-wide name
# `formatter`: that name holds the millions formatter created near the top of
# the notebook, and later cells pass it to set_major_formatter() for their
# "[million]" axes. Rebinding it here would silently change those plots.
plt.gca().yaxis.set_major_formatter(FuncFormatter(format_y_ticks))

# Set the x-axis limits to reduce empty space on the left of Monday
plt.xlim(weekday_numbers[0] - bar_width * (len(daytime_categories) - 1) / 2, (weekday_numbers[-1] + bar_width * (len(daytime_categories) - 1) / 2) * 1.09)

# Show the legend
plt.legend(fontsize=font_chart, title='Daytime', loc='upper right')

# Save the plot as a PNG image
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Change_in_Percentage_Points_of_Trips_per_Weekday_per_Daytime.png', dpi=dpi)

plt.show()

In [None]:
# Relative change (in percent) of each daytime's share of trips per weekday,
# going from non-holiday days (isSchoolHoliday=False) to school holidays
# (isSchoolHoliday=True).

# Split the data by the isSchoolHoliday flag
filtered_df_school_holiday = df[df['isSchoolHoliday'] == True]
filtered_df_non_school_holiday = df[df['isSchoolHoliday'] == False]

# Total trip count per (weekday_number, daytime) group
total_counts_per_weekday_daytime_school_holiday = filtered_df_school_holiday.groupby(['weekday_number', 'daytime'])['count'].sum()
total_counts_per_weekday_daytime_non_school_holiday = filtered_df_non_school_holiday.groupby(['weekday_number', 'daytime'])['count'].sum()

# Share of every daytime within its weekday, so the shares of one weekday sum to 1
normalized_counts_per_weekday_daytime_school_holiday = total_counts_per_weekday_daytime_school_holiday / total_counts_per_weekday_daytime_school_holiday.groupby('weekday_number').transform('sum')
normalized_counts_per_weekday_daytime_non_school_holiday = total_counts_per_weekday_daytime_non_school_holiday / total_counts_per_weekday_daytime_non_school_holiday.groupby('weekday_number').transform('sum')

# Change of the share relative to the non-holiday share, in percent
relative_difference_percentage = ((normalized_counts_per_weekday_daytime_school_holiday - normalized_counts_per_weekday_daytime_non_school_holiday) / normalized_counts_per_weekday_daytime_non_school_holiday) * 100

# Weekdays (x-axis groups) and daytimes (bars within a group), both sorted
weekday_numbers = sorted(df['weekday_number'].unique())
daytime_categories = sorted(df['daytime'].unique())

# One colour per daytime category
color_palette = plt.cm.tab20c.colors

plt.figure(figsize=(20, 10))

bar_width = 0.04

# One bar series per daytime; the series are centred around the weekday tick
for i, daytime_category in enumerate(daytime_categories):
    # Reindex to the plotted weekdays so heights and x positions always have
    # the same length; a weekday without data for this daytime becomes a NaN
    # gap instead of a shape-mismatch error.
    heights = relative_difference_percentage.xs(daytime_category, level='daytime').reindex(weekday_numbers)
    bars = plt.bar(
        [day + bar_width * (i - (len(daytime_categories) - 1) / 2) for day in weekday_numbers],
        heights,
        alpha=alpha,
        width=bar_width,
        label=daytime_category,
        color=color_palette[i % len(color_palette)],  # cycle if there are more daytimes than colours
    )

plt.xlabel('Weekday', fontsize=font_label, weight='bold', labelpad=pad_label)
plt.ylabel('Relative Difference [%]', fontsize=font_label, weight='bold', labelpad=pad_label)

# Show weekday names instead of weekday numbers on the x-axis
weekday_mapping = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday/Holiday'}
plt.xticks(weekday_numbers, [weekday_mapping[day] for day in weekday_numbers], fontsize=font_chart)

plt.yticks(fontsize=font_chart)

# Tighten the x-axis around the outermost bars; the factors add a small margin
plt.xlim((weekday_numbers[0] - bar_width * (len(daytime_categories) - 1) / 2) * 1.15, (weekday_numbers[-1] + bar_width * (len(daytime_categories) - 1) / 2) * 1.09)

plt.legend(fontsize=font_chart, title='Daytime', loc='upper right')

plt.title('Change of Trip Count during School Holidays', fontsize=font_title, weight=weight_title)

# Save the plot as a PNG image named after the input CSV
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Relative_Difference_in_Percentage_of_Trips_per_Weekday_per_Daytime.png', dpi=dpi)

plt.show()

In [None]:
# Same relative-change plot as the unclipped version, but with the values
# limited to +/- `clip` percent so single outliers do not dominate the y-axis.
clip = 100

# Split the data by the isSchoolHoliday flag
filtered_df_school_holiday = df[df['isSchoolHoliday'] == True]
filtered_df_non_school_holiday = df[df['isSchoolHoliday'] == False]

# Total trip count per (weekday_number, daytime) group
total_counts_per_weekday_daytime_school_holiday = filtered_df_school_holiday.groupby(['weekday_number', 'daytime'])['count'].sum()
total_counts_per_weekday_daytime_non_school_holiday = filtered_df_non_school_holiday.groupby(['weekday_number', 'daytime'])['count'].sum()

# Share of every daytime within its weekday, so the shares of one weekday sum to 1
normalized_counts_per_weekday_daytime_school_holiday = total_counts_per_weekday_daytime_school_holiday / total_counts_per_weekday_daytime_school_holiday.groupby('weekday_number').transform('sum')
normalized_counts_per_weekday_daytime_non_school_holiday = total_counts_per_weekday_daytime_non_school_holiday / total_counts_per_weekday_daytime_non_school_holiday.groupby('weekday_number').transform('sum')

# Change of the share relative to the non-holiday share, in percent
relative_difference_percentage = ((normalized_counts_per_weekday_daytime_school_holiday - normalized_counts_per_weekday_daytime_non_school_holiday) / normalized_counts_per_weekday_daytime_non_school_holiday) * 100

# Limit the relative difference to the range [-clip, +clip] percent
relative_difference_percentage = np.clip(relative_difference_percentage, -clip, clip)

# Weekdays (x-axis groups) and daytimes (bars within a group), both sorted
weekday_numbers = sorted(df['weekday_number'].unique())
daytime_categories = sorted(df['daytime'].unique())

# One colour per daytime category
color_palette = plt.cm.tab20c.colors

plt.figure(figsize=(20, 10))

bar_width = 0.04

# One bar series per daytime; the series are centred around the weekday tick
for i, daytime_category in enumerate(daytime_categories):
    # Reindex to the plotted weekdays so heights and x positions always have
    # the same length; a weekday without data for this daytime becomes a NaN
    # gap instead of a shape-mismatch error.
    heights = relative_difference_percentage.xs(daytime_category, level='daytime').reindex(weekday_numbers)
    bars = plt.bar(
        [day + bar_width * (i - (len(daytime_categories) - 1) / 2) for day in weekday_numbers],
        heights,
        alpha=alpha,
        width=bar_width,
        label=daytime_category,
        color=color_palette[i % len(color_palette)],  # cycle if there are more daytimes than colours
    )

plt.xlabel('Weekday', fontsize=font_label, weight='bold', labelpad=pad_label)
plt.ylabel('Relative Difference [%]', fontsize=font_label, weight='bold', labelpad=pad_label)

# Show weekday names instead of weekday numbers on the x-axis
weekday_mapping = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday/Holiday'}
plt.xticks(weekday_numbers, [weekday_mapping[day] for day in weekday_numbers], fontsize=font_chart)

plt.yticks(fontsize=font_chart)

# Tighten the x-axis around the outermost bars; the factors add a small margin
plt.xlim((weekday_numbers[0] - bar_width * (len(daytime_categories) - 1) / 2) * 1.15, (weekday_numbers[-1] + bar_width * (len(daytime_categories) - 1) / 2) * 1.09)

plt.title(f'Change of Trip Count during School Holidays (clipped to {clip})', fontsize=font_title, weight=weight_title)

plt.legend(fontsize=font_chart, title='Daytime', loc='upper right')

# Save the plot as a PNG image; the clip value is part of the file name
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + f'__Relative_Difference_in_Percentage_of_Trips_per_Weekday_per_Daytime_clippedTo{clip}.png', dpi=dpi)

plt.show()

## Distance per Daytime

### isSchoolHoliday==False

In [None]:
# Mean trip distance per weekday and daytime for rows with isSchoolHoliday=False.
filtered_df = df[df['isSchoolHoliday'] == False]

# Mean of the 'distance' column per (weekday_number, daytime) group.
# NOTE(review): this is an unweighted mean over rows. If each row aggregates
# `count` trips, a count-weighted mean may be what is intended — confirm.
mean_distance_per_weekday_daytime = filtered_df.groupby(['weekday_number', 'daytime'])['distance'].mean()

# Weekdays (x-axis groups) and daytimes (bars within a group), both sorted
weekday_numbers = sorted(df['weekday_number'].unique())
daytime_categories = sorted(df['daytime'].unique())

# One colour per daytime category
color_palette = plt.cm.tab20c.colors

# Grouped bar chart: one bar per daytime within every weekday
plt.figure(figsize=(20, 10))

bar_width = 0.0375

for i, daytime_category in enumerate(daytime_categories):
    # Reindex to the plotted weekdays so heights and x positions always have
    # the same length; a weekday without data for this daytime becomes a NaN
    # gap instead of a shape-mismatch error.
    heights = mean_distance_per_weekday_daytime.xs(daytime_category, level='daytime').reindex(weekday_numbers)
    bars = plt.bar(
        [day + bar_width * (i - (len(daytime_categories) - 1) / 2) for day in weekday_numbers],
        heights,
        alpha=alpha,
        width=bar_width,
        label=daytime_category,
        color=color_palette[i % len(color_palette)],  # cycle if there are more daytimes than colours
    )

plt.xlabel('Weekday', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Mean Distance of Trips [km]', fontsize=font_label, weight=weight_label, labelpad=pad_label)

# Show weekday names instead of weekday numbers on the x-axis
weekday_mapping = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday/Holiday'}
plt.xticks(weekday_numbers, [weekday_mapping[day] for day in weekday_numbers], fontsize=font_chart)

# Format the y-axis labels with thousands separators. 'g' keeps fractional
# ticks such as 2.5 intact; truncating with int() would label them '2'.
plt.yticks(fontsize=font_chart)
plt.gca().get_yaxis().set_major_formatter(FuncFormatter(lambda x, p: f'{x:,g}'))

# Hide the axis offset text
plt.gca().get_yaxis().get_offset_text().set_visible(False)

# Tighten the x-axis around the outermost bars; the factors add a small margin
plt.xlim((weekday_numbers[0] - bar_width * (len(daytime_categories) - 1) / 2) * 1.2, (weekday_numbers[-1] + bar_width * (len(daytime_categories) - 1) / 2) * 1.09)

# Fixed y-range, identical to the school-holiday plot, so both are comparable
plt.ylim(0, 18)

plt.legend(fontsize=font_chart, title='Daytime', loc='upper right')

plt.title('Mean Distance of Trips during School', fontsize=font_title, weight=weight_title)

# Save the plot as a PNG image named after the input CSV
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Mean_Distance_of_Trips_per_Weekday_per_Daytime_isSchoolHoliday_False.png', dpi=dpi)

plt.show()

### isSchoolHoliday==True

In [None]:
# Mean trip distance per weekday and daytime for rows with isSchoolHoliday=True.
filtered_df_school_holiday = df[df['isSchoolHoliday'] == True]

# Mean of the 'distance' column per (weekday_number, daytime) group.
# NOTE(review): this is an unweighted mean over rows. If each row aggregates
# `count` trips, a count-weighted mean may be what is intended — confirm.
mean_distance_per_weekday_daytime_school_holiday = filtered_df_school_holiday.groupby(['weekday_number', 'daytime'])['distance'].mean()

# Weekdays (x-axis groups) and daytimes (bars within a group), both sorted
weekday_numbers = sorted(df['weekday_number'].unique())
daytime_categories = sorted(df['daytime'].unique())

# One colour per daytime category
color_palette = plt.cm.tab20c.colors

# Grouped bar chart: one bar per daytime within every weekday
plt.figure(figsize=(20, 10))

bar_width = 0.0375

for i, daytime_category in enumerate(daytime_categories):
    # Reindex to the plotted weekdays so heights and x positions always have
    # the same length; a weekday without data for this daytime becomes a NaN
    # gap instead of a shape-mismatch error.
    heights = mean_distance_per_weekday_daytime_school_holiday.xs(daytime_category, level='daytime').reindex(weekday_numbers)
    bars = plt.bar(
        [day + bar_width * (i - (len(daytime_categories) - 1) / 2) for day in weekday_numbers],
        heights,
        alpha=alpha,
        width=bar_width,
        label=daytime_category,
        color=color_palette[i % len(color_palette)],  # cycle if there are more daytimes than colours
    )

plt.xlabel('Weekday', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Mean Distance of Trips [km]', fontsize=font_label, weight=weight_label, labelpad=pad_label)

# Show weekday names instead of weekday numbers on the x-axis
weekday_mapping = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday/Holiday'}
plt.xticks(weekday_numbers, [weekday_mapping[day] for day in weekday_numbers], fontsize=font_chart)

# Format the y-axis labels with thousands separators. 'g' keeps fractional
# ticks such as 2.5 intact; truncating with int() would label them '2'.
plt.yticks(fontsize=font_chart)
plt.gca().get_yaxis().set_major_formatter(FuncFormatter(lambda x, p: f'{x:,g}'))

# Hide the axis offset text
plt.gca().get_yaxis().get_offset_text().set_visible(False)

# Tighten the x-axis around the outermost bars; the factors add a small margin
plt.xlim((weekday_numbers[0] - bar_width * (len(daytime_categories) - 1) / 2) * 1.2, (weekday_numbers[-1] + bar_width * (len(daytime_categories) - 1) / 2) * 1.09)

plt.legend(fontsize=font_chart, title='Daytime', loc='upper right')

plt.title('Mean Distance of Trips during School Holidays', fontsize=font_title, weight=weight_title)

# Fixed y-range, identical to the non-holiday plot, so both are comparable
plt.ylim(0, 18)

# Save the plot as a PNG image named after the input CSV
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Mean_Distance_of_Trips_per_Weekday_per_Daytime_isSchoolHoliday_True.png', dpi=dpi)

plt.show()

# Count of Trips over Time

## Month

In [None]:
# Trips per month, one bar per year within every month (2022 is excluded).
df_copy = df.copy()
df_copy = df_copy[df_copy['year'] != 2022]

# Total trip count per (year, month)
grouped_data = df_copy.groupby(['year', 'month'])['count'].sum().reset_index()

# Remember the calendar order of the months that occur BEFORE the numbers are
# replaced by names; taking the order of first appearance instead would put
# the months out of order whenever the first year does not start in January.
month_numbers = sorted(grouped_data['month'].unique())

# Convert month number to month name (`calendar` is imported at the top of the notebook)
grouped_data['month'] = grouped_data['month'].apply(lambda x: calendar.month_name[x])

# Month names in calendar order, and the years, define the bar layout
months = [calendar.month_name[m] for m in month_numbers]
years = grouped_data['year'].unique()
num_months = len(months)
num_years = len(years)

plt.figure(figsize=(12, 8))

bar_width = 0.2

# One bar series per year, shifted by one bar width per year inside a month
for i, year in enumerate(years):
    year_data = grouped_data[grouped_data['year'] == year]
    x = [j + i * bar_width for j in range(num_months)]

    # Align to the common month order; months without data get a 0-height bar
    year_data = year_data.set_index('month').reindex(months).fillna(0).reset_index()

    plt.bar(x, year_data['count'], width=bar_width, alpha=alpha, label=str(year))

# Centre each month label under its group of year bars
plt.xlabel('Month', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.xticks([j + (num_years - 1) * bar_width / 2 for j in range(num_months)], months, fontsize=font_chart, rotation=45, ha='right')

plt.ylabel('Count of Trips [million]', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.yticks(fontsize=font_chart)

plt.title('Count of Trips per Month and Year', fontsize=font_title, weight=weight_title)

# Hide the y-axis offset text (the "1e6" in the top-left corner); the unit is
# stated in the axis label instead
plt.gca().get_yaxis().get_offset_text().set_visible(False)
plt.ylim(0, 2.5e6)

plt.legend(fontsize=font_chart)

# Save the plot as a PNG image named after the input CSV
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Count_of_Trips_per_month.png', dpi=dpi)

plt.show()

In [None]:
# Trips in January only, one labelled bar per year.
df_copy = df[df['month'] == 1].copy()

# Total trip count per (year, month); only month 1 is left after the filter
grouped_data = df_copy.groupby(['year', 'month'])['count'].sum().reset_index()

# Replace the month number by its English name
grouped_data['month'] = grouped_data['month'].map(lambda number: calendar.month_name[number])

# Distinct months and years define how many bars are drawn and where
months = grouped_data['month'].unique()
years = grouped_data['year'].unique()
num_months, num_years = len(months), len(years)

plt.figure(figsize=(10, 8))

bar_width = 0.15      # width of a single bar
year_spacing = 0.02   # gap between the bars of consecutive years

for i, year in enumerate(years):
    year_data = grouped_data[grouped_data['year'] == year]
    x = [j + (i * (bar_width + year_spacing)) + (bar_width * (num_years - 1) / 2) for j in range(num_months)]

    # Align the rows to the common month order (no zero-filling is done here)
    year_data = year_data.set_index('month').reindex(months).reset_index()

    bars = plt.bar(x, year_data['count'], width=bar_width, alpha=1, label=str(year))

    # Write the value in millions (3 decimals) next to the top of every bar
    for bar in bars:
        yval = bar.get_height()
        label_x = bar.get_x() + bar.get_width() / 2
        plt.text(label_x, yval + 60000, round(yval / 1e6, 3), ha='center', va='top', fontsize=font_chart)

# x-axis: month label roughly centred under the group of year bars
plt.xlabel('Month', fontsize=font_label, weight=weight_label, labelpad=pad_label)
tick_positions = [j + ((num_years - 1) * (bar_width + year_spacing) * 0.94) for j in range(num_months)]
plt.xticks(tick_positions, months, fontsize=font_chart, ha='center')

# y-axis
plt.ylabel('Count of Trips [million]', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.yticks(fontsize=font_chart)

plt.title('Count of Trips per Month and Year', fontsize=font_title, weight=weight_title)

# Hide the y-axis offset text (the "1e6" in the top-left corner)
plt.gca().get_yaxis().get_offset_text().set_visible(False)
plt.ylim(0, 2e6)

plt.legend(fontsize=font_chart)

# Save the plot as a PNG image named after the input CSV
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Count_of_Trips_per_month_January.png', dpi=dpi)

plt.show()

In [None]:
# Trips per month, one bar per year within every month (compact figure).
# NOTE(review): the output file name below ends in "with2022", but 2022 is
# filtered out here — confirm whether the filter or the file name is intended.
df_copy = df.copy()
df_copy = df_copy[df_copy['year'] != 2022]

# Total trip count per (year, month)
grouped_data = df_copy.groupby(['year', 'month'])['count'].sum().reset_index()

# Remember the calendar order of the months that occur BEFORE the numbers are
# replaced by names; taking the order of first appearance instead would put
# the months out of order whenever the first year does not start in January.
month_numbers = sorted(grouped_data['month'].unique())

# Convert month number to month name (`calendar` is imported at the top of the notebook)
grouped_data['month'] = grouped_data['month'].apply(lambda x: calendar.month_name[x])

# Month names in calendar order, and the years, define the bar layout
months = [calendar.month_name[m] for m in month_numbers]
years = grouped_data['year'].unique()
num_months = len(months)
num_years = len(years)

plt.figure(figsize=(10, 6))

bar_width = 0.2

# One bar series per year, shifted by one bar width per year inside a month
for i, year in enumerate(years):
    year_data = grouped_data[grouped_data['year'] == year]
    x = [j + i * bar_width for j in range(num_months)]

    # Align to the common month order; months without data get a 0-height bar
    year_data = year_data.set_index('month').reindex(months).fillna(0).reset_index()

    plt.bar(x, year_data['count'], width=bar_width, alpha=alpha, label=str(year))

# Centre each month label under its group of year bars
plt.xlabel('Month', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.xticks([j + (num_years - 1) * bar_width / 2 for j in range(num_months)], months, fontsize=font_chart, rotation=45, ha='right')

plt.ylabel('Number of Trips [million]', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.yticks(fontsize=font_chart)

plt.title('Number of Trips per Month and Year', fontsize=font_title, weight=weight_title)

# Hide the y-axis offset text (the "1e6" in the top-left corner); the unit is
# stated in the axis label instead
plt.gca().get_yaxis().get_offset_text().set_visible(False)
plt.ylim(0, 2.5e6)

plt.legend(fontsize=font_chart)
plt.tight_layout()

# Save the plot as a PNG image named after the input CSV
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Count_of_Trips_per_month_with2022.png', dpi=dpi)

plt.show()

In [None]:
# Total distance travelled per month, one bar per year (2022 is excluded).
df_copy = df.copy()
df_copy = df_copy[df_copy['year'] != 2022]

# Distance covered by all trips of a row: count * distance
df_copy['TotalDistance'] = df_copy['count'] * df_copy['distance']

# Total count and total distance per (year, month)
grouped_data = df_copy.groupby(['year', 'month'])[['count', 'TotalDistance']].sum().reset_index()

# Remember the calendar order of the months that occur BEFORE the numbers are
# replaced by names; taking the order of first appearance instead would put
# the months out of order whenever the first year does not start in January.
month_numbers = sorted(grouped_data['month'].unique())

# Convert month number to month name
grouped_data['month'] = grouped_data['month'].apply(lambda x: calendar.month_name[x])

# Month names in calendar order, and the years, define the bar layout
months = [calendar.month_name[m] for m in month_numbers]
years = grouped_data['year'].unique()
num_months = len(months)
num_years = len(years)

plt.figure(figsize=(10, 6))

bar_width = 0.2

# One bar series per year, shifted by one bar width per year inside a month
for i, year in enumerate(years):
    year_data = grouped_data[grouped_data['year'] == year]
    x = [j + i * bar_width for j in range(num_months)]

    # Align to the common month order; months without data get a 0-height bar
    year_data = year_data.set_index('month').reindex(months).fillna(0).reset_index()

    plt.bar(x, year_data['TotalDistance'], width=bar_width, alpha=alpha, label=str(year))

# Centre each month label under its group of year bars
plt.xlabel('Month', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.xticks([j + (num_years - 1) * bar_width / 2 for j in range(num_months)], months, fontsize=font_chart, rotation=45, ha='right')

plt.ylabel('Distance Travelled [million km]', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.yticks(fontsize=font_chart)

plt.title('Distance Travelled per Month and Year', fontsize=font_title, weight=weight_title)

# Hide the y-axis offset text and label the ticks in millions. The formatter
# is built here from millions_formatter instead of reading the notebook-wide
# `formatter` variable, which other cells rebind to a different formatter.
plt.gca().get_yaxis().get_offset_text().set_visible(False)
plt.gca().yaxis.set_major_formatter(ticker.FuncFormatter(millions_formatter))
plt.ylim(0, 1.6e7)


plt.legend(fontsize=font_chart)
plt.tight_layout()
# Save the plot as a PNG image named after the input CSV
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Total_Distance_Travelled_per_month.png', dpi=dpi)

plt.show()

In [None]:
# Total distance travelled in January only, one bar per year.
df_copy = df.copy()
df_copy = df_copy[df_copy['month'] == 1]

# Distance covered by all trips of a row: count * distance
df_copy['TotalDistance'] = df_copy['count'] * df_copy['distance']

# Total count and total distance per (year, month)
grouped_data = df_copy.groupby(['year', 'month'])[['count', 'TotalDistance']].sum().reset_index()

# Convert month number to month name
grouped_data['month'] = grouped_data['month'].apply(lambda x: calendar.month_name[x])

# Distinct months and years define how many bars are drawn and where
months = grouped_data['month'].unique()
years = grouped_data['year'].unique()
num_months = len(months)
num_years = len(years)

plt.figure(figsize=(10, 7))

bar_width = 0.1

# One bar per year, shifted by one bar width per year
for i, year in enumerate(years):
    year_data = grouped_data[grouped_data['year'] == year]
    x = [j + i * bar_width for j in range(num_months)]

    # Align to the common month order; months without data get a 0-height bar
    year_data = year_data.set_index('month').reindex(months).fillna(0).reset_index()

    plt.bar(x, year_data['TotalDistance'], width=bar_width, alpha=alpha, label=str(year))

# Centre the month label under its group of year bars
plt.xlabel('Month', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.xticks([j + (num_years - 1) * bar_width / 2 for j in range(num_months)], months, fontsize=font_chart, rotation=45, ha='right')

plt.ylabel('Distance Travelled [million km]', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.yticks(fontsize=font_chart)

plt.title('Distance Travelled per Month and Year', fontsize=font_title, weight=weight_title)

# Hide the y-axis offset text and label the ticks in millions. The formatter
# is built here from millions_formatter instead of reading the notebook-wide
# `formatter` variable, which other cells rebind to a different formatter.
plt.gca().get_yaxis().get_offset_text().set_visible(False)
plt.gca().yaxis.set_major_formatter(ticker.FuncFormatter(millions_formatter))
plt.ylim(0, 1.3e7)


plt.legend(fontsize=font_chart)

# Save the plot as a PNG image named after the input CSV
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Total_Distance_Travelled_per_month_January.png', dpi=dpi)

plt.show()

## Quarter

In [None]:
# Total number of trips per quarter as a labelled bar chart.
df_copy = df.copy()

# Sum of 'count' per value of the 'quarter' column
total_counts_per_quarter = df_copy.groupby('quarter')['count'].sum()

# Quarters in sorted order; their string form is used as x-axis category
quarters = sorted(df_copy['quarter'].unique())
quarter_labels = [str(q) for q in quarters]

plt.figure(figsize=(12, 8))

bar_width = 0.85

bars = plt.bar(quarter_labels, total_counts_per_quarter, alpha=alpha, width=bar_width)
plt.xlabel('Quarter', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Count of Trips [million]', fontsize=font_label, weight=weight_label, labelpad=pad_label)

plt.xticks(quarter_labels, fontsize=font_chart, rotation=45, ha='right')

# y-axis: derive the upper limit from the data (10 % headroom for the bar
# labels). The bars hold raw trip counts, so a fixed limit of 15 would cut
# every bar off. The limit is set before styling the ticks so the tick
# positions are computed for the final range.
plt.ylim(0, total_counts_per_quarter.max() * 1.1)
plt.yticks(fontsize=font_chart)

# Hide the y-axis offset text (the "1e6" in the top-left corner)
plt.gca().get_yaxis().get_offset_text().set_visible(False)

# Write each bar's value on top of it, in millions to match the axis unit
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height, str(round(height / 1e6, 2)), ha='center', va='bottom', fontsize=font_chart)

# Label the ticks in millions. The formatter is built here from
# millions_formatter instead of reading the notebook-wide `formatter`
# variable, which other cells rebind to a different formatter.
plt.gca().yaxis.set_major_formatter(ticker.FuncFormatter(millions_formatter))

# Save the plot as a PNG image named after the input CSV
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Count_of_Trips_per_Quarter.png', dpi=dpi)

plt.show()


In [None]:
# Total number of trips per year as a labelled bar chart (2022 is excluded).
# The variable names still say "quarter" but hold per-year values here; they
# are kept unchanged because other cells may read them.
df_copy = df.copy()

df_copy = df_copy[df_copy['year'] != 2022]

# Sum of 'count' per year
total_counts_per_quarter = df_copy.groupby('year')['count'].sum()

# Years in sorted order; their string form is used as x-axis category
quarters = sorted(df_copy['year'].unique())
quarter_labels = [str(q) for q in quarters]

plt.figure(figsize=(8, 6))

bar_width = 0.85

bars = plt.bar(quarter_labels, total_counts_per_quarter, alpha=alpha, width=bar_width)
plt.xlabel('Year', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Number of Trips [million]', fontsize=font_label, weight=weight_label, labelpad=pad_label)

plt.xticks(quarter_labels, fontsize=font_chart, ha='center')

plt.yticks(fontsize=font_chart)

# Hide the y-axis offset text (the "1e6" in the top-left corner)
plt.gca().get_yaxis().get_offset_text().set_visible(False)

# Write each bar's value on top of it, in millions with two decimals
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height, str(round(int(height) / 1e6, 2)), ha='center', va='bottom', fontsize=font_chart)

# Label the ticks in millions. The formatter is built here from
# millions_formatter instead of reading the notebook-wide `formatter`
# variable, which other cells rebind to a different formatter.
plt.gca().yaxis.set_major_formatter(ticker.FuncFormatter(millions_formatter))
plt.title("Number of Trips per Year", fontsize=font_title, weight=weight_title)
# 10 % headroom above the tallest bar for the value labels
plt.ylim(0, max(total_counts_per_quarter) * 1.1)
plt.tight_layout()
# Save the plot as a PNG image named after the input CSV
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Count_of_Trips_per_Year.png', dpi=dpi)

plt.show()


In [None]:
# Total distance travelled per year as a labelled bar chart (2022 is excluded).
# The variable names still say "quarter"/"counts" but hold per-year distance
# totals here; they are kept unchanged because later cells read
# `total_counts_per_quarter`.
df_copy = df.copy()
df_copy = df_copy[df_copy['year'] != 2022]
# Distance covered by all trips of a row: count * distance
df_copy['TotalDistance'] = df_copy['count'] * df_copy['distance']

# Sum of the total distance per year
total_counts_per_quarter = df_copy.groupby('year')['TotalDistance'].sum()

# Years in sorted order; their string form is used as x-axis category
quarters = sorted(df_copy['year'].unique())
quarter_labels = [str(q) for q in quarters]

plt.figure(figsize=(8, 6))

bar_width = 0.85

bars = plt.bar(quarter_labels, total_counts_per_quarter, alpha=alpha, width=bar_width)
plt.xlabel('Year', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Distance Travelled [million km]', fontsize=font_label, weight=weight_label, labelpad=pad_label)

plt.xticks(quarter_labels, fontsize=font_chart, ha='center')

plt.yticks(fontsize=font_chart)

# Hide the y-axis offset text (the "1e6" in the top-left corner)
plt.gca().get_yaxis().get_offset_text().set_visible(False)

# Write each bar's value on top of it, in millions with one decimal
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height, str(round(int(height) / 1e6, 1)), ha='center', va='bottom', fontsize=font_chart)

# Label the ticks in millions. The formatter is built here from
# millions_formatter instead of reading the notebook-wide `formatter`
# variable, which other cells rebind to a different formatter.
plt.gca().yaxis.set_major_formatter(ticker.FuncFormatter(millions_formatter))

plt.title("Distance Travelled per Year", fontsize=font_title, weight=weight_title)
# 10 % headroom above the tallest bar for the value labels
plt.ylim(0, max(total_counts_per_quarter) * 1.1)

# Save the plot as a PNG image named after the input CSV
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Distance_Travelled_per_Year.png', dpi=dpi)

plt.show()

In [None]:
# Turn the grouped totals (a Series whose index holds the group keys) into a
# table with the keys as a regular column, then display it.
# The isinstance guard keeps this cell idempotent: calling reset_index() again
# on the resulting DataFrame would add a spurious 'index' column on every
# re-run of the cell.
if isinstance(total_counts_per_quarter, pd.Series):
    total_counts_per_quarter = total_counts_per_quarter.reset_index()
total_counts_per_quarter

In [None]:
# Write the table to an Excel workbook. Using the writer as a context manager
# saves and closes the file on exit, also when to_excel() raises.
with pd.ExcelWriter('Distance_Travelled_per_Year.xlsx', engine='xlsxwriter') as excel_writer:
    total_counts_per_quarter.to_excel(excel_writer, sheet_name="Distance_Travelled_per_Year", index=False)

# Count of Trips per Distance

In [None]:
# Trips per distance value, one bar per year within every distance (2022 is
# excluded). All distances of 20 and above are merged into one '20+' bucket.
grouped_data = df.copy()
grouped_data = grouped_data[grouped_data['year'] != 2022]
grouped_data['distance'] = grouped_data['distance'].apply(lambda x: '20+' if x >= 20 else x)


# Total trip count per (year, distance)
grouped_data = grouped_data.groupby(['year', 'distance'])['count'].sum().reset_index()

# Years in sorted order
years = sorted(grouped_data['year'].unique())

# Common x-axis categories for all years: numeric distances ascending,
# followed by the '20+' bucket if it occurs
present_distances = list(grouped_data['distance'].unique())
distance_labels = sorted(d for d in present_distances if d != '20+') + [d for d in present_distances if d == '20+']

plt.figure(figsize=(10, 7))

bar_width = 0.225

# One bar series per year, shifted by one bar width per year inside a distance
for i, year in enumerate(years):
    year_data = grouped_data[grouped_data['year'] == year]
    # Align the counts to the common distance categories so that a bar is
    # always drawn above its own tick, even if a year lacks some distance
    # (that distance then gets a 0-height bar).
    year_counts = year_data.set_index('distance')['count'].reindex(distance_labels, fill_value=0)
    x = [j + i * bar_width for j in range(len(distance_labels))]

    plt.bar(x, year_counts, width=bar_width, alpha=alpha, label=str(year))

plt.xlabel('Distance [km]', fontsize=font_label, fontweight=weight_label, labelpad=pad_label)
plt.ylabel('Number of Trips [million]', fontsize=font_label, fontweight=weight_label, labelpad=pad_label)
plt.title('Number of Trips per Distance and Year', fontsize=font_title, fontweight=weight_title)

# Centre each distance label under its group of year bars
plt.xticks([j + (len(years) - 1) * bar_width / 2 for j in range(len(distance_labels))], distance_labels, fontsize=font_chart)

plt.yticks(fontsize=font_chart)
# Upper limit from the tallest bar of ALL years (20 % headroom), not only from
# the year that happened to be plotted last
plt.ylim(0, grouped_data['count'].max() * 1.2)

# Hide the y-axis offset text (the "1e6" in the top-left corner)
plt.gca().get_yaxis().get_offset_text().set_visible(False)

plt.legend(fontsize=font_chart)
plt.tight_layout()
# Save the plot as a PNG image named after the input CSV
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Count_of_Trips_per_Distance_Grouped_byYear.png', dpi=dpi)

plt.show()

In [None]:
# Keep a snapshot of the aggregated table, because `grouped_data` is reassigned in the next cell.
# NOTE(review): `year` is also used as the loop variable in the plotting cells and
# gets overwritten there — consider a more specific name for this snapshot.
year = grouped_data.copy()
year

In [None]:
# Grouped bar chart: trips per distance bin for 2019 ("Before Covid-19")
# versus April 2020 - March 2021 ("During Covid-19").

# Copy the DataFrame to avoid modifying the original data
grouped_data = df.copy()

# Pool every distance of 20 or more into a single '20+' bin
grouped_data['distance'] = grouped_data['distance'].apply(lambda x: '20+' if x >= 20 else x)

# Keep all of 2019 plus April 2020 through March 2021
grouped_data = grouped_data[grouped_data['year'].isin([2019]) | ((grouped_data['year'] == 2020) & (grouped_data['month'].between(4, 12))) | ((grouped_data['year'] == 2021) & (grouped_data['month'].between(1, 3)))]

# Replace the calendar year by the period it belongs to
grouped_data['year'] = grouped_data['year'].apply(lambda x: 'During Covid-19' if x >= 2020 else 'Before Covid-19')

# Total trips per (period, distance bin)
grouped_data = grouped_data.groupby(['year', 'distance'])['count'].sum().reset_index()

# Distance bins and period labels ('Before ...' sorts ahead of 'During ...')
distances = grouped_data['distance'].unique()
years = sorted(grouped_data['year'].unique())

# Two bars (before / during Covid-19) per distance bin
plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

bar_width = 0.4  # Adjust the width of the bars here

for i, year in enumerate(years):
    year_data = grouped_data[grouped_data['year'] == year]
    # Align the counts on the shared bin list. Without this, a bin that is
    # missing in one period makes `x` and the heights differ in length and
    # plt.bar raises a shape-mismatch error.
    year_counts = year_data.set_index('distance')['count'].reindex(distances, fill_value=0)
    x = [j + i * bar_width for j in range(len(distances))]

    plt.bar(x, year_counts.to_numpy(), width=bar_width, alpha=alpha, label=str(year))

plt.xlabel('Distance [km]', fontsize=font_label, fontweight=weight_label, labelpad=pad_label)
plt.ylabel('Count of Trips [million]', fontsize=font_label, fontweight=weight_label, labelpad=pad_label)
plt.title('Count of Trips per Distance Before and During Covid-19', fontsize=font_title, fontweight=weight_title)

# Centre each tick under its pair of bars
plt.xticks([j + (len(years) - 1) * bar_width / 2 for j in range(len(distances))], distances, fontsize=font_chart)

plt.yticks(fontsize=font_chart)

# Remove y-axis offset (1e6 on the top left corner); the unit is in the axis label
plt.gca().get_yaxis().get_offset_text().set_visible(False)
# Fixed upper limit of 3.5 million trips
plt.ylim(0, 3.5e6)

# Show the legend
plt.legend(fontsize=font_chart)

# Apply the tight layout before saving so the PNG matches the displayed figure
plt.tight_layout()
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Count_of_Trips_per_Distance_Grouped_Covid.png', dpi=dpi)

plt.show()


In [None]:
# Inspect the labels that were used for the bar series in the plot above.
years

In [None]:
# Inspect the aggregated table that the following cells export to Excel.
grouped_data

In [None]:
# Open the workbook 'daten.xlsx' that the following cells fill sheet by sheet.
# NOTE(review): the writer stays open across several cells and is only closed after the
# "weekday" sheet has been written; if a cell in between fails or is skipped, the writer
# is never closed. Consider doing the whole export in one cell with `with pd.ExcelWriter(...)`.
excel_writer = pd.ExcelWriter('daten.xlsx', engine='xlsxwriter')

In [None]:
# Add the current `grouped_data` table as sheet "Count_per_Distance_Covid" (row index omitted).
# Requires the `excel_writer` opened in the previous cell to be still open.
grouped_data.to_excel(excel_writer, sheet_name="Count_per_Distance_Covid", index=False)

In [None]:
# Total trip count for every (weekday, daytime, school-holiday flag) combination,
# returned as a flat table with the grouping keys as ordinary columns.
daytime = df.groupby(['weekday', 'daytime', 'isSchoolHoliday'], as_index=False)['count'].sum()

In [None]:
# Total trip count for every (weekday, school-holiday flag) combination,
# returned as a flat table with the grouping keys as ordinary columns.
weekday = df.groupby(['weekday', 'isSchoolHoliday'], as_index=False)['count'].sum()

In [None]:
# Add the two aggregated tables as further sheets (row index omitted) and then
# close the writer that was opened in an earlier cell.
daytime.to_excel(excel_writer, sheet_name="daytime", index=False)
weekday.to_excel(excel_writer, sheet_name="weekday", index=False)
excel_writer.close()

In [None]:
# Bar chart of the total number of trips per distance.
# NOTE(review): title and file name say "Covid-19", but no time filter is applied
# here — the chart covers all rows of `df`. Confirm that this is intended.

# Total trips per distance; groupby returns the distances as a sorted index
total_counts_per_distance = df.groupby('distance')['count'].sum()

# Take the x positions from the aggregated index so that positions and heights
# always have the same length and order (groupby drops missing distances,
# whereas `unique()` on the raw column would keep them)
distances = list(total_counts_per_distance.index)

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

bar_width = 0.85  # Adjust the width of the bars here

bars = plt.bar(distances, total_counts_per_distance, alpha=alpha, width=bar_width)
plt.xlabel('Distance [km]', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Count of Trips [million]', fontsize=font_label, weight=weight_label, labelpad=pad_label)

# One tick per distance
plt.xticks(distances, fontsize=font_chart)

# Modify y axis
plt.yticks(fontsize=font_chart)

# Remove y-axis offset (1e6 on the top left corner); the unit is in the axis label
plt.gca().get_yaxis().get_offset_text().set_visible(False)

# Write the value in millions (two decimals) on top of each bar
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height, str(round(int(height) / 1e6, 2)), ha='center', va='bottom', fontsize=font_chart)

# Trim the empty space left and right of the bars. The left padding is
# subtracted from the first bar position; multiplying the whole position by the
# factor (as before) only gave the intended result when the first distance was 0.
factor = 1.8
plt.xlim(distances[0] - bar_width / 2 * factor, (distances[-1] + bar_width / 2) * 1.015)

# Tick labels via the shared `formatter` defined near the top of the notebook
plt.gca().yaxis.set_major_formatter(formatter)

plt.title('Count of Trips per Distance Covid-19', fontsize=font_title, weight=weight_title)

# Save the plot as PNG image
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Count_of_Trips_per_Distance_Covid.png', dpi=dpi)

plt.show()


In [None]:
# Bar chart of the total number of trips per distance before Covid-19
# (all rows before 2020-04), with distances of 20 or more pooled at 20.

# Keep only rows whose 'year-month' is before '2020-04'. The explicit copy makes
# the frame independent of `df`, so the column assignment below does not raise
# pandas' SettingWithCopyWarning.
filtered_df = df[df['year-month'] < '2020-04'].copy()

# Pool every distance of 20 or more into the 20 bin
filtered_df['distance'] = filtered_df['distance'].clip(upper=20)

# Total trips per distance; groupby returns the distances as a sorted index
total_counts_per_distance = filtered_df.groupby('distance')['count'].sum()

# Take the x positions from the aggregated index so that positions and heights
# always have the same length and order
distances = list(total_counts_per_distance.index)

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

bar_width = 0.85  # Adjust the width of the bars here

bars = plt.bar(distances, total_counts_per_distance, alpha=alpha, width=bar_width)
plt.xlabel('Distance [km]', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Count of Trips [million]', fontsize=font_label, weight=weight_label, labelpad=pad_label)

# One tick per distance
plt.xticks(distances, fontsize=font_chart)

# Modify y axis
plt.yticks(fontsize=font_chart)

# Remove y-axis offset (1e6 on the top left corner); the unit is in the axis label
plt.gca().get_yaxis().get_offset_text().set_visible(False)

# Write the value in millions (two decimals) on top of each bar
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height, str(round(int(height) / 1e6, 2)), ha='center', va='bottom', fontsize=font_chart)

# Trim the empty space left and right of the bars. The left padding is
# subtracted from the first bar position; multiplying the whole position by the
# factor (as before) only gave the intended result when the first distance was 0.
factor = 1.8
plt.xlim(distances[0] - bar_width / 2 * factor, (distances[-1] + bar_width / 2) * 1.015)

plt.title('Count of Trips per Distance Pre-Covid-19', fontsize=font_title, weight=weight_title)

# Save the plot as PNG image
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Count_of_Trips_per_Distance_preCovid.png', dpi=dpi)

plt.show()


In [None]:
# Bar chart of the total number of trips per distance from 2020-04 onwards.
# NOTE(review): unlike the pre-Covid chart, distances are not pooled at 20 here
# and no title is set — confirm that this difference is intended.

# Keep only rows whose 'year-month' is '2020-04' or later
filtered_df = df[df['year-month'] >= '2020-04']

# Total trips per distance; groupby returns the distances as a sorted index
total_counts_per_distance = filtered_df.groupby('distance')['count'].sum()

# Take the x positions from the aggregated index so that positions and heights
# always have the same length and order
distances = list(total_counts_per_distance.index)

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

bar_width = 0.85  # Adjust the width of the bars here

bars = plt.bar(distances, total_counts_per_distance, alpha=alpha, width=bar_width)
plt.xlabel('Distance [km]', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Count of Trips [million]', fontsize=font_label, weight=weight_label, labelpad=pad_label)

# One tick per distance
plt.xticks(distances, fontsize=font_chart)

# Modify y axis
plt.yticks(fontsize=font_chart)

# Remove y-axis offset (1e6 on the top left corner); the unit is in the axis label
plt.gca().get_yaxis().get_offset_text().set_visible(False)

# Write the value in millions (two decimals) on top of each bar
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height, str(round(int(height) / 1e6, 2)), ha='center', va='bottom', fontsize=font_chart)

# Trim the empty space left and right of the bars. The left padding is
# subtracted from the first bar position; multiplying the whole position by the
# factor (as before) only gave the intended result when the first distance was 0.
factor = 1.8
plt.xlim(distances[0] - bar_width / 2 * factor, (distances[-1] + bar_width / 2) * 1.015)

# Save the plot as PNG image
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Count_of_Trips_per_Distance_postCovid.png', dpi=dpi)

plt.show()


In [None]:
# Bar chart of each distance's share of all trips before 2020-03.
# NOTE(review): the absolute-count charts split the data at '2020-04', this one
# at '2020-03' — confirm which cut-off is intended.

# Keep only rows whose 'year-month' is before '2020-03'
filtered_df = df[df['year-month'] < '2020-03']

# Total trips per distance; groupby returns the distances as a sorted index
total_counts_per_distance = filtered_df.groupby('distance')['count'].sum()

# Share of each distance in the overall number of trips (sums to 1)
normalized_counts_per_distance = total_counts_per_distance / total_counts_per_distance.sum()

# Take the x positions from the aggregated index so that positions and heights
# always have the same length and order
distances = list(normalized_counts_per_distance.index)

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

bar_width = 0.85  # Adjust the width of the bars here

bars = plt.bar(distances, normalized_counts_per_distance, alpha=alpha, width=bar_width)
plt.xlabel('Distance [km]', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Normalized Count of Trips', fontsize=font_label, weight=weight_label, labelpad=pad_label)

# One tick per distance
plt.xticks(distances, fontsize=font_chart)

# Modify y axis
plt.yticks(fontsize=font_chart)

# Write the share as a percentage (one decimal) on top of each bar
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height, f'{height:.1%}', ha='center', va='bottom', fontsize=font_chart)

# Hide the y-axis offset text (e.g. 1e-1 on the top left corner)
plt.gca().get_yaxis().get_offset_text().set_visible(False)

# Trim the empty space left and right of the bars. The left padding is
# subtracted from the first bar position; multiplying the whole position by the
# factor (as before) only gave the intended result when the first distance was 0.
factor = 1.8
plt.xlim(distances[0] - bar_width / 2 * factor, (distances[-1] + bar_width / 2) * 1.015)

# Save the plot as PNG image
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Normalized_Count_of_Trips_per_Distance_preCovid.png', dpi=dpi)

plt.show()


In [None]:
# Bar chart of each distance's share of all trips from 2020-03 onwards.
# NOTE(review): the absolute-count charts split the data at '2020-04', this one
# at '2020-03' — confirm which cut-off is intended.

# Keep only rows whose 'year-month' is '2020-03' or later
filtered_df = df[df['year-month'] >= '2020-03']

# Total trips per distance; groupby returns the distances as a sorted index
total_counts_per_distance = filtered_df.groupby('distance')['count'].sum()

# Share of each distance in the overall number of trips (sums to 1)
normalized_counts_per_distance = total_counts_per_distance / total_counts_per_distance.sum()

# Take the x positions from the aggregated index so that positions and heights
# always have the same length and order
distances = list(normalized_counts_per_distance.index)

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

bar_width = 0.85  # Adjust the width of the bars here

bars = plt.bar(distances, normalized_counts_per_distance, alpha=alpha, width=bar_width)
plt.xlabel('Distance [km]', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Normalized Count of Trips', fontsize=font_label, weight=weight_label, labelpad=pad_label)

# One tick per distance
plt.xticks(distances, fontsize=font_chart)

# Modify y axis
plt.yticks(fontsize=font_chart)

# Write the share as a percentage (one decimal) on top of each bar
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height, f'{height:.1%}', ha='center', va='bottom', fontsize=font_chart)

# Hide the y-axis offset text (e.g. 1e-1 on the top left corner)
plt.gca().get_yaxis().get_offset_text().set_visible(False)

# Trim the empty space left and right of the bars. The left padding is
# subtracted from the first bar position; multiplying the whole position by the
# factor (as before) only gave the intended result when the first distance was 0.
factor = 1.8
plt.xlim(distances[0] - bar_width / 2 * factor, (distances[-1] + bar_width / 2) * 1.015)

# Save the plot as PNG image
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Normalized_Count_of_Trips_per_Distance_postCovid.png', dpi=dpi)

plt.show()


In [None]:
# Assuming you have separate dataframes for before and after 2020-03
df_before_202003 = df[df['year-month'] < '2020-03']
df_after_202003 = df[df['year-month'] >= '2020-03']

# Group the DataFrames by 'distance' and calculate the total count within each group
total_counts_before = df_before_202003.groupby('distance')['count'].sum()
total_counts_after = df_after_202003.groupby('distance')['count'].sum()

# Normalize the counts by dividing each count by the total sum of counts
normalized_counts_before = total_counts_before / total_counts_before.sum()
normalized_counts_after = total_counts_after / total_counts_after.sum()

# Calculate the difference in normalized counts
difference = normalized_counts_after - normalized_counts_before

# Calculate the relative difference as a percentage
relative_difference = difference / normalized_counts_before * 100

# Replace any NaN or infinite values with zero
relative_difference = np.nan_to_num(relative_difference, nan=0.0, posinf=0.0, neginf=0.0)

# Get the unique distances in the order you want them to appear on the x-axis
distances = sorted(df['distance'].unique())

# Plot a bar chart to show the relative difference
plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

bar_width = 0.85  # Adjust the width of the bars here

bars = plt.bar(distances, relative_difference, alpha=alpha, width=bar_width)
plt.xlabel('Distance [km]', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Relative Difference in Normalized Count (%)', fontsize=font_label, weight=weight_label, labelpad=pad_label)

# Modify x-axis labels
plt.xticks(distances, fontsize=font_chart)

# Modify y axis
plt.yticks(fontsize=font_chart)

# Add percentage values on top of each bar
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height, f'{height:.2f}%', ha='center', va='bottom', fontsize=font_chart)

# Remove y-axis offset (1e-1 on the top left corner)
plt.gca().get_yaxis().get_offset_text().set_visible(False)

# Save the plots as PNG images
#plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Relative_Difference_in_Normalized_Count.png', dpi=dpi)

plt.show()

## relative difference with cap of distance=19

In [None]:
# Relative change (in %) of each distance's share of trips, period from 2020-03
# onwards compared with the period before 2020-03, with distances capped at 19.

# Split the data at 2020-03 and cap the distance at 19 in both parts.
# `assign` returns new frames, so no column is written into a slice of `df`
# (the earlier in-place assignment raised pandas' SettingWithCopyWarning).
df_before_202003 = df[df['year-month'] < '2020-03'].assign(distance=lambda part: part['distance'].clip(upper=19))
df_after_202003 = df[df['year-month'] >= '2020-03'].assign(distance=lambda part: part['distance'].clip(upper=19))

# Total trips per distance in each period
total_counts_before = df_before_202003.groupby('distance')['count'].sum()
total_counts_after = df_after_202003.groupby('distance')['count'].sum()

# Share of each distance within its period (each sums to 1)
normalized_counts_before = total_counts_before / total_counts_before.sum()
normalized_counts_after = total_counts_after / total_counts_after.sum()

# Change of the share; pandas aligns both Series on the distance, so a distance
# that occurs in only one period becomes NaN
difference = normalized_counts_after - normalized_counts_before

# Change relative to the share before, as a percentage
relative_difference = difference / normalized_counts_before * 100

# All distances of both periods, taken from the result's own index so that
# every bar is guaranteed to sit at the distance it was computed for
all_distances = list(relative_difference.index)

# Distances without a defined relative change (NaN: missing in one period,
# inf: zero share before) are drawn as 0
relative_difference = np.nan_to_num(relative_difference.to_numpy(dtype=float), nan=0.0, posinf=0.0, neginf=0.0)

plt.figure(figsize=(12, 8))  # Adjust the figure size if needed

bar_width = 0.85  # Adjust the width of the bars here

bars = plt.bar(all_distances, relative_difference, alpha=alpha, width=bar_width)
plt.xlabel('Distance [km]', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Relative Difference in Normalized Count [%]', fontsize=font_label, weight=weight_label, labelpad=pad_label)

# One tick per distance
plt.xticks(all_distances, fontsize=font_chart)

# Modify y axis
plt.yticks(fontsize=font_chart)

# Label each bar with its value: above the bar for positive values, below the
# bar end for negative ones, so the text never overlaps the bar itself
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height, f'{height:.1f}', ha='center', va='bottom' if height >= 0 else 'top', fontsize=font_chart)

# Hide the y-axis offset text (e.g. 1e-1 on the top left corner)
plt.gca().get_yaxis().get_offset_text().set_visible(False)

plt.title('Change of Trips per Distance Pre-Covid-19 vs. Covid-19', fontsize=font_title, weight=weight_title)

# Save the plot as PNG image
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__Relative_Difference_in_Normalized_Count.png', dpi=dpi)

plt.show()


# Anzahl der Werte (n) mit in die Grafik aufnehmen

Pre-Covid richtig normalisieren, sodass die x-Achse bis 21 km geht und der Wert dort dann 0 ist,
ODER Post-Covid auf 19 km cappen.
Vielleicht log10 ausprobieren.

ask chatgpt what analysis they would do

In [None]:
# Box plots of distance and of trip count for every month of the data set.

# Create a copy of the DataFrame so `df` keeps its original 'year-month' values
df_multivariate = df.copy()

# Parse the 'year-month' column into datetime objects
df_multivariate['year-month'] = pd.to_datetime(df_multivariate['year-month'])

# Express each month as a decimal year. January maps to year + 0/12 and December
# to year + 11/12; with `month / 12` (as before) every month was shifted by one,
# so December 2019 was labelled 2020.0.
df_multivariate['year-month'] = (
    df_multivariate['year-month'].dt.year + (df_multivariate['year-month'].dt.month - 1) / 12
)

# Distribution of the distance per month
plt.figure(figsize=(10, 6))
sns.boxplot(x='year-month', y='distance', data=df_multivariate)
plt.xlabel('year-month', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Distance [km]', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.title('Distribution of Distance by year-month', fontsize=font_title, weight=weight_title)
plt.show()

# Distribution of the trip count per month
plt.figure(figsize=(10, 6))
sns.boxplot(x='year-month', y='count', data=df_multivariate)
plt.xlabel('year-month', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Count of Trips', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.title('Distribution of Count by year-month', fontsize=font_title, weight=weight_title)
plt.show()


In [None]:
# Violin plot of the distance distribution per calendar quarter.

# Copy first so `df` keeps its original 'year-month' values
df_stats = df.copy()

# Parse 'year-month' and map every timestamp to its quarter in one step
df_stats['year-month'] = pd.to_datetime(df_stats['year-month']).dt.to_period('Q')

# Descriptive statistics of 'count' and 'distance' for every quarter
grouped_stats = (
    df_stats
    .groupby('year-month')[['count', 'distance']]
    .describe()
    .reset_index()
)

# Quarters in chronological order, used as the category order on the x-axis
sorted_buckets = sorted(df_stats['year-month'].unique())

# One violin per quarter, quartiles drawn inside
plt.figure(figsize=(12, 8))
sns.violinplot(
    data=df_stats,
    x='year-month',
    y='distance',
    order=sorted_buckets,
    inner='quartile',
    palette='rainbow',
)
plt.title('Distance Distribution for Each Quarter', fontsize=font_title, weight=weight_title)
plt.xlabel('Bucket', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Distance [km]', fontsize=font_label, weight=weight_label, labelpad=pad_label)

# Tilt the quarter labels by 45 degrees
plt.xticks(rotation=45)

plt.show()


In [None]:
# Monthly share of trips at one specific distance, with a linear trend over time.

# Work on a copy so `df` is never modified
df_copy = df.copy()

# Distance whose monthly share is analysed
distance = 11
df_distance_0 = df_copy.loc[df_copy['distance'] == distance]

# Monthly trip totals: for the selected distance and across all distances
grouped_data = df_distance_0.groupby(['year', 'month'], as_index=False)['count'].sum()
total_counts_per_month = df_copy.groupby(['year', 'month'], as_index=False)['count'].sum()

# Put both totals side by side ('count_distance', 'count_total') and take the ratio
grouped_data = pd.merge(
    grouped_data,
    total_counts_per_month,
    on=['year', 'month'],
    suffixes=('_distance', '_total'),
)
grouped_data['Normalized_Count'] = grouped_data['count_distance'].div(grouped_data['count_total'])

# Timestamp of the first day of each (year, month)
grouped_data['Date'] = pd.to_datetime(grouped_data[['year', 'month']].astype(int).assign(day=1))

# Regress the share on the ordinal day number of the date
X = grouped_data['Date'].map(pd.Timestamp.toordinal).to_numpy().reshape(-1, 1)
y = grouped_data['Normalized_Count'].to_numpy()

model = LinearRegression().fit(X, y)

# Fitted values on the observed dates
y_pred = model.predict(X)

# Observations as points, fitted trend as a red line
plt.figure(figsize=(12, 8))
plt.scatter(grouped_data['Date'], grouped_data['Normalized_Count'], label='Normalized Count', alpha=alpha)
plt.plot(grouped_data['Date'], y_pred, color='red', label='Linear Regression', linewidth=2)

# Major ticks at the start of each quarter, minor ticks every month, labels as YYYY-MM
plt.gca().xaxis.set_major_locator(mdates.MonthLocator((1, 4, 7, 10)))
plt.gca().xaxis.set_minor_locator(mdates.MonthLocator(bymonth=range(1, 13)))
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

# Hide the y-axis offset text (e.g. 1e-1 on the top left corner)
plt.gca().get_yaxis().get_offset_text().set_visible(False)

# Show December 2018 to January 2022, i.e. some spacing before and after the data
plt.xlim(pd.Timestamp('2018-12-02'), pd.Timestamp('2022-01-31'))

# y-axis from 0 to 20 % above the largest observed share
plt.ylim(0, y.max() * 1.2)

plt.xlabel('Date', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Normalized Count of Trips', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.title(f'Normalized Count Development for Distance = {distance} Over Time', fontsize=font_title, weight=weight_title)
plt.legend()
plt.xticks(rotation=45, fontsize=font_chart)

# Scalar tick labels on the y-axis; scientific notation only outside 1e-6 .. 1e6
plt.gca().yaxis.set_major_formatter(ticker.ScalarFormatter(useMathText=True))
plt.gca().ticklabel_format(axis='y', style='sci', scilimits=(-6, 6))

plt.yticks(fontsize=font_chart)

# Grid in the background
plt.grid(True)
plt.show()

# Goodness of fit and coefficients of the trend line
mse = np.mean(np.square(y - y_pred))
rmse = np.sqrt(mse)
print("R-squared:", model.score(X, y))
print("Slope (Coefficient):", model.coef_[0])
print("Intercept:", model.intercept_)
print("MSE:", mse)
print("RMSE:", rmse)
print()

In [None]:
# Monthly share of trips in the capped distance bin (19), with separate linear
# trends for the months before 2020-03 and from 2020-03 onwards.
# `y_before` and `y_after` are reused by the t-test cell that follows.

# Create a copy of the DataFrame so `df` is never modified
df_copy = df.copy()

# Distance bin whose monthly share is analysed
distance = 19

# Clip the distances to a maximum of 19, so this bin pools every longer trip
df_copy['distance'] = np.clip(df_copy['distance'], None, 19)

df_distance_0 = df_copy[df_copy['distance'] == distance]

# Monthly trip totals for the selected bin
grouped_data = df_distance_0.groupby(['year', 'month'])['count'].sum().reset_index()

# Monthly trip totals across all distances
total_counts_per_month = df_copy.groupby(['year', 'month'])['count'].sum().reset_index()

# Put both totals side by side ('count_distance', 'count_total')
grouped_data = grouped_data.merge(total_counts_per_month, on=['year', 'month'], suffixes=('_distance', '_total'))

# Share of the selected bin in all trips of the month
grouped_data['Normalized_Count'] = grouped_data['count_distance'] / grouped_data['count_total']

# Timestamp of the first day of each (year, month). The result already has a
# datetime dtype, so the second `pd.to_datetime` pass that used to follow was
# redundant and has been removed.
grouped_data['Date'] = pd.to_datetime(grouped_data[['year', 'month']].astype(int).assign(day=1))

# Split the data into two parts: before and from 2020-03 onwards
data_before_2020_03 = grouped_data[grouped_data['Date'] < pd.Timestamp('2020-03-01')]
data_after_2020_03 = grouped_data[grouped_data['Date'] >= pd.Timestamp('2020-03-01')]

# Fit linear regression for data before 2020-03 (x = ordinal day number)
X_before = data_before_2020_03['Date'].apply(lambda x: x.toordinal()).values.reshape(-1, 1)
y_before = data_before_2020_03['Normalized_Count'].values

model_before = LinearRegression()
model_before.fit(X_before, y_before)

# Fit linear regression for data from 2020-03 onwards
X_after = data_after_2020_03['Date'].apply(lambda x: x.toordinal()).values.reshape(-1, 1)
y_after = data_after_2020_03['Normalized_Count'].values

model_after = LinearRegression()
model_after.fit(X_after, y_after)

# Fitted values on the observed dates
y_pred_before = model_before.predict(X_before)
y_pred_after = model_after.predict(X_after)

# Largest observed share, used to scale the y-axis
y_max = max(y_before.max(), y_after.max())

# Create the plot
plt.figure(figsize=(12, 8))

# Observations and trend line for data before 2020-03
plt.scatter(data_before_2020_03['Date'], data_before_2020_03['Normalized_Count'], label='Normalized Count (Before 2020-03)', alpha=0.7)
plt.plot(data_before_2020_03['Date'], y_pred_before, color='red', linewidth=2)

# Observations and trend line for data from 2020-03 onwards
plt.scatter(data_after_2020_03['Date'], data_after_2020_03['Normalized_Count'], label='Normalized Count (After 2020-03)', alpha=0.7)
plt.plot(data_after_2020_03['Date'], y_pred_after, color='blue', linewidth=2)

# Major ticks at the start of each quarter, minor ticks every month, labels as YYYY-MM
plt.gca().xaxis.set_major_locator(mdates.MonthLocator((1, 4, 7, 10)))
plt.gca().xaxis.set_minor_locator(mdates.MonthLocator(bymonth=range(1, 13)))
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

# Show December 2018 to January 2022, i.e. some spacing before and after the data
plt.xlim(pd.Timestamp('2018-12-02'), pd.Timestamp('2022-01-31'))

# y-axis from 0 to 20 % above the largest observed share
plt.ylim(0, y_max * 1.2)

plt.xlabel('Date', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Normalized Count of Trips', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.title(f'Normalized Count for Distance = {distance} Pre-Covid-19 vs. Covid-19', fontsize=font_title, weight=weight_title)
plt.legend(fontsize=font_chart)
plt.xticks(rotation=45, fontsize=font_chart)
plt.yticks(fontsize=font_chart)

# Add the background grid BEFORE saving; it used to be switched on only after
# savefig, so the PNG lacked the grid that the displayed figure showed
plt.grid(True)

# Save the plot as PNG image
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + f'__normalized_count_distance_{distance}.png', dpi=dpi)

# Goodness of fit and coefficients for the segment before 2020-03
mse_before = np.mean((y_before - y_pred_before) ** 2)
rmse_before = np.sqrt(mse_before)
print("Before 2020-03:")
print("R-squared:", model_before.score(X_before, y_before))
print("Slope (Coefficient):", model_before.coef_[0])
print("Intercept:", model_before.intercept_)
print("MSE:", mse_before)
print("RMSE:", rmse_before)
print()

# Goodness of fit and coefficients for the segment from 2020-03 onwards
mse_after = np.mean((y_after - y_pred_after) ** 2)
rmse_after = np.sqrt(mse_after)
print("After 2020-03:")
print("R-squared:", model_after.score(X_after, y_after))
print("Slope (Coefficient):", model_after.coef_[0])
print("Intercept:", model_after.intercept_)
print("MSE:", mse_after)
print("RMSE:", rmse_after)
print()

plt.show()

In [None]:
# Two-sample t-test on the monthly normalized counts before vs. from March 2020 on.
# Uses `y_before` and `y_after` as left behind by the piecewise-regression cell,
# so that cell has to be run first.
from scipy.stats import ttest_ind

t_stat, p_value = ttest_ind(a=y_before, b=y_after)

# Only the p-value is reported
print(f"P-value: {p_value}")

In [None]:
# Monthly share of trips for every distance, each with its own linear trend line.

# Work on a copy so `df` is never modified
df_copy = df.copy()

# Monthly trip totals per distance, and across all distances
grouped_data = df_copy.groupby(['year', 'month', 'distance'], as_index=False)['count'].sum()
total_counts_per_month = df_copy.groupby(['year', 'month'], as_index=False)['count'].sum()

# Put both totals side by side ('count_distance', 'count_total') and take the ratio
grouped_data = pd.merge(
    grouped_data,
    total_counts_per_month,
    on=['year', 'month'],
    suffixes=('_distance', '_total'),
)
grouped_data['Normalized_Count'] = grouped_data['count_distance'].div(grouped_data['count_total'])

# Timestamp of the first day of each (year, month)
grouped_data['Date'] = pd.to_datetime(grouped_data[['year', 'month']].astype(int).assign(day=1))

# One scatter series plus fitted line per distance
distances = grouped_data['distance'].unique()
plt.figure(figsize=(12, 8))

# Colours from the 'tab20c' colormap, starting over when there are more
# distances than colours in the map
palette = plt.cm.tab20c.colors
colors = [palette[k % len(palette)] for k in range(len(distances))]

for i, distance in enumerate(distances):
    distance_data = grouped_data.loc[grouped_data['distance'] == distance]

    # Regress the share on the ordinal day number of the date
    X = distance_data['Date'].map(pd.Timestamp.toordinal).to_numpy().reshape(-1, 1)
    y = distance_data['Normalized_Count'].to_numpy()
    model = LinearRegression().fit(X, y)

    # Fitted values on the observed dates
    y_pred = model.predict(X)

    plt.scatter(distance_data['Date'], distance_data['Normalized_Count'], label=f'Distance = {distance}', color=colors[i], alpha=alpha)
    plt.plot(distance_data['Date'], y_pred, linewidth=2, color=colors[i])

# Major ticks at the start of each quarter, minor ticks every month, labels as YYYY-MM
plt.gca().xaxis.set_major_locator(mdates.MonthLocator((1, 4, 7, 10)))
plt.gca().xaxis.set_minor_locator(mdates.MonthLocator(bymonth=range(1, 13)))
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

# Show December 2018 to January 2022, i.e. some spacing before and after the data
plt.xlim(pd.Timestamp('2018-12-02'), pd.Timestamp('2022-01-31'))

plt.xlabel('Date', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.ylabel('Normalized Count of Trips', fontsize=font_label, weight=weight_label, labelpad=pad_label)
plt.title('Normalized Count Development for All Distances Over Time', fontsize=font_title, weight=weight_title)
plt.legend()
plt.xticks(rotation=45)
plt.show()

In [None]:
# List the distinct values of the 'Route' column.
# NOTE(review): in this part of the notebook the only visible assignment
# `df['Route'] = ...` sits two cells further down; on a fresh kernel this cell
# therefore works only if 'Route' already exists at this point — TODO confirm.
df.Route.unique()

In [None]:
# Horizontal bar chart of the ten routes with the most trips.

# A route is the pair of start and end cluster names in sorted order, so both
# directions of a connection share one label. The column is created here if it
# does not exist yet, so the cell also runs on a fresh kernel instead of
# depending on a later cell having been executed first.
if 'Route' not in df.columns:
    df['Route'] = df.apply(lambda row: ' - '.join(sorted([row['startClusterName'], row['endClusterName']])), axis=1)

# Total trips per route
grouped_data = df.groupby('Route')['count'].sum().reset_index()

# Most trafficked routes first
grouped_data = grouped_data.sort_values(by='count', ascending=False)

# Keep the ten routes with the highest totals
top_10_combinations = grouped_data.head(10)

# Create a horizontal bar chart
plt.figure(figsize=(12, 8))
plt.barh(top_10_combinations['Route'], top_10_combinations['count'], color='skyblue')

# Write the value in millions (two decimals) at the end of each bar; the bars
# sit at y positions 0..9 in the order of the table
for index, value in enumerate(top_10_combinations['count']):
    plt.text(value, index, f' {value * 1e-6:.2f}', va='center', fontsize=font_chart)

# Set axis labels and title
plt.xlabel('Number of Trips [million]', fontsize=font_label, fontweight='bold', labelpad=pad_label)
plt.ylabel('Routes (bidirectional, aggregated to municipalities)', fontsize=font_label, fontweight='bold', labelpad=pad_label)
plt.title('Top 10 Most Trafficked Routes', fontsize=font_title, fontweight='bold')
plt.yticks(fontsize=font_chart)
plt.xticks(fontsize=font_chart)

# Leave 8 % room on the right for the value labels
plt.xlim(0, max(top_10_combinations['count']) * 1.08)

# Tick labels via the shared `formatter` defined near the top of the notebook
plt.gca().xaxis.set_major_formatter(formatter)

# Invert the y-axis to show the most trafficked routes at the top
plt.gca().invert_yaxis()

plt.tight_layout()
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__top10MostTraffickedRoutesAbsolute.png', dpi=dpi)

plt.show()

In [None]:
# Give every row a direction-independent route label: the start and end cluster
# names in sorted order, joined by ' - ', so both directions share one label.
df['Route'] = df.apply(
    lambda row: ' - '.join(sorted((row['startClusterName'], row['endClusterName']))),
    axis=1,
)

# Total 'count' per route, ordered from the most to the least trafficked route
grouped_data = (
    df.groupby('Route')['count']
    .sum()
    .reset_index()
    .sort_values(by='count', ascending=False)
)

# Express each route's total as a percentage of the total over all routes
total_count = grouped_data['count'].sum()
grouped_data['Share'] = grouped_data['count'] / total_count * 100

# Only the 20 highest-ranked routes are plotted
grouped_data = grouped_data.head(20)

# Horizontal bar chart: one bar per route
plt.figure(figsize=(12, 8))
plt.barh(grouped_data['Route'], grouped_data['Share'], color='skyblue')

# Annotate every bar with its share; shares that would round to 0.00 are shown as "<0.01"
for index, value in enumerate(grouped_data['Share']):
    label = f" {value:.2f}"
    if label == " 0.00":
        label = " <0.01"
    plt.text(value, index, label, va='center', fontsize=font_chart)

# Axis labels and title
plt.xlabel('Share of Count [%]', fontsize=font_label, fontweight=weight_label, labelpad=pad_label)
plt.ylabel('Route (bidirectional, aggregated to municipalities)', fontsize=font_label, fontweight=weight_label, labelpad=pad_label)
plt.title('Ranked Share of Count across Routes', fontsize=font_title, fontweight=weight_title)

plt.xticks(fontsize=font_chart)
plt.yticks(fontsize=font_chart)

# Leave 8 % headroom to the right of the longest bar for its annotation
plt.xlim(0, max(grouped_data['Share']) * 1.08)

# Highest-ranked route at the top
plt.gca().invert_yaxis()

plt.tight_layout()

# Write the figure to disk as a PNG before displaying it
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__RankedShareOfCountAcrossRoutes.png', dpi=dpi)

plt.show()

In [None]:
# Distribution of the trip share over ALL routes, with routes identified by cluster ID.

# Cluster IDs are cast to strings so they can be sorted and joined into a text label
df['startClusterID'] = df['startClusterID'].astype(str)
df['endClusterID'] = df['endClusterID'].astype(str)

# Direction-independent route label: the two cluster IDs in sorted order joined by ' - '
df['Route'] = df.apply(lambda row: ' - '.join(sorted([row['startClusterID'], row['endClusterID']])), axis=1)

# Total 'count' per route
grouped_data = df.groupby('Route')['count'].sum().reset_index()

# Largest totals first, so the bars are drawn in descending order
grouped_data = grouped_data.sort_values(by='count', ascending=False)

# Total count over all routes
total_count = grouped_data['count'].sum()

# Share of each route in the total count, in percent
grouped_data['Share'] = grouped_data['count'] / total_count * 100

# Vertical bar chart: one bar per route
plt.figure(figsize=(12, 8))
plt.bar(grouped_data['Route'], grouped_data['Share'], color='skyblue')

# Axis labels and title
plt.xlabel('Route', fontsize=font_label, fontweight=weight_label, labelpad=pad_label)
plt.ylabel('Share of Total Count [%]', fontsize=font_label, fontweight=weight_label, labelpad=pad_label)
plt.title('Distribution of Count across Routes', fontsize=font_title, fontweight=weight_title)

# Every route gets its own tick, so the individual labels are switched off
# instead of being drawn at an unreadably small font size.
plt.tick_params(axis='x', labelbottom=False)
plt.yticks(fontsize=font_chart)

# Leave 8 % headroom above the tallest bar
plt.ylim(0, grouped_data['Share'].max() * 1.08)

# Apply the layout BEFORE saving so the PNG on disk matches the figure that is shown
plt.tight_layout()

# Save the plot as a PNG image
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__DistributionOfCountAcrossRoutes.png', dpi=dpi)

plt.show()

In [None]:
# Distribution of the absolute trip count over ALL routes on a logarithmic y-axis,
# with routes identified by cluster ID.

# Direction-independent route label: the two cluster IDs in sorted order joined by ' - '.
# The IDs are converted with str() here so this cell also works when the ID columns
# have not been cast to strings beforehand (str.join only accepts strings).
df['Route'] = df.apply(
    lambda row: ' - '.join(sorted([str(row['startClusterID']), str(row['endClusterID'])])),
    axis=1,
)

# Total 'count' per route
grouped_data = df.groupby('Route')['count'].sum().reset_index()

# Largest totals first, so the bars are drawn in descending order
grouped_data = grouped_data.sort_values(by='count', ascending=False)

# Vertical bar chart of the raw per-route totals (no percentage is computed in this cell)
plt.figure(figsize=(12, 8))
plt.bar(grouped_data['Route'], grouped_data['count'], color='skyblue')

# Axis labels and title
plt.xlabel('Route', fontsize=font_label, fontweight=weight_label, labelpad=pad_label)
plt.ylabel('Total Count of Trips', fontsize=font_label, fontweight=weight_label, labelpad=pad_label)
plt.title('Distribution of Count across Routes (log10)', fontsize=font_title, fontweight=weight_title)

# Every route gets its own tick, so the individual labels are switched off
# instead of being drawn at an unreadably small font size.
plt.tick_params(axis='x', labelbottom=False)
plt.yticks(fontsize=font_chart)

plt.yscale('log')

# Only the upper limit is fixed: a lower limit of 0 is not representable on a
# logarithmic axis, so the lower bound is left to autoscaling.
plt.ylim(top=5e6)

# Apply the layout BEFORE saving so the PNG on disk matches the figure that is shown
plt.tight_layout()

# Save the plot as a PNG image
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__DistributionOfCountAcrossRoutes_log.png', dpi=dpi)

plt.show()

In [None]:
# Display the current contents of grouped_data as the cell output
grouped_data

In [None]:
# Aggregate per ('StartName', 'EndName') pair: the summed 'count' plus the first
# start/end longitude and latitude encountered in each group. The result is
# ordered by summed count, highest first, and shown as the cell output.
grouped_data = (
    df.groupby(['StartName', 'EndName'])
    .agg(
        count=('count', 'sum'),
        StartLon=('StartLon', 'first'),
        StartLat=('StartLat', 'first'),
        EndLon=('EndLon', 'first'),
        EndLat=('EndLat', 'first'),
    )
    .reset_index()
    .sort_values(by='count', ascending=False)
)
grouped_data

Visualize the distribution of all routes, but only by count.

Do not count the outbound and return directions of a route separately.

Evaluate the effect of COVID-19 for statistical significance (p < 0.05).