In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
from fiona.crs import from_epsg

In [None]:
# Read the combined trip CSV into a DataFrame and show it.
csv = "combined_data_MP_NE_dT_cC_Coord.csv"
df_csv = pd.read_csv(filepath_or_buffer=csv)
df_csv

In [None]:
# Read the boundary shapefile into a GeoDataFrame and show it.
shapefile_path = 'hildesheim_merged.shp'
gdf_shape = gpd.read_file(filename=shapefile_path)
gdf_shape

In [None]:
# Create point geometries for the start and end coordinates of every row.
# gpd.points_from_xy builds all points in one vectorised call (x = longitude,
# y = latitude) instead of running a Python lambda once per row.
df_csv['StartPoint'] = gpd.points_from_xy(df_csv['startLon'], df_csv['startLat'])
df_csv['EndPoint'] = gpd.points_from_xy(df_csv['endLon'], df_csv['endLat'])

In [None]:
# Inspect the table now that the 'StartPoint' and 'EndPoint' columns have been added
df_csv

In [None]:
# Create separate GeoDataFrames over the same table: one whose active geometry
# is the start point and one whose active geometry is the end point.
# The CRS is given as an authority string; the init-style dict produced by
# fiona.crs.from_epsg is deprecated. Coordinates are lon/lat, i.e. EPSG:4326.
gdf_start = gpd.GeoDataFrame(df_csv, geometry='StartPoint', crs="EPSG:4326")
gdf_end = gpd.GeoDataFrame(df_csv, geometry='EndPoint', crs="EPSG:4326")

In [None]:
# Bring the boundary polygons into the same CRS as the point layers
# so that the spatial joins compare coordinates in one reference system.
gdf_shape = gdf_shape.to_crs(crs=gdf_start.crs)

In [None]:
# Spatially join each point layer with the shapefile. The inner join keeps only
# rows whose start point (first join) or end point (second join) intersects a
# shapefile geometry; the two results are combined in a later cell.
# `predicate=` replaces the deprecated `op=` keyword (requires geopandas >= 0.10).
gdf_start_inside = gpd.sjoin(gdf_start, gdf_shape, predicate='intersects', how='inner')
gdf_end_inside = gpd.sjoin(gdf_end, gdf_shape, predicate='intersects', how='inner')

In [None]:
# Inspect the rows whose start point intersects the shapefile
gdf_start_inside

In [None]:
# Inspect the rows whose end point intersects the shapefile
gdf_end_inside

In [None]:
# Keep only rows that appear in BOTH spatial-join results, i.e. trips whose
# start and end points each intersect the shapefile. The inner merge is keyed
# on the original trip columns.
trip_key_columns = [
    'year', 'month',
    'startClusterName', 'startClusterZip', 'startClusterID', 'startID',
    'endClusterName', 'endClusterZip', 'endClusterID', 'endID',
    'weekday', 'daytime', 'isSchoolHoliday',
    'distance', 'count', 'count_corrected',
    'startLon', 'startLat', 'endLon', 'endLat',
]
df_result = gdf_start_inside.merge(gdf_end_inside, how='inner', on=trip_key_columns)
df_result

In [None]:
# Remove the duplicated start-point geometries and the helper columns carried
# over from the two spatial joins (each suffixed _x / _y by the merge).
# 'id' is presumably an attribute of the shapefile -- TODO confirm.
join_leftover_columns = ['StartPoint_x', 'StartPoint_y', 'index_right_x', 'id_x', 'index_right_y', 'id_y']
df_result = df_result.drop(columns=join_leftover_columns)
df_result

In [None]:
# Remove the duplicated end-point geometries as well, leaving only the trip columns.
endpoint_columns = ['EndPoint_x', 'EndPoint_y']
df_result = df_result.drop(columns=endpoint_columns)
df_result

In [None]:
# Write the spatially filtered trips to disk without the DataFrame index.
result_csv_path = 'combined_data_MP_NE_hildesheim_merged_Coord' + '.csv'
df_result.to_csv(result_csv_path, index=False)

# New attempt: filter trips by cluster ID

In [None]:
# Re-read the boundary shapefile so this section starts from the unprojected original.
shapefile_path = 'hildesheim_merged.shp'
gdf_shape = gpd.read_file(filename=shapefile_path)
gdf_shape

In [None]:
# Read the shapes CSV (its 'lon' / 'lat' columns are used in the next cell) and show it.
csv_shapes = "desd-4-landkreis-hildesheim-1663168323029-shapes.csv"
df_shapes = pd.read_csv(filepath_or_buffer=csv_shapes)
df_shapes

In [None]:
# Create one point geometry per row from the 'lon' / 'lat' columns.
# gpd.points_from_xy builds all points in one vectorised call (x = longitude,
# y = latitude) instead of running a Python lambda once per row.
df_shapes['StartPoint'] = gpd.points_from_xy(df_shapes['lon'], df_shapes['lat'])
df_shapes

In [None]:
# Wrap the shapes table in a GeoDataFrame with 'StartPoint' as active geometry.
# The CRS is given as an authority string; the init-style dict produced by
# fiona.crs.from_epsg is deprecated. Coordinates are lon/lat, i.e. EPSG:4326.
gdf_shapes = gpd.GeoDataFrame(df_shapes, geometry='StartPoint', crs="EPSG:4326")
gdf_shapes

In [None]:
# Bring the boundary polygons into the CRS of the shape points
# so that the spatial join compares coordinates in one reference system.
gdf_shape = gdf_shape.to_crs(crs=gdf_shapes.crs)
gdf_shape

In [None]:
# Keep only the shape points that intersect a shapefile geometry (inner spatial join).
# `predicate=` replaces the deprecated `op=` keyword (requires geopandas >= 0.10).
gdf_shapes_inside = gpd.sjoin(gdf_shapes, gdf_shape, predicate='intersects', how='inner')
gdf_shapes_inside

In [None]:
# Reduce every FID to the part before its first underscore, stored as a string.
fid_as_text = gdf_shapes_inside['FID'].astype(str)
gdf_shapes_inside['FID'] = fid_as_text.str.split('_').str[0]
gdf_shapes_inside

In [None]:
# Distinct FID prefixes of all points inside the shapefile, in order of first appearance.
unique_names = pd.unique(gdf_shapes_inside['FID'])
unique_names

In [None]:
# One representative row per FID: keep the first occurrence, discard later repeats.
gdf_unique_names = gdf_shapes_inside.drop_duplicates(subset='FID', keep='first')
gdf_unique_names

In [None]:
# Reload the trip table and cast both cluster-ID columns to strings so they can
# be compared against the (string) FID prefixes collected above.
csv = "combined_data_MP_NE_dT_cC_Coord.csv"
df_csv = pd.read_csv(csv).astype({'startClusterID': str, 'endClusterID': str})
df_csv

In [None]:
# Keep a trip only when BOTH its start and its end cluster ID are among the
# FID prefixes found inside the shapefile.
condition1 = df_csv['startClusterID'].isin(unique_names)
condition2 = df_csv['endClusterID'].isin(unique_names)
df_filtered = df_csv.loc[condition1 & condition2]
df_filtered

In [None]:
# Sorted set of every cluster ID that occurs as a start or as an end in the filtered trips.
start_cluster_ids = pd.unique(df_filtered['startClusterID'])
end_cluster_ids = pd.unique(df_filtered['endClusterID'])
union_ids = np.union1d(start_cluster_ids, end_cluster_ids)
union_ids

In [None]:
# Save the filtered trips next to the input file: the input name up to its
# first '.' plus the '_hildesheimMerged.csv' suffix.
filtered_csv_path = csv.split('.')[0] + '_hildesheimMerged.csv'
df_filtered.to_csv(filtered_csv_path, index=False)

# group and sum

In [None]:
# Aggregate per start cluster (ID + name): total 'count' over all its trips,
# plus the first longitude/latitude seen for that cluster.
# The result is ordered from the busiest start cluster to the quietest.
grouped_data = (
    df_filtered
    .groupby(['startClusterID', 'startClusterName'], as_index=False)
    .agg({'count': 'sum', 'startLon': 'first', 'startLat': 'first'})
    .sort_values(by='count', ascending=False)
)
grouped_data

In [None]:
# Save the per-start-cluster totals, named after the input file (up to its first '.').
grouped_csv_path = csv.split(".")[0] + "_hildesheimMerged_sumCountPerStartClusterID.csv"
grouped_data.to_csv(grouped_csv_path, index=False)

# analysis

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

# Start from matplotlib's built-in defaults, then switch the font to Arial.
mpl.rcParams.update(mpl.rcParamsDefault)
mpl.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial'],
})

# Shared styling constants used by the plotting cells below.
dpi = 500                                   # resolution of saved figures
font_title, font_label, font_chart = 16, 15, 13
weight_label = weight_title = 'bold'
pad_label = 10                              # gap between axis and axis label
alpha = 0.7                                 # opacity of the marker line

In [None]:
# Total number of trips over all start clusters (not used by the plot itself)
total_count = grouped_data['count'].sum()

# grouped_data is sorted by 'count' in descending order, so the bottom 30 % of
# clusters occupy the last 30 % of bar positions. Derive that boundary from the
# data instead of hard-coding it (integer arithmetic avoids float rounding).
n_clusters = len(grouped_data)
bottom_30_start = n_clusters * 7 // 10

# Create a vertical bar chart with one bar per start cluster
plt.figure(figsize=(10, 7))
plt.bar(grouped_data['startClusterID'], grouped_data['count'], color='skyblue')

# Add a vertical line where the bottom 30 % of clusters begins
plt.axvline(x=bottom_30_start, color='red', linestyle='--', label='30th Percentile', alpha=alpha)

# Shade the region covered by the bottom 30 % of clusters
plt.axvspan(xmin=bottom_30_start, xmax=n_clusters, color='red', alpha=0.05, label='Bottom 30%')

# Set axis labels and title
plt.xlabel('Cluster (sorted in descending order) ', fontsize=font_label, fontweight=weight_label, labelpad=pad_label)
plt.ylabel('Number of Trips', fontsize=font_label, fontweight=weight_label, labelpad=pad_label)
plt.title('Distribution of the Number of Trips per Cluster (log10 scaled)', fontsize=font_title, fontweight=weight_title, va='bottom')

# Tick labels on the x axis are shrunk to near-invisible: there is one per cluster
plt.xticks(rotation=45, ha='right', fontsize=1)
plt.yticks(fontsize=font_chart)
plt.gca().get_yaxis().get_offset_text().set_visible(False)

plt.yscale('log')

plt.ylim(1, 1e7)
plt.legend(fontsize=font_chart)

# Apply the layout BEFORE saving so the PNG matches the figure that is displayed
plt.tight_layout()

# Save the plot as a PNG image
# NOTE(review): absolute, user-specific output directory -- consider a configurable path
plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__DistributionOfCountAcrossCluster_log.png', dpi=dpi)

plt.show()

In [None]:
# Share of all trips that starts in each cluster (the shares sum to 1)
normalized_counts = grouped_data['count'] / grouped_data['count'].sum()

# grouped_data is sorted by 'count' in descending order, so the bottom 30 % of
# clusters begins at 70 % of the bar positions. Derive the position from the
# data instead of hard-coding it (integer arithmetic avoids float rounding).
bottom_30_start = len(grouped_data) * 7 // 10

# Create a vertical bar chart with one bar per start cluster
plt.figure(figsize=(10, 7))
plt.bar(grouped_data['startClusterID'], normalized_counts, color='skyblue', edgecolor='black')

# Add a vertical line where the bottom 30 % of clusters begins
plt.axvline(x=bottom_30_start, color='red', linestyle='--', label='30th Percentile')

# Set axis labels and title
plt.xlabel('Cluster (sorted in descending order)', fontsize=font_label, fontweight=weight_label, labelpad=pad_label)
plt.ylabel('Normalized Count', fontsize=font_label, fontweight=weight_label, labelpad=pad_label)
plt.title('Distribution of Normalized Cluster Frequencies', fontsize=font_title, fontweight=weight_title)

# Tick labels on the x axis are shrunk to near-invisible: there is one per cluster
plt.xticks(rotation=45, ha='right', fontsize=1)
plt.yticks(fontsize=font_chart)
plt.gca().get_yaxis().get_offset_text().set_visible(False)

plt.legend(fontsize=font_chart)

plt.tight_layout()

# Saving is currently disabled; if re-enabled, keep it after tight_layout()
#plt.savefig('/Users/timon/Documents/ba/abbildungen/' + csv.split('.')[0] + '__DistributionOfNormalizedClusterFrequencies.png', dpi=dpi)

plt.show()

In [None]:
# 1st percentile of the per-cluster trip shares, scaled by 100 to read as a percentage
np.percentile(normalized_counts, 1) * 100

In [None]:
# 1st percentile of the summed 'count' per start cluster
np.percentile(grouped_data['count'], 1)

In [None]:
# Threshold below which the lowest 30 % of per-cluster trip shares fall
# (original author's note: "top 1: 99.242424")
percentile_30 = np.percentile(normalized_counts, 30)

# Combined share, in percent and rounded to 4 decimals, of all clusters at or below that threshold
at_or_below_threshold = normalized_counts <= percentile_30
sum_bottom_30_percentage = round(normalized_counts[at_or_below_threshold].sum() * 100, 4)
sum_bottom_30_percentage

In [None]:
# Summary statistics of the summed 'count' per start cluster
grouped_data['count'].describe()