In [None]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import joblib
import os

In [None]:
# Load the trained classifier and the label encoder that maps its predictions
# back to class labels (used via `inverse_transform` further down).
# NOTE(review): joblib.load unpickles arbitrary objects -- only load trusted files.
ARTIFACTS_DIR = 'D:\\personal_repositories\\travel_mode_detection\\artifacts'
model = joblib.load(f'{ARTIFACTS_DIR}\\decision_tree_trained_model.pkl')
class_encoder = joblib.load(f'{ARTIFACTS_DIR}\\label_encoder.joblib')

In [None]:
# Read the boundary shapefile and reproject it to EPSG:4326, the CRS used for
# the trip coordinates when they are spatially joined against these polygons.
# The path is a raw string: sequences such as `\P` or `\S` are invalid escapes
# in a normal literal and raise SyntaxWarning on recent Python versions.
shapefile_path = r'U:\Projects\Huq\Faraz\huq_city_data\Shapefiles\msoa_intzone_boundaries\glasgow\msoa_glasgow.shp'
gdf = gpd.read_file(shapefile_path).to_crs(epsg=4326)

In [None]:
# Year being processed; it selects the per-year input and output folders below.
year = 2021

# Columns kept from the processed trip-points file, in this order.
trip_point_columns = [
    'year', 'distance_threshold', 'uid', 'imd_quintile', 'trip_id',
    'total_active_days', 'lat', 'lng', 'org_lat', 'org_lng',
    'dest_lat', 'dest_lng', 'datetime', 'num_of_impressions', 'time_taken',
    'prev_lat', 'prev_long', 'distance_covered', 'speed', 'date', 'hour',
    'speed_z_score', 'new_speed', 'accelaration', 'jerk', 'bearing',
    'angular_deviation', 'month', 'is_weekend', 'hour_category',
    'start_end_at_bus_stop', 'start_end_at_train_stop',
    'start_end_at_metro_stop', 'found_at_green_space',
    'straightness_index',
]

# Raw f-string so the Windows separators need no escaping (the previous literal
# mixed escaped `\\` with invalid escapes such as `\P` and `\G`).
data_path = rf'U:\Projects\Huq\Faraz\travel_mode_detection\Glasgow\{year}\processed_trip_points_data.csv'
processed_data = pd.read_csv(data_path)[trip_point_columns]

In [None]:

# Feature columns passed to the classifier. The order is kept exactly as
# written because the model receives `data[attributes]` positionally.
attributes = ['month',
          'speed_median','speed_pct_95','speed_std',
          'acceleration_median','acceleration_pct_95','acceleration_std',
          'jerk_median','jerk_pct_95','jerk_std',
          'angular_dev_median','angular_dev_pct_95','angular_dev_std',
          'straightness_index','distance_covered','start_end_at_bus_stop','start_end_at_train_stop','start_end_at_metro_stop','found_at_green_space','is_weekend','hour_category']

# Raw f-string: avoids the invalid escape sequences (`\P`, `\H`, ...) that the
# previous non-raw literal contained.
data_path = rf'U:\Projects\Huq\Faraz\travel_mode_detection\Glasgow\{year}\huq_stats_df_for_ml.csv'
data = pd.read_csv(data_path, parse_dates=['datetime'])
data['month'] = data['datetime'].dt.month
# Some night trips cross a month boundary, so every row of a trip is given the
# trip's modal month.
data['month'] = data.groupby(['uid', 'trip_id'])['month'].transform(lambda x: x.mode()[0])
# Collapse repeated feature rows. The trip keys are part of the subset so that
# two *different* trips with identical feature values are both kept; deduplicating
# on the features alone dropped one of them and left it without a prediction.
data = data.drop_duplicates(subset=['uid', 'trip_id'] + attributes)



In [None]:
def _attach_geo_code(points_df, zones, lng_col, lat_col, out_col):
    """Add the `geo_code` of the zone containing each (lng, lat) point.

    Parameters
    ----------
    points_df : DataFrame holding the coordinate columns.
    zones : GeoDataFrame with `geo_code` and `geometry` columns in EPSG:4326.
    lng_col, lat_col : names of the longitude / latitude columns to locate.
    out_col : name given to the joined `geo_code` column.

    Returns
    -------
    `points_df` with `out_col` added; the temporary point geometry and the
    `index_right` column produced by the spatial join are removed again.
    Points that fall in no zone get a missing value (left join).
    """
    located = gpd.GeoDataFrame(
        points_df,
        geometry=gpd.points_from_xy(points_df[lng_col], points_df[lat_col]),
        crs="EPSG:4326",
    )
    # NOTE(review): with `intersects`, a point lying exactly on a shared boundary
    # may match more than one zone and be duplicated -- confirm this is acceptable.
    located = located.sjoin(zones[['geo_code', 'geometry']], how='left', predicate='intersects')
    return located.rename(columns={'geo_code': out_col}).drop(columns=['geometry', 'index_right'])


# Predict a travel mode for every feature row and decode it to its class label.
pred = class_encoder.inverse_transform(model.predict(data[attributes]))
data['travel_mode'] = pred

# One predicted mode per trip: guards the merge below against multiplying trip
# points should `data` hold more than one row for a (uid, trip_id) pair.
trip_modes = data[['uid', 'trip_id', 'travel_mode']].drop_duplicates(subset=['uid', 'trip_id'])

# Dropping any earlier `travel_mode` keeps this cell re-runnable; otherwise a
# second run would produce `travel_mode_x` / `travel_mode_y` columns.
processed_data = (
    processed_data
    .drop(columns=['travel_mode'], errors='ignore')
    .merge(trip_modes, on=['uid', 'trip_id'], how='left')
)
op_df = processed_data[['uid', 'trip_id','org_lat', 'org_lng', 'dest_lat', 'dest_lng', 'lat', 'lng', 'datetime', 'travel_mode']]

# Add origin and destination geo codes.
op_df = _attach_geo_code(op_df, gdf, 'org_lng', 'org_lat', 'org_geo_code')
op_df = _attach_geo_code(op_df, gdf, 'dest_lng', 'dest_lat', 'dest_geo_code')

# Replace (uid, trip_id) with a sequential trip number, numbered from 1 in order
# of first appearance. Factorizing a MultiIndex avoids a slow row-wise apply.
op_df['trip_num'] = pd.MultiIndex.from_frame(op_df[['uid', 'trip_id']]).factorize()[0] + 1

op_df = (
    op_df[['trip_num', 'org_geo_code', 'dest_geo_code','lat', 'lng', 'datetime', 'travel_mode']]
    .rename(columns={'trip_num': 'trip_id', 'lat': 'tp_lat', 'lng': 'tp_lng'})
    # Trip points whose trip has no prediction are excluded from the output.
    .dropna(subset=['travel_mode'])
)
assert op_df['travel_mode'].isna().sum() == 0
op_df

In [None]:
# Write the per-point predictions to this year's validation folder, creating the
# folder if needed. Raw f-string: the previous literal relied on invalid escape
# sequences (`\P`, `\H`, ...) being passed through unchanged.
op_path = rf'U:\Projects\Huq\Faraz\travel_mode_detection\Glasgow\{year}\validation'
os.makedirs(op_path, exist_ok=True)
op_df.to_csv(f'{op_path}\\predicted_travel_modes.csv', index=False)

In [None]:
# One row per trip, so each trip counts once in the share calculation.
tdf = op_df.drop_duplicates(subset=['trip_id'])
# Percentage of trips assigned to each travel mode.
mode_share = tdf['travel_mode'].value_counts(normalize=True)
mode_share * 100

In [None]:
# Load the saved predictions of every study year (2019-2023).
# NOTE: as before, this cell leaves `year` and `op_path` pointing at the last
# year loaded (2023); re-run the year-selection cell before saving new output.
yearly_predictions = []
for year in range(2019, 2024):
    # Raw f-string: avoids the invalid escape sequences of the previous literal.
    op_path = rf'U:\Projects\Huq\Faraz\travel_mode_detection\Glasgow\{year}\validation'
    yearly_predictions.append(pd.read_csv(f'{op_path}\\predicted_travel_modes.csv'))

# Keep the per-year names that the following cells rely on.
tmd_df1, tmd_df2, tmd_df3, tmd_df4, tmd_df5 = yearly_predictions



In [None]:
# Prepare the per-year mode-share table: tag each year's predictions with its
# year and keep a single row per trip, so that trips (not trip points) are
# counted when the travel-mode percentages are computed.
tmd_df1, tmd_df2, tmd_df3, tmd_df4, tmd_df5 = (
    frame.assign(year=yr).drop_duplicates(subset=['trip_id'])
    for yr, frame in zip(
        (2019, 2020, 2021, 2022, 2023),
        (tmd_df1, tmd_df2, tmd_df3, tmd_df4, tmd_df5),
    )
)

tmd_df1

In [None]:
def _mode_share_pct(trips):
    """Return one row per year with the percentage of trips in each travel mode."""
    counts = trips.groupby(['year', 'travel_mode']).size().unstack().fillna(0)
    # Normalise each year's row so that its modes sum to 1, then scale to percent.
    shares = counts.apply(lambda x: x/x.sum(), axis=1)
    return shares * 100


tmd_df1, tmd_df2, tmd_df3, tmd_df4, tmd_df5 = map(
    _mode_share_pct, (tmd_df1, tmd_df2, tmd_df3, tmd_df4, tmd_df5)
)

In [None]:
# Stack the per-year share rows into one table (index: year, columns: travel mode).
# A mode that is absent in some year only becomes NaN here, after concatenation,
# so the zero-fill has to happen at this point: that mode's share is 0 %.
tmd_df = pd.concat([tmd_df1, tmd_df2, tmd_df3, tmd_df4, tmd_df5], axis=0).fillna(0)
# Round the percentages to two decimal places.
tmd_df = tmd_df.round(2)
tmd_df