In [188]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, time
from itertools import product

# Function for Question 1
def calculate_distance_matrix(df):
    unique_ids = sorted(set(df['id_start'].unique()) | set(df['id_end'].unique()))
    distance_matrix = pd.DataFrame(index=unique_ids, columns=unique_ids)
    distance_matrix = distance_matrix.fillna(0)

    for index, row in df.iterrows():
        start_id, end_id, distance = row['id_start'], row['id_end'], row['distance']
        distance_matrix.at[start_id, end_id] += distance
        distance_matrix.at[end_id, start_id] += distance 

    for i in unique_ids:
        for j in unique_ids:
            for k in unique_ids:
                if distance_matrix.at[i, k] > 0 and distance_matrix.at[k, j] > 0:
                    if distance_matrix.at[i, j] == 0 or distance_matrix.at[i, j] > distance_matrix.at[i, k] + distance_matrix.at[k, j]:
                        distance_matrix.at[i, j] = distance_matrix.at[i, k] + distance_matrix.at[k, j]

    for i in unique_ids:
        distance_matrix.at[i, i] = 0
    return distance_matrix

# Function for Question 2
def unroll_distance_matrix(distance_matrix):
    distance_matrix = calculate_distance_matrix(df)

    dfs = []
    for i in distance_matrix.index:
        for j in distance_matrix.columns:
            if i != j:
                dfs.append(pd.DataFrame({'id_start': [i], 'id_end': [j], 'distance': [distance_matrix.at[i, j]]}))

    unrolled_df = pd.concat(dfs, ignore_index=True)
    
    return unrolled_df

# Function for Question 3
def find_ids_within_ten_percentage_threshold(unrolled_df, reference_id):
    reference_rows = df[df['id_start'] == reference_id]

    # Calculate the average distance for the reference_id
    reference_avg_distance = reference_rows['distance'].mean()

    # Calculate the threshold as 10% of the average distance
    threshold = 0.1 * reference_avg_distance

    # Filter rows where id_start's average distance is within the threshold
    result_df = df.groupby('id_start')['distance'].mean().reset_index()
    result_df = result_df[(result_df['distance'] >= (reference_avg_distance - threshold)) &
                          (result_df['distance'] <= (reference_avg_distance + threshold))]

    # Sort the result DataFrame by id_start
    result_df = result_df.sort_values(by='id_start')

    return result_df

# Function for Question 4
def calculate_toll_rate(unrolled_df):
    rate_coefficients = {'moto': 0.8, 'car': 1.2, 'rv': 1.5, 'bus': 2.2, 'truck': 3.6}

    for vehicle_type, rate_coefficient in rate_coefficients.items():
        unrolled_df[vehicle_type] = unrolled_df['distance'] * rate_coefficient

    unrolled_df = unrolled_df.drop(columns=['distance'])
    
    return unrolled_df
# Function for Question 5
def calculate_time_based_toll_rates(within_threshold_df):
    def calculate_discount_factor(row, vehicle):
        if row['start_time'] <= time(10, 0, 0) or (row['start_day'] in ['Saturday', 'Sunday'] and row['end_day'] in ['Saturday', 'Sunday']):
            return 0.7
        elif time(10, 0, 0) < row['start_time'] <= time(18, 0, 0):
            return 1.2
        else:
            return 0.8

    time_intervals = {
        'start_day': [],
        'start_time': [],
        'end_day': [],
        'end_time': [],
    }
    for day in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']:
        for start_time, end_time in [(time(0, 0, 0), time(10, 0, 0)), (time(10, 0, 0), time(18, 0, 0)), (time(18, 0, 0), time(23, 59, 59))]:
            time_intervals['start_day'].append(day)
            time_intervals['start_time'].append(start_time)
            time_intervals['end_day'].append(day)
            time_intervals['end_time'].append(end_time)

    time_df = pd.DataFrame(time_intervals)

    merged_df = pd.merge(df, time_df, how='cross')

    vehicle_columns = ['moto', 'car', 'rv', 'bus', 'truck']

    merged_df[vehicle_columns] = df[vehicle_columns].values[:, None]

    for vehicle in vehicle_columns:
        merged_df[vehicle] *= merged_df.apply(calculate_discount_factor, axis=1, vehicle=vehicle)

    return merged_df




In [2]:
df=pd.read_csv('dataset-3.csv')
print(df)

    id_start   id_end  distance
0    1001400  1001402       9.7
1    1001402  1001404      20.2
2    1001404  1001406      16.0
3    1001406  1001408      21.7
4    1001408  1001410      11.1
5    1001410  1001412      15.6
6    1001412  1001414      18.2
7    1001414  1001416      13.2
8    1001416  1001418      13.6
9    1001418  1001420      12.9
10   1001420  1001422       9.6
11   1001422  1001424      11.4
12   1001424  1001426      18.6
13   1001426  1001428      15.8
14   1001428  1001430       8.6
15   1001430  1001432       9.0
16   1001432  1001434       7.9
17   1001434  1001436       4.0
18   1001436  1001438       9.0
19   1001436  1001437       5.0
20   1001438  1001437       4.0
21   1001438  1001440      10.0
22   1001440  1001442       3.9
23   1001442  1001488       4.5
24   1001488  1004356       4.0
25   1004356  1004354       2.0
26   1004354  1004355       2.0
27   1004355  1001444       0.7
28   1001444  1001446       6.6
29   1001446  1001448       9.6
30   100

In [192]:

result_matrix1= calculate_distance_matrix(df)
result_matrix2= unroll_distance_matrix(result_matrix1)
reference_id = result_matrix2['id_start'].iloc[0]
result_matrix3= find_ids_within_ten_percentage_threshold(result_matrix2, reference_id)
result_matrix4= calculate_toll_rate(result_matrix2)
result_matrix5 = calculate_time_based_toll_rates(result_matrix3)

print(result_matrix5)
#print(result_matrix3)


KeyError: "None of [Index(['moto', 'car', 'rv', 'bus', 'truck'], dtype='object')] are in the [columns]"