In [402]:
import pandas as pd
from pathlib import Path
from matplotlib import pyplot as plt
import numpy as np

In [403]:
# Input/output locations, given relative to the notebook's working directory.
SOURCE_DIR = Path('../files/output/csv_output')
SOURCE_CSV = SOURCE_DIR / 'all_direct_routes_2_run.csv'  # raw routes file loaded in step 1
TARGET_DIR = SOURCE_DIR / 'treat2run'  # folder that receives the per-origin CSV files

In [404]:
# 1. Load the raw 2_run routes CSV (no column is used as the index, so rows
#    get pandas' default integer index)
df = pd.read_csv(SOURCE_CSV)
df

Unnamed: 0,from_id,to_id,transport_id,price_min_EUR,duration_min
0,309,19,1,295,400
1,134,19,1,304,410
2,280,331,2,17,415
3,331,19,1,314,585
4,280,207,2,23,330
...,...,...,...,...,...
856749,348,117,2,69,1370
856750,212,297,2,46,1070
856751,297,117,2,175,1530
856752,364,123,3,16,685


In [416]:
# Transport type distribution for the rows whose from_id is 225
df.loc[df['from_id'] == 225, 'transport_id'].value_counts()

1     272
2      50
3      23
8      18
10      6
Name: transport_id, dtype: int64

In [405]:
# 2. Drop rows that are exact duplicates across every column, then renumber
#    the index from 0
df = df.drop_duplicates(ignore_index=True)
df

Unnamed: 0,from_id,to_id,transport_id,price_min_EUR,duration_min
0,309,19,1,295,400
1,134,19,1,304,410
2,280,331,2,17,415
3,331,19,1,314,585
4,280,207,2,23,330
...,...,...,...,...,...
107576,13,187,1,290,810
107577,118,171,1,51,875
107578,163,156,1,130,695
107579,118,78,1,157,390


In [406]:
# 3. Removing transport_id == 1 routes that start or end at id 19 (Kabul).
#    NOTE(review): only transport_id == 1 rows are matched (presumably flights,
#    going by the "fly" in the mask names); rows of other transport types that
#    touch id 19 are kept - confirm this is the intended scope.
is_transport_1 = df['transport_id'] == 1
filter_del_fly_from_19 = is_transport_1 & (df['from_id'] == 19)
filter_del_fly_to_19 = is_transport_1 & (df['to_id'] == 19)
df_filter = df[filter_del_fly_from_19 | filter_del_fly_to_19]
# The surviving rows keep their existing index labels (no renumbering here)
df = df.drop(index=df_filter.index)
df

Unnamed: 0,from_id,to_id,transport_id,price_min_EUR,duration_min
2,280,331,2,17,415
4,280,207,2,23,330
6,107,297,1,46,180
7,297,216,2,5,100
8,107,108,3,16,78
...,...,...,...,...,...
107576,13,187,1,290,810
107577,118,171,1,51,875
107578,163,156,1,130,695
107579,118,78,1,157,390


In [407]:
# 3a. How many rows of the current frame belong to each transport_id
df.loc[:, 'transport_id'].value_counts()

1     87026
2     11678
3      6752
8      1114
10      378
Name: transport_id, dtype: int64

In [408]:
# 4. Order rows by (from_id, to_id, transport_id) and, within each such triple,
#    by ascending price; the index is renumbered from 0 afterwards
sort_keys = ['from_id', 'to_id', 'transport_id', 'price_min_EUR']
df = df.sort_values(by=sort_keys).reset_index(drop=True)
df

Unnamed: 0,from_id,to_id,transport_id,price_min_EUR,duration_min
0,8,9,1,314,340
1,8,10,1,184,690
2,8,11,1,231,545
3,8,13,1,120,150
4,8,15,1,314,430
...,...,...,...,...,...
106943,663,552,1,418,990
106944,663,552,1,677,1185
106945,663,553,1,564,1570
106946,663,626,1,508,1475


In [409]:
# 5. Keep a single row per ('from_id', 'to_id', 'transport_id') triple.
#    The first occurrence is the one retained, so as long as the frame is still
#    in the price-ascending order produced in step 4, that is the lowest-priced
#    row of each triple.
route_key = ['from_id', 'to_id', 'transport_id']
df = df.drop_duplicates(subset=route_key, keep='first', ignore_index=True)
df

Unnamed: 0,from_id,to_id,transport_id,price_min_EUR,duration_min
0,8,9,1,314,340
1,8,10,1,184,690
2,8,11,1,231,545
3,8,13,1,120,150
4,8,15,1,314,430
...,...,...,...,...,...
76626,663,547,1,507,1815
76627,663,550,1,497,1760
76628,663,552,1,418,990
76629,663,553,1,564,1570


In [410]:
# 6. Create from_id.csv files: one CSV per distinct from_id inside TARGET_DIR.
#    Each file is indexed by a synthetic path_id = from_id * PATH_ID_STRIDE + k,
#    where k = 1..n numbers that origin's rows in their current order.
PATH_ID_STRIDE = 10_000
TARGET_DIR.mkdir(parents=True, exist_ok=True)
# A single groupby pass replaces re-filtering the whole frame once per origin;
# sort=False keeps the groups in order of first appearance, and rows keep their
# relative order inside each group.
for from_id, routes in df.groupby('from_id', sort=False):
    n_routes = len(routes)
    # More than PATH_ID_STRIDE rows would push path_ids into the id block of
    # from_id + 1, so fail loudly instead of writing colliding ids.
    if n_routes > PATH_ID_STRIDE:
        raise ValueError(
            f'from_id {from_id} has {n_routes} rows; at most {PATH_ID_STRIDE} '
            'fit into one path_id block'
        )
    # np.arange makes the arithmetic explicit; the previous `scalar + range(...)`
    # form only worked because from_id happened to be a NumPy scalar.
    path_ids = from_id * PATH_ID_STRIDE + np.arange(1, n_routes + 1)
    # set_axis returns a relabelled frame, leaving the group (and df) untouched.
    routes.set_axis(path_ids, axis=0).to_csv(
        TARGET_DIR / f'{from_id}.csv', index_label='path_id'
    )

In [419]:
# Transport type distribution for a specific origin (same breakdown as 3a,
# restricted to the rows of one from_id)
from_id = 136
df.loc[df['from_id'] == from_id, 'transport_id'].value_counts()

1     95
2     39
3     20
8      3
10     1
Name: transport_id, dtype: int64