In [100]:
import pandas as pd
from pathlib import Path

In [101]:
# Input / output locations, relative to the notebook's working directory.
SOURCE_DIR = Path('../files/output/csv_output/runs_results/')  # raw run results
TARGET_DIR = Path('../files/output/csv_output/treat_runs')     # treated outputs
RAW_CSV = SOURCE_DIR.joinpath('all_direct_routes_3_run_raw.csv')

In [102]:
# 1. Load the raw routes CSV; no column is used as the index, so rows get a
#    default 0..n-1 RangeIndex.
df_raw = pd.read_csv(RAW_CSV)
df_raw

Unnamed: 0,from_id,to_id,transport_id,price_min_EUR,duration_min
0,280,331,2,16,415
1,280,207,2,23,330
2,107,297,1,46,180
3,297,216,2,5,100
4,107,108,3,16,78
...,...,...,...,...,...
983736,348,117,2,69,1370
983737,212,297,2,46,1070
983738,297,117,2,174,1530
983739,364,123,3,16,685


In [103]:
# Count exact repeats: True marks a row identical (in every column) to an
# earlier row, False marks the first occurrence of each distinct row.
dupls = df_raw.duplicated(keep='first')
dupls.value_counts()

True     845185
False    138556
dtype: int64

In [104]:
# 2. Keep only the first occurrence of each fully identical row and renumber
#    the surviving rows 0..n-1.
df_val = df_raw[~df_raw.duplicated()].reset_index(drop=True)
df_val

Unnamed: 0,from_id,to_id,transport_id,price_min_EUR,duration_min
0,280,331,2,16,415
1,280,207,2,23,330
2,107,297,1,46,180
3,297,216,2,5,100
4,107,108,3,16,78
...,...,...,...,...,...
138551,13,187,1,290,810
138552,118,171,1,51,875
138553,163,156,1,130,695
138554,118,78,1,156,390


In [105]:
# Persist the de-duplicated routes. This is the first write into TARGET_DIR,
# so make sure the directory exists; otherwise a fresh run fails here.
TARGET_DIR.mkdir(parents=True, exist_ok=True)
df_val.to_csv(TARGET_DIR / 'all_direct_routes_3_run.csv', index=False)

In [106]:
# 3. Order rows by route key (from_id, to_id, transport_id) and, within each
#    key, by ascending price, so the cheapest offer of every route comes first.
df = (
    df_val
    .sort_values(['from_id', 'to_id', 'transport_id', 'price_min_EUR'])
    .reset_index(drop=True)
)
df

Unnamed: 0,from_id,to_id,transport_id,price_min_EUR,duration_min
0,8,9,1,311,340
1,8,10,1,183,690
2,8,11,1,229,545
3,8,12,1,199,1435
4,8,13,1,119,150
...,...,...,...,...,...
138551,663,552,1,674,1185
138552,663,553,1,562,1570
138553,663,554,1,548,1660
138554,663,626,1,506,1475


In [107]:
# 4. Keep a single row per ('from_id', 'to_id', 'transport_id') triple: the
#    first one in the current row order. Rows are renumbered 0..n-1.
df = df.drop_duplicates(
    subset=['from_id', 'to_id', 'transport_id'],
    keep='first',
    ignore_index=True,
)
df

Unnamed: 0,from_id,to_id,transport_id,price_min_EUR,duration_min
0,8,9,1,311,340
1,8,10,1,183,690
2,8,11,1,229,545
3,8,12,1,199,1435
4,8,13,1,119,150
...,...,...,...,...,...
82019,663,550,1,494,1760
82020,663,552,1,416,990
82021,663,553,1,562,1570
82022,663,554,1,548,1660


In [108]:
# Sanity check: after step 4 no route triple may occur more than once, so the
# counts below should contain only False.
df.duplicated(
    subset=['from_id', 'to_id', 'transport_id'],
    keep='first',
).value_counts()

False    82024
dtype: int64

In [109]:
# 5. Build the path_id index and write the combined result file.
#    path_id = from_id * PATH_ID_STRIDE + k, where k = 1, 2, ... numbers the
#    rows of one from_id in their current order.
#    (Per-from_id CSV files are not written; that export was already disabled.)
PATH_ID_STRIDE = 10_000

TARGET_DIR.mkdir(parents=True, exist_ok=True)

# 1-based position of each row within its from_id group; row order is kept.
rank_in_origin = df.groupby('from_id', sort=False).cumcount() + 1

# A rank >= PATH_ID_STRIDE would leave the id range belonging to its from_id
# and yield ambiguous or duplicate path_ids, so fail loudly instead of
# silently writing a corrupt index.
if (rank_in_origin >= PATH_ID_STRIDE).any():
    raise ValueError(
        f'A from_id has {int(rank_in_origin.max())} routes; the path_id '
        f'encoding supports at most {PATH_ID_STRIDE - 1} per from_id'
    )

# df is sorted by from_id (step 3), so the rows of each from_id are already
# contiguous and no regrouping is needed; only the index is replaced.
res_df = df.copy()
res_df.index = pd.Index(
    (res_df['from_id'] * PATH_ID_STRIDE + rank_in_origin).to_numpy(),
    name='path_id',
)

res_df.to_csv(TARGET_DIR / 'treat3run.csv')

In [110]:
# Display the final frame; its index is the path_id built in step 5.
res_df

Unnamed: 0_level_0,from_id,to_id,transport_id,price_min_EUR,duration_min
path_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
80001,8,9,1,311,340
80002,8,10,1,183,690
80003,8,11,1,229,545
80004,8,12,1,199,1435
80005,8,13,1,119,150
...,...,...,...,...,...
6630281,663,550,1,494,1760
6630282,663,552,1,416,990
6630283,663,553,1,562,1570
6630284,663,554,1,548,1660


In [113]:
# Number of routes per transport_id, most frequent first.
res_df['transport_id'].value_counts(sort=True, ascending=False)

1     72432
2      6265
3      2026
8      1215
10       86
Name: transport_id, dtype: int64