In [208]:
import os
import shutil
import datetime as dt
import csv
import json
import pandas as pd

# Project-local S3 helper. Later cells use `define.bucket` to list objects and
# `define.upload_file_to_s3` to upload the generated CSV files.
from resources import ConfigS3
define = ConfigS3()

# Region, credentials and bucket name are read from the project-local `config`
# module instead of being written into the notebook.
import config as CONFIG
import boto3
# Low-level S3 client; get_df_from_json uses it to download individual objects.
client_s3 = boto3.client('s3',
                    region_name=CONFIG.S3_REGION,
                    aws_access_key_id=CONFIG.S3_ACCESS_KEY,
                    aws_secret_access_key=CONFIG.S3_SECRET_ACCESS_KEY
                    )

# Width of one time bucket used by parse_date_and_period.
PERIOD_LENGTH = 5  # minutes
# Local folder name mirroring the S3 input prefix (not referenced by the visible cells).
SOURCE_DIR = os.path.join('.', 'tomtom-voh')
# Local folder where the per-day result CSV files are written before upload.
TRAIN_DIR = os.path.join('.', 'is_hot_tomtom_segment_status_with_velocity')

# exist_ok=True makes this safe to re-run and avoids the check-then-create race
# of testing os.path.exists() first.
os.makedirs(TRAIN_DIR, exist_ok=True)

def velocity_to_los(velocity):
    """Translate a velocity into a LOS grade letter.

    The grade is chosen by the first upper bound the velocity is strictly
    below: <15 -> 'F', <20 -> 'E', <25 -> 'D', <30 -> 'C', <35 -> 'B'.
    Anything else (35 and above) is graded 'A'.
    """
    upper_bounds = ((15, 'F'), (20, 'E'), (25, 'D'), (30, 'C'), (35, 'B'))
    for limit, grade in upper_bounds:
        if velocity < limit:
            return grade
    return 'A'


def los_to_velocity(los):
    """Return the velocity associated with a LOS grade letter.

    Grades 'A' through 'F' map to 35, 30, 25, 20, 15 and 10 respectively.
    Any other grade falls back to 45.
    """
    velocity_by_los = {
        'A': 35,
        'B': 30,
        'C': 25,
        'D': 20,
        'E': 15,
        'F': 10,
    }
    # dict.get supplies the fallback for unknown grades. The previous
    # `mapping[los] or 45` raised KeyError before the `or` was ever evaluated,
    # so the 45 default could never be returned.
    return velocity_by_los.get(los, 45)


def parse_date_and_period(timestamp):
    """Split a Unix timestamp into date, period label, weekday and morning flag.

    The timestamp is converted with ``datetime.fromtimestamp``, i.e. in the
    local time zone of the running kernel.

    Returns a 4-tuple:
      * the date as a ``YYYY-MM-DD`` string,
      * ``"period_HH_MM"`` where MM is the minute floored to a multiple of
        PERIOD_LENGTH (both parts zero-padded to two digits),
      * the weekday number from ``datetime.weekday()`` (Monday is 0),
      * 1 when the hour is <= 12 (so the 12:xx hour counts as morning), else 0.
    """
    moment = dt.datetime.fromtimestamp(timestamp)

    # Floor minute+second to the start of its PERIOD_LENGTH-minute bucket.
    seconds_into_hour = moment.minute * 60 + moment.second
    bucket_minute = PERIOD_LENGTH * (seconds_into_hour // (PERIOD_LENGTH * 60))

    period = "period_{:02d}_{:02d}".format(moment.hour, bucket_minute)
    is_morning = int(moment.hour <= 12)
    return str(moment.date()), period, moment.weekday(), is_morning


def reset():
    """Delete the local TRAIN_DIR folder and everything inside it.

    Does nothing when the folder is already absent, so the function can be
    called repeatedly. The folder is not recreated here.
    """
    if os.path.exists(TRAIN_DIR):
        shutil.rmtree(TRAIN_DIR)

def get_period_from_timestamp(timestamp):
    """Return a coarse period label for a Unix timestamp.

    The timestamp is converted with ``datetime.fromtimestamp`` (kernel-local
    time). Hours 0-5, 9-15 and 19-23 get a single label per hour,
    ``"period_<hour>"``. The remaining hours (6-8 and 16-18) are split in two:
    ``"period_<hour>"`` for minutes 0-29 and ``"period_<hour>_30"`` for
    minutes 30-59. The hour is not zero-padded.
    """
    moment = dt.datetime.fromtimestamp(timestamp)
    hour, minute = moment.hour, moment.minute

    # datetime.hour is always within 0..23, so no special case for hour 24 is needed.
    hourly_only = 0 <= hour <= 5 or 9 <= hour <= 15 or 19 <= hour <= 23
    if not hourly_only and minute >= 30:
        return "period_{hour}_30".format(hour=hour)
    return "period_{hour}".format(hour=hour)

def get_seg_weather_data(timestamp, weather_data):
    """Look up the weather label and temperature recorded for a timestamp.

    ``weather_data`` is indexed by ``str(timestamp)``; the entry is expected to
    hold ``["weather"][0]["main"]`` and ``["main"]["temp"]``.

    Returns ``(weather, temperature)``. Each value that cannot be read stays
    an empty string, so a missing timestamp yields ``("", "")`` and an entry
    with a weather label but no temperature yields ``(label, "")``.
    """
    weather = ""
    temperature = ""
    try:
        time_data = weather_data[str(timestamp)]
        weather = time_data["weather"][0]["main"]
        temperature = time_data["main"]["temp"]
    except (LookupError, TypeError):
        # Best-effort lookup: a missing key/index or a malformed (non-subscriptable)
        # entry keeps the defaults. Unlike a bare `except`, this no longer
        # swallows KeyboardInterrupt or unrelated programming errors.
        pass
    return weather, temperature

In [209]:
# Load the selected cover points and build a segment_id -> district lookup.
# When a segment id appears under several points, the last one listed wins.
with open('selected_points.json', 'r') as points_file:
    cover_points = json.load(points_file)

seg_dicts = {
    segment["segment_id"]: point["district"]
    for point in cover_points
    for segment in point["segment_ids"]
}

In [210]:
def get_df_from_json(csv_file_path):
    """Download one JSON object from S3 and turn it into a per-segment DataFrame.

    ``csv_file_path`` is the S3 key of the object (despite the name, the body
    is parsed as JSON). The file name up to the first '.' is read as a Unix
    timestamp and converted with parse_date_and_period. Each top-level JSON
    entry becomes one row; its key is stored as ``segment_id`` and its
    ``'velocity'`` value as ``tomtom_velocity``.

    ``isHot`` is always True, and ``weather`` / ``temperature`` are the fixed
    values "few clouds" / 27. ``district`` comes from ``seg_dicts`` using the
    integer form of the key.

    Any exception is printed and swallowed; rows collected before the failure
    are still returned, so the result may be partial or empty.
    """
    columns = ['period', 'segment_id',
               'date', 'weekday', 'tomtom_velocity', 'base_LOS', 'isHot',
               'weather', 'temperature', 'district', 'is_morning']
    rows = []
    try:
        file_name = csv_file_path.rsplit('/', 1)[-1]
        epoch_text = file_name.partition('.')[0]
        date, period, weekday, is_morning = parse_date_and_period(int(epoch_text))

        s3_object = client_s3.get_object(Bucket=CONFIG.S3_BUCKET, Key=csv_file_path)
        segments = json.load(s3_object['Body'])

        # Rows are appended one by one (not via a comprehension) so that a
        # failure part-way through keeps the rows already built.
        for segment_key, segment in segments.items():
            speed = segment['velocity']
            rows.append([
                period, segment_key, date, weekday, speed, velocity_to_los(speed),
                True, "few clouds", 27, seg_dicts[int(segment_key)], is_morning,
            ])
    except Exception as e:
        print("An error occurred:", e)

    return pd.DataFrame(rows, columns=columns)


In [211]:
# Smoke test: load a single S3 object and keep the parsed frame for inspection.
# NOTE(review): the key sits under the 2023-04-19 prefix, yet the output below
# reports date 2023-04-20 / period_06_55, because parse_date_and_period converts
# the timestamp in the kernel's local time zone. Presumably the prefix is a UTC
# date; confirm.
test_path = "tomtom-voh/2023-04-19/1681948757.json"
df_result = get_df_from_json(test_path)

In [212]:
# Preview the first five rows of the frame parsed in the previous cell.
df_result.head(5)

Unnamed: 0,period,segment_id,date,weekday,tomtom_velocity,base_LOS,isHot,weather,temperature,district,is_morning
0,period_06_55,23795,2023-04-20,3,18,E,True,few clouds,27,quan_go_vap,1
1,period_06_55,60703,2023-04-20,3,18,E,True,few clouds,27,quan_go_vap,1
2,period_06_55,23794,2023-04-20,3,18,E,True,few clouds,27,quan_go_vap,1
3,period_06_55,60704,2023-04-20,3,18,E,True,few clouds,27,quan_go_vap,1
4,period_06_55,60538,2023-04-20,3,18,E,True,few clouds,27,quan_go_vap,1


In [213]:
def _duration_from_history(x, df_data, value_column):
    """Shared implementation of get_duration and get_duration_with_velocity.

    Among the rows of ``df_data`` that have the same ``segment_id`` and
    ``is_morning`` as ``x``, picks the one with the smallest ``period`` label.
    When ``value_column`` holds the same value in that row and in ``x``, the
    result is that row's ``duration`` plus one; in every other case
    (no matching row, different value, unreadable data) the result is 1.
    """
    try:
        candidates = df_data.loc[
            (df_data.segment_id == x.segment_id) & (df_data.is_morning == x.is_morning)
        ]
        if candidates.empty:
            # No earlier observation for this segment/half-day: start a new run.
            return 1
        # Sorting descending and taking the last row selects the smallest period label.
        reference = candidates.sort_values(by=['period'], ascending=False).iloc[-1]
        if reference[value_column] == x[value_column]:
            return int(reference.duration) + 1
    except (AttributeError, LookupError, TypeError, ValueError):
        # Best-effort: missing columns or a non-numeric duration restart the
        # run at 1. Unlike a bare `except`, KeyboardInterrupt still propagates,
        # so a long DataFrame.apply can be interrupted.
        pass
    return 1


def get_duration(x, df_data):
    """Run length for row ``x`` based on an unchanged ``base_LOS``.

    ``x`` is one row (a Series) and ``df_data`` the frame of already processed
    rows, which must carry a ``duration`` column. Returns an int >= 1.
    """
    return _duration_from_history(x, df_data, 'base_LOS')


def get_duration_with_velocity(x, df_data):
    """Run length for row ``x`` based on an unchanged ``tomtom_velocity``.

    Same contract as get_duration, but compares the raw velocity instead of
    the LOS grade. Returns an int >= 1.
    """
    return _duration_from_history(x, df_data, 'tomtom_velocity')

In [214]:
def get_result_df(list_path_files):
    """Build one DataFrame from a list of S3 files, adding a ``duration`` column.

    ``list_path_files`` is a list of dicts with a ``"file_path"`` entry (the S3
    key), processed in the given order. Every row of the first file gets
    duration 1. For each later file, duration is computed per row with
    get_duration_with_velocity against all rows accumulated so far, and the
    file's rows are then appended. (get_duration can be substituted to compare
    the LOS grade instead of the raw velocity.)

    Raises IndexError when ``list_path_files`` is empty.
    """
    # Read the first file exactly once. It was previously downloaded and parsed
    # twice, and a differing second read would have broken the column assignment.
    result_df = get_df_from_json(list_path_files[0]["file_path"])
    result_df['duration'] = [1] * result_df.shape[0]

    for item in list_path_files[1:]:
        temp_df = get_df_from_json(item["file_path"])
        # Each file is compared against everything accumulated before it, so
        # the frames cannot simply be collected and concatenated at the end.
        temp_df['duration'] = temp_df.apply(
            get_duration_with_velocity, axis=1, args=(result_df,)
        )
        result_df = pd.concat([result_df, temp_df])

    return result_df

In [215]:
# Days to process; each one corresponds to an S3 prefix 'tomtom-voh/<date>'.
list_date = [
    "2023-04-19", "2023-04-20", "2023-04-21", "2023-04-22", "2023-04-23",
    "2023-04-24", "2023-04-25", "2023-04-26", "2023-04-27", "2023-04-28",
    "2023-04-29", "2023-04-30", "2023-05-01", "2023-05-02", "2023-05-03",
    "2023-05-04", "2023-05-05", "2023-05-06", "2023-05-07", "2023-05-08",
    "2023-05-09", "2023-05-10", "2023-05-11",
]

for date_item in list_date:
    folder_path = 'tomtom-voh/' + date_item
    list_file = define.bucket.objects.filter(Prefix=folder_path)
    list_path_files = []
    for obj in list_file:
        f = obj.key.split('/')[-1]
        timestamp = f.split('.')[0]
        if not timestamp.isdigit():
            # Keys whose file name is not a Unix timestamp (e.g. an empty
            # "folder" placeholder key) cannot be parsed; skip them.
            continue
        period = parse_date_and_period(int(timestamp))[1]
        list_path_files.append(
            {
                "key": period,
                "file_path": obj.key
            }
        )

    if not list_path_files:
        # Nothing to aggregate for this day; keep going with the other days.
        print("No input files found under", folder_path)
        continue

    # Latest period label first.
    # NOTE(review): the sort key is the HH_MM label only. If one prefix holds
    # files from two local calendar dates, this order is not chronological; confirm.
    list_path_files = sorted(list_path_files, key=lambda k: k['key'], reverse=True)
    df_result = get_result_df(list_path_files)

    df_file_name = date_item + ".csv"
    local_csv_path = os.path.join(TRAIN_DIR, df_file_name)
    df_result.to_csv(local_csv_path)

    # S3 keys always use '/', whatever the local OS path separator is, so the
    # key is built by hand rather than with os.path.join.
    s3_key = "Result_with_velocity/" + df_file_name
    define.upload_file_to_s3(local_csv_path, s3_key)

In [184]:
# NOTE(review): leftover scratch cell (execution count 184 is out of order).
# It re-sorts whatever list_path_files is currently in the kernel, repeating
# the sort already done inside the per-date loop.
list_path_files = sorted(list_path_files, key=lambda k: k['key']  , reverse=True)

In [194]:
# NOTE(review): scratch re-run that depends on list_path_files left in the
# kernel by an earlier cell. The period labels printed below are not produced
# by any print in the visible functions; presumably they come from an earlier
# version of the code.
df_result = get_result_df(list_path_files)

period_19_55
period_19_55
period_19_50
period_19_45
period_19_40
period_19_30
period_19_25
period_19_20
period_19_15
period_19_10
period_19_05
period_19_00
period_18_55
period_18_50
period_18_45
period_18_40
period_18_35
period_18_30
period_18_25
period_18_20
period_18_15
period_18_10
period_18_05
period_18_00
period_17_55
period_17_50
period_17_45
period_17_40
period_17_35
period_17_30
period_17_25
period_17_20
period_17_15
period_17_10
period_17_05
period_17_00
period_16_55
period_16_50
period_16_45
period_16_40
period_16_30
period_16_25
period_16_20
period_16_15
period_16_10
period_16_05
period_16_00
period_15_55
period_10_25
period_10_20
period_10_15
period_10_10
period_10_05
period_10_00
period_09_55
period_09_50
period_09_45
period_09_40
period_09_35
period_09_30
period_09_25
period_09_20
period_09_15
period_09_10
period_09_05
period_09_00
period_08_55
period_08_50
period_08_45
period_08_40
period_08_35
period_08_30
period_08_25
period_08_20
period_08_15
period_08_10
period_08_05

In [203]:
# Manual export of a single day to the local TRAIN_DIR folder.
# NOTE(review): df_result is whatever an earlier cell left in the kernel;
# nothing in this cell ties it to date_item, so confirm the frame really holds
# the 2023-04-23 data before relying on the file.
date_item = "2023-04-23"
df_file_name = date_item + ".csv"
df_result.to_csv(os.path.join(TRAIN_DIR, df_file_name))

In [204]:
# Upload the CSV written by the previous cell.
# S3 keys always use '/', whatever the local OS path separator is, so the key
# is built by hand rather than with os.path.join.
# NOTE(review): this uses the "Result" prefix, while the per-date loop uploads
# under "Result_with_velocity"; confirm which one is intended.
s3_key = "Result/" + df_file_name
define.upload_file_to_s3(os.path.join(TRAIN_DIR, df_file_name), s3_key)