In [1]:
import pandas as pd
import math
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# 判斷是否為假日
def is_holiday(day):
    # 星期日：day % 7 == 0，星期六：day % 7 == 6，國定假日
    return (day % 7 == 0) or (day % 7 == 6) or (day in [1, 8, 29, 37, 50])

In [4]:
# 讀取數據
df = pd.read_csv('data/CityC Challenge Data.csv')

In [5]:
# 計算歐式距離
def euclidean_distance(loc1, loc2):
    """計算兩個地點之間的歐式距離"""
    dist=math.sqrt((loc1[0] - loc2[0])**2 + (loc1[1] - loc2[1])**2)
    #print("dist=",dist)
    return dist
# 找家與工作地
def find_home_and_work_via_time_slots(group, morning_latest, evening_earliest, evening_latest, morning_earliest):
    # 找出早上通勤到晚上通勤之間的工作時間段
    work_time_group = group[(group['t'] >= morning_latest) & (group['t'] <= evening_earliest)]
    work_location_counter = Counter(zip(work_time_group['x'], work_time_group['y']))
    work_most_common = work_location_counter.most_common(1)  # 找到出現最多次地點作為工作地
    
    # 找出晚上通勤到早上通勤之間的在家時間段
    home_time_group = group[(group['t'] >= evening_latest) | (group['t'] <= morning_earliest)]
    home_location_counter = Counter(zip(home_time_group['x'], home_time_group['y']))
    home_most_common = home_location_counter.most_common(1)  # 找到出現最多次地點作為家
    
    # 檢查是否找到家與工作地，避免出現空列表的情況
    home = home_most_common[0][0] if home_most_common else None
    work = work_most_common[0][0] if work_most_common else None
    
    return home, work

In [6]:
# 判斷是否為通勤
def is_commuting(row1, row2):
    dist = euclidean_distance((row1['x'], row1['y']), (row2['x'], row2['y']))
    if dist >= 6:
        return True
    else:
        return False

In [7]:
# 分析通勤模式
def analyze_commuting_pattern(group):
    commuting_periods = []
    for i in range(len(group) - 1):
        if is_commuting(group.iloc[i], group.iloc[i+1]):
            commuting_periods.append(group.iloc[i]['t'])  # 記錄通勤時間段
    return commuting_periods

In [8]:
def filter_non_continuous_periods(commuting_periods, threshold=2):
    """
    過濾掉不連續的時間段，相鄰時間段之間的差距必須小於等於 threshold，否則被踢除。
    """
    if(commuting_periods!=[]):
        filtered_periods = [commuting_periods[0]]  # 保留第一個時間段
        for i in range(1, len(commuting_periods)):
            if abs(commuting_periods[i] - commuting_periods[i - 1]) <= threshold:
                filtered_periods.append(commuting_periods[i])
        return filtered_periods
    else:
        return []

In [9]:
#分成早上通勤和晚上通勤
def split_time_slot(commuting_periods):
    """
    將通勤時間段分為早上通勤和晚上通勤，並找出各自最常見的前四個時間段。
    
    Args:
    - commuting_periods (list): 通勤時間段的列表
    
    Returns:
    - morning_top4 (list): 早上通勤最常見的前四個時間段
    - evening_top4 (list): 晚上通勤最常見的前四個時間段
    """
    morning_commuting = []
    evening_commuting = []
    
    # 將通勤時段分類為早上和晚上
    for i in commuting_periods:
        if i <= 24:
            morning_commuting.append(i)
        else:
            evening_commuting.append(i)
    
    # 計算早上和晚上的時間段頻率
    morning_counter = Counter(morning_commuting)
    evening_counter = Counter(evening_commuting)
    print("morning_counter=",morning_counter)
    print("evening_counter=",evening_counter)
    # 找出最常見的前四個時間段
    morning_top4 = [time for time, count in morning_counter.most_common(4)]
    evening_top4 = [time for time, count in evening_counter.most_common(4)]
    # 過濾掉不連續的時間段
    morning_top4_filtered = filter_non_continuous_periods(sorted(morning_top4))
    evening_top4_filtered = filter_non_continuous_periods(sorted(evening_top4))

    #如果morning_top4_filtered或evening_top4_filtered為空
    if morning_top4_filtered==[]:
        morning_top4_filtered=[13,14,15,16]
    if evening_top4_filtered==[]:
        evening_top4_filtered=[34,35,36,37]
    return morning_top4_filtered, evening_top4_filtered

In [10]:
def linear_interpolate(location1, location2, t, t_start, t_end):
    """
    根據兩個位置點進行線性插值，計算給定時間點的中間位置。

    Parameters:
    location1: tuple (x1, y1) - 開始點的座標
    location2: tuple (x2, y2) - 結束點的座標
    t: int - 需要插值的時間點
    t_start: int - 開始點的時間
    t_end: int - 結束點的時間

    Returns:
    (x, y) - 插值後的座標
    morning_earliest= 14 morning_latest= 16 evening_earliest= 34 evening_latest= 37
    uid= 2 home= (76, 92) work= (99, 109)
    """
    # 計算比例
    if t_end == t_start:
        return location1  # 避免除以0的情況

    ratio = (t - t_start) / (t_end - t_start)
    x = location1[0] + ratio * (location2[0] - location1[0])
    y = location1[1] + ratio * (location2[1] - location1[1])
    #print("ratio=",ratio,"x=",x,"y=",y)
    return round(x), round(y)

In [11]:
# Step 1: 取出 uid <= 3000 的資料作為測試集
test_df = df[~df['d'].apply(is_holiday)]
test_df = test_df [test_df['uid'] <= 1000]
test_df.to_csv('answer_weekday.csv', index=False)

# Step 2: 將 61-74 日的 x 和 y 欄位設為 999（即需要預測的部分）
test_df.loc[(test_df['d'] >= 61) & (test_df['d'] <= 74), ['x', 'y']] = 999

# Step 3: 填充缺失的 x 和 y
uids = test_df['uid'].unique()

for uid in uids:
    user_data = test_df[test_df['uid'] == uid]
    # 找出該用戶的通勤時間段
    commuting_periods = analyze_commuting_pattern(user_data)
    # 找出早上與晚上的通勤時段
    morning_commuting, evening_commuting = split_time_slot(commuting_periods)
    morning_earliest = min(morning_commuting)  # 早上最早的通勤時間
    morning_latest = max(morning_commuting)  # 早上最晚的通勤時間
    evening_earliest = min(evening_commuting)  # 晚上最早的通勤時間
    evening_latest = max(evening_commuting)  # 晚上最晚的通勤時間

    print("morning_earliest=",morning_earliest,"morning_latest=",morning_latest,"evening_earliest=",evening_earliest,"evening_latest=",evening_latest)

    # 找出該用戶 60 天內的家與工作地
    home, work = find_home_and_work_via_time_slots(user_data[user_data['d'] <= 60], 
                                                   morning_latest, evening_earliest, 
                                                   evening_latest, morning_earliest)
    print("uid=",uid,"home=",home,"work=",work)
    # 填寫 61-74 天的預測值
    if home is not None and work is not None:
        # 在 61-74 日的家時間段填 home，工作時間段填 work
        for i, row in user_data.iterrows():
            if 61 <= row['d'] <= 74:
                
                if morning_latest <= row['t'] <= evening_earliest:
                    #填入工作地
                    test_df.at[i, 'x'], test_df.at[i, 'y'] = work 
                elif row['t'] >= evening_latest or row['t'] <= morning_earliest:
                    #填入家
                    test_df.at[i, 'x'], test_df.at[i, 'y'] = home
                elif row['t']>morning_earliest and row['t']<morning_latest:
                    # 線性插值
                    interpolated_location = linear_interpolate( home, work, row['t'], morning_earliest, morning_latest)
                    test_df.at[i, 'x'], test_df.at[i, 'y'] = interpolated_location
                elif row['t']>evening_earliest and row['t']<evening_latest:
                    interpolated_location = linear_interpolate( work,home, row['t'], evening_earliest, evening_latest)
                    test_df.at[i, 'x'], test_df.at[i, 'y'] = interpolated_location

# Step 4: 將結果保存成 CSV 檔案
test_df.to_csv('fill_weekday.csv', index=False)

print("預測結果已保存到 fill_weekday.csv")

morning_counter= Counter({13: 6, 14: 6, 15: 5, 16: 5, 12: 2, 19: 2, 20: 1, 21: 1, 22: 1, 23: 1})
evening_counter= Counter({38: 4, 39: 3, 40: 3, 41: 3, 36: 2, 37: 2, 42: 2, 43: 2, 44: 1, 31: 1, 47: 1, 46: 1})
morning_earliest= 13 morning_latest= 16 evening_earliest= 38 evening_latest= 41
uid= 0 home= (28, 157) work= (28, 157)
morning_counter= Counter({13: 2, 24: 2, 12: 1, 23: 1, 18: 1})
evening_counter= Counter({34: 5, 25: 3, 26: 3, 36: 3, 35: 2, 33: 2, 31: 2, 42: 1, 43: 1, 40: 1, 41: 1, 46: 1, 45: 1, 37: 1, 28: 1, 29: 1, 32: 1})
morning_earliest= 12 morning_latest= 24 evening_earliest= 25 evening_latest= 36
uid= 1 home= (36, 192) work= (36, 192)
morning_counter= Counter({15: 10, 16: 4, 17: 3, 18: 3, 19: 3, 22: 3, 24: 3, 14: 2, 12: 2, 20: 2, 21: 2, 23: 2, 13: 1, 11: 1})
evening_counter= Counter({32: 10, 27: 6, 28: 6, 31: 4, 33: 4, 29: 2, 26: 2, 34: 2, 30: 1, 25: 1, 47: 1})
morning_earliest= 15 morning_latest= 18 evening_earliest= 27 evening_latest= 32
uid= 2 home= (34, 187) work= (34, 1

In [14]:
import geobleu
from geobleu import geobleu
import pandas as pd

# 讀取 answer.csv 和 fill.csv
answer_df = pd.read_csv('answer_weekday.csv')
fill_df = pd.read_csv('fill_weekday.csv')
answer_df=answer_df[answer_df['d']>=60]
fill_df=fill_df[fill_df['d']>=60]

# 初始化變量存儲所有分數
total_similarity = 0
total_dtw=0
count = 0

# 假設這兩個檔案都有 'uid', 'd', 't', 'x', 'y' 五個欄位
uids = answer_df['uid'].unique()

for uid in uids:
    # 取出每個 uid 的資料
    answer_user_data = answer_df[answer_df['uid'] == uid]
    fill_user_data = fill_df[fill_df['uid'] == uid]
    
    # 將時間順序與座標結合，形成 (d, t, x, y) 格式的列表
    reference_coords = list(zip(answer_user_data['d'], answer_user_data['t'], answer_user_data['x'], answer_user_data['y']))
    generated_coords = list(zip(fill_user_data['d'], fill_user_data['t'], fill_user_data['x'], fill_user_data['y']))
    
    # 計算該使用者的 GeoBLEU 分數
    similarity = geobleu.calc_geobleu(generated_coords, reference_coords, processes=3)
    dtw_val = geobleu.calc_dtw(generated_coords, reference_coords, processes=3)

    # 累加分數
    total_similarity += similarity
    total_dtw += dtw_val
    count += 1

# 計算平均分數
average_similarity = total_similarity / count if count > 0 else 0
avg_dtw = total_dtw / count if count > 0 else 0
print(f'Average GeoBLEU score: {average_similarity}')
print("avg_dtw : {}".format(avg_dtw))

Average GeoBLEU score: 0.3215419851690486
avg_dtw : 25.67837286363072
