# In [3]:
import pandas as pd
import math
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt

# In [4]:
# Euclidean distance between two locations
def euclidean_distance(loc1, loc2):
    """Return the straight-line distance between two locations.

    Only the first two components (index 0 and index 1) of ``loc1`` and
    ``loc2`` are read; they are treated as x and y coordinates.
    """
    delta_x = loc1[0] - loc2[0]
    delta_y = loc1[1] - loc2[1]
    return math.sqrt(delta_x ** 2 + delta_y ** 2)
# Weekend check
def is_weekend(day):
    """Return whether the day index ``day`` falls on a weekend.

    This file's convention: ``day % 7 == 0`` is Sunday and
    ``day % 7 == 6`` is Saturday.
    """
    weekday = day % 7
    return weekday == 0 or weekday == 6
# Find the frequently visited places p1, p2, p3
def find_places(group, morning_earliest, morning_latest, afternoon_earliest, afternoon_latest, evening_earliest, evening_latest,home_x,home_y):
    """Return the most visited location for three time windows.

    Only rows with ``d <= 59`` are considered. The windows are:

    * p1: ``morning_earliest <= t <= morning_latest``
    * p2: ``afternoon_earliest <= t <= afternoon_latest``
    * p3: ``t >= evening_earliest`` or ``t <= morning_earliest``

    When either bound a window depends on is ``None`` the window is not
    applied and every remaining row is counted. ``evening_latest`` is
    accepted but never read. A window with no rows falls back to
    ``(home_x, home_y)``.

    Returns:
        A tuple ``(p1, p2, p3)`` of ``(x, y)`` pairs.
    """
    history = group[group['d'] <= 59]
    slots = history['t']

    def most_visited(rows):
        # Most frequent (x, y) pair among the rows, or home when there are none.
        ranked = Counter(zip(rows['x'], rows['y'])).most_common(1)
        return ranked[0][0] if ranked else (home_x, home_y)

    # Morning window
    if morning_earliest is None or morning_latest is None:
        morning_rows = history
    else:
        morning_rows = history[(slots >= morning_earliest) & (slots <= morning_latest)]

    # Afternoon window
    if afternoon_earliest is None or afternoon_latest is None:
        afternoon_rows = history
    else:
        afternoon_rows = history[(slots >= afternoon_earliest) & (slots <= afternoon_latest)]

    # Evening / early-morning window (wraps around the end of the day)
    if evening_earliest is None or morning_earliest is None:
        night_rows = history
    else:
        night_rows = history[(slots >= evening_earliest) | (slots <= morning_earliest)]

    return most_visited(morning_rows), most_visited(afternoon_rows), most_visited(night_rows)

# Commute check
def is_commuting(row1, row2):
    """Return True when two records are at least 6 distance units apart.

    Both arguments must support ``['x']`` and ``['y']`` lookups; the
    distance is computed with ``euclidean_distance``.
    """
    origin = (row1['x'], row1['y'])
    destination = (row2['x'], row2['y'])
    return euclidean_distance(origin, destination) >= 6
# Commute pattern analysis
def analyze_commuting_pattern(group):
    """Collect the time slots at which a commute-sized move starts.

    Rows with ``d <= 59`` are walked in their existing order; whenever a
    row and the row right after it satisfy ``is_commuting``, the ``t``
    value of the first row is recorded.

    Returns:
        A list of ``t`` values, in row order (may contain repeats).
    """
    history = group[group['d'] <= 59]
    pair_count = len(history) - 1
    return [
        history.iloc[pos]['t']
        for pos in range(pair_count)
        if is_commuting(history.iloc[pos], history.iloc[pos + 1])
    ]
def filter_non_continuous_periods(commuting_periods, threshold=2):
    """Drop time slots that are not contiguous with the slot before them.

    The first slot is always kept. Every later slot is kept only when its
    absolute difference from the slot immediately before it *in the input*
    (not the last kept slot) is at most ``threshold``.

    Args:
        commuting_periods: Indexable sequence of numeric time slots.
        threshold: Largest allowed gap between neighbouring slots.

    Returns:
        A new list with the retained slots; ``[]`` for any empty sequence.
    """
    # Truthiness covers every empty sequence; the former `!= []` test let an
    # empty tuple through and then failed on the `[0]` lookup.
    if not commuting_periods:
        return []

    kept = [commuting_periods[0]]
    kept.extend(
        current
        for previous, current in zip(commuting_periods, commuting_periods[1:])
        if abs(current - previous) <= threshold
    )
    return kept
# Split commute slots into morning, afternoon and evening
def split_time_slot(commuting_periods):
    """Return the dominant commute slots for each part of the day.

    Slots are bucketed as morning (``12 <= t < 24``), afternoon
    (``24 <= t < 34``) and evening (``34 <= t < 44``); anything else is
    ignored. For each bucket the four most frequent slots are taken,
    sorted ascending and passed through ``filter_non_continuous_periods``.

    Returns:
        A tuple ``(morning, afternoon, evening)`` of lists of slots.
    """
    windows = ((12, 24), (24, 34), (34, 44))
    buckets = [[] for _ in windows]

    # One pass over the input, preserving order inside each bucket so that
    # frequency ties are resolved exactly as insertion order dictates.
    for slot in commuting_periods:
        for bucket, (lower, upper) in zip(buckets, windows):
            if lower <= slot < upper:
                bucket.append(slot)
                break

    results = []
    for bucket in buckets:
        top_four = [slot for slot, _ in Counter(bucket).most_common(4)]
        results.append(filter_non_continuous_periods(sorted(top_four)))

    morning_slots, afternoon_slots, evening_slots = results
    return morning_slots, afternoon_slots, evening_slots
def linear_interpolate(location1, location2, t, t_start, t_end):
    """Linearly interpolate a position between two locations at time ``t``.

    Args:
        location1: Position at ``t_start``; indices 0 and 1 are x and y.
        location2: Position at ``t_end``; indices 0 and 1 are x and y.
        t: Time at which the position is wanted.
        t_start: Time associated with ``location1``.
        t_end: Time associated with ``location2``.

    Returns:
        ``location1`` unchanged when ``t_start == t_end`` (avoids dividing
        by zero); otherwise a tuple of the rounded x and y coordinates.
    """
    if t_start == t_end:
        return location1

    fraction = (t - t_start) / (t_end - t_start)
    start_x, start_y = location1[0], location1[1]
    end_x, end_y = location2[0], location2[1]

    interpolated_x = start_x + fraction * (end_x - start_x)
    interpolated_y = start_y + fraction * (end_y - start_y)
    return round(interpolated_x), round(interpolated_y)

# In [5]:
# Distance between the points (x1, y1) and (x2, y2)
def calculate_distance(x1, y1, x2, y2):
    """Return the Euclidean distance between (x1, y1) and (x2, y2)."""
    delta_x = x2 - x1
    delta_y = y2 - y1
    return sqrt(delta_x ** 2 + delta_y ** 2)

def classify_person(df_person, threshold=0.5):
    """Classify a user as "regular" or "irregular" from weekend history.

    1. For every time slot ``t`` seen on a weekend day with ``d <= 59``,
       compute the share of those weekend days on which the user was at
       the slot's single most common ``(x, y)`` location. Example: if slot
       ``t=10`` was observed on 8 weekend days and 4 of them were at the
       same place, the ratio is 0.5. A slot whose ratio is ``>= threshold``
       is a fixed slot.
    2. If the share of fixed slots among all observed weekend slots is
       strictly greater than ``threshold`` the user is "regular",
       otherwise "irregular". A user with no weekend rows is "regular".

    Args:
        df_person: Rows of one user with columns ``d``, ``t``, ``x``, ``y``.
        threshold: Cut-off used for both the per-slot ratio and the overall
            share of fixed slots.

    Returns:
        The string "regular" or "irregular".
    """
    history = df_person[df_person['d'] <= 59]

    # d % 7 == 0 is Sunday and d % 7 == 6 is Saturday.
    weekends_data = history[(history['d'] % 7).isin([6, 0])]

    # Fixed ratio for every observed time slot.
    fixed_ratios = {}
    for t, slot_rows in weekends_data.groupby('t'):
        days_observed = slot_rows['d'].nunique()
        # Count distinct days per exact (x, y) location. Pivoting on y alone
        # would lump together different places that merely share a y value.
        days_at_top_location = slot_rows.groupby(['x', 'y'])['d'].nunique().max()
        fixed_ratios[t] = days_at_top_location / days_observed

    fixed_time_slots = sum(1 for ratio in fixed_ratios.values() if ratio >= threshold)
    total_time_slots = len(fixed_ratios)
    print("fixed_time_slots=",fixed_time_slots)
    print("total_time_slots=",total_time_slots)

    if total_time_slots == 0:
        return "regular"
    if fixed_time_slots / total_time_slots > threshold:
        return "regular"
    return "irregular"
    

#     return nearest_popular_location
def home_weekend(user_data):
    """Return the most frequent location outside the 24..44 slot window.

    Rows with ``t <= 24`` or ``t >= 44`` are counted by ``(x, y)`` pair.
    NOTE(review): the window is read as "early or late slots" because a
    slot cannot be both ``<= 24`` and ``>= 44`` — confirm the intended
    bounds with the author.

    Args:
        user_data: Rows with columns ``t``, ``x`` and ``y``.

    Returns:
        ``Counter.most_common(1)`` of the locations: a one-element list
        ``[((x, y), count)]``, or ``[]`` when no row matches.
    """
    # Use the parameter (the body used to read an unbound local) and combine
    # the two bounds with OR so the mask can actually select rows.
    off_hours = user_data[(user_data['t'] <= 24) | (user_data['t'] >= 44)]
    location_counter = Counter(zip(off_hours['x'], off_hours['y']))
    return location_counter.most_common(1)
    

# In [6]:
def home_60days(df,uid):
    """Estimate the home location of ``uid`` from days ``d <= 59``.

    Home is the most frequent ``(x, y)`` pair during the sleep slots
    (``t >= 44`` or ``t <= 12``). When the user has no row in those slots,
    the most frequent pair over all of the user's rows is used instead.
    The chosen location is also printed.

    Args:
        df: Frame with columns ``uid``, ``d``, ``t``, ``x`` and ``y``.
        uid: Identifier of the user to look up.

    Returns:
        The ``(x, y)`` tuple chosen as home.

    Raises:
        IndexError: If the user has no rows at all with ``d <= 59``.
    """
    user_rows = df[(df['uid'] == uid) & (df['d'] <= 59)]

    # Slots from 44 wrapping around to 12 are treated as sleep time.
    sleep_rows = user_rows[(user_rows['t'] >= 44) | (user_rows['t'] <= 12)]
    # Both coordinates must come from the same filtered frame; pairing the
    # filtered x with the unfiltered y would produce misaligned locations.
    sleep_location_counter = Counter(zip(sleep_rows['x'], sleep_rows['y']))

    home_most_common = sleep_location_counter.most_common(1)
    if home_most_common:
        home = home_most_common[0][0]
    else:
        # No sleep-time record: fall back to the overall most visited place.
        all_location_counter = Counter(zip(user_rows['x'], user_rows['y']))
        home = all_location_counter.most_common(1)[0][0]

    print("home=",home)
    return home
    
# Pre-compute and store the home location of every uid
def calculate_all_homes(df):
    """Return a dict mapping each uid to its home location.

    Only rows with ``d <= 59`` are used. Each user's home is obtained from
    ``home_60days``.

    Args:
        df: Frame with columns ``uid``, ``d``, ``t``, ``x`` and ``y``.

    Returns:
        ``{uid: (x, y)}`` in order of first appearance of each uid.
    """
    recent = df[df['d'] <= 59]
    # Split the frame once instead of rescanning it for every uid.
    # sort=False keeps uids in first-appearance order, and rows keep their
    # relative order inside each group, so tie-breaking is unchanged.
    return {
        uid: home_60days(user_rows, uid)
        for uid, user_rows in recent.groupby('uid', sort=False)
    }

# Collect every uid whose home lies within distance n of (home_x, home_y)
def find_uids_within_distance(homes_dict, home_x, home_y, n):
    """Return the uids whose home is at most ``n`` away from a point.

    Args:
        homes_dict: Mapping of uid to an ``(x, y)`` home location.
        home_x: X coordinate of the reference point.
        home_y: Y coordinate of the reference point.
        n: Maximum distance (inclusive).

    Returns:
        A list of matching uids in the mapping's iteration order.
    """
    return [
        uid
        for uid, (user_home_x, user_home_y) in homes_dict.items()
        if calculate_distance(user_home_x, user_home_y, home_x, home_y) <= n
    ]
# Keep the rows inside a time window that fall on a weekend
def filter_data_by_time(df, time_start, time_end):
    """Return weekend rows with ``d <= 59`` and ``time_start <= t <= time_end``.

    Weekend membership is decided per row by ``is_weekend`` applied to the
    ``d`` column.
    """
    recent = df[df['d'] <= 59]
    in_window = (recent['t'] >= time_start) & (recent['t'] <= time_end)
    on_weekend = recent['d'].apply(is_weekend)
    return recent[in_window & on_weekend]
# Optimised lookup of the most popular location
def get_popular_locations_optimized(df, home_x, home_y, n, time_start, time_end, homes_dict):
    """Return the most popular weekend location near a home position.

    Users whose home (from ``homes_dict``) lies within distance ``n`` of
    ``(home_x, home_y)`` form the "life circle". Their weekend rows inside
    ``time_start..time_end`` (as selected by ``filter_data_by_time``) are
    counted per ``(x, y)``; the location with the highest share wins, ties
    broken by the smaller distance to the home position.

    Args:
        df: Frame with columns ``uid``, ``d``, ``t``, ``x`` and ``y``.
        home_x: X coordinate of the reference home.
        home_y: Y coordinate of the reference home.
        n: Radius of the life circle.
        time_start: First slot of the window (inclusive).
        time_end: Last slot of the window (inclusive).
        homes_dict: Mapping of uid to ``(x, y)`` home location.

    Returns:
        An ``(x, y)`` tuple; ``(home_x, home_y)`` when no row qualifies.
    """
    life_circle_uids_list = find_uids_within_distance(homes_dict, home_x, home_y, n)

    # filter_data_by_time already restricts rows to d <= 59, so the frame is
    # not pre-filtered by day here.
    df_filtered = filter_data_by_time(df[df['uid'].isin(life_circle_uids_list)], time_start, time_end)

    # Visit counts per location
    popular_locations = df_filtered.groupby(['x', 'y']).size().reset_index(name='count')
    if popular_locations.empty:
        return home_x, home_y  # nothing popular nearby: fall back to home

    # Share of visits and distance from home for every candidate
    popular_locations['ratio'] = popular_locations['count'] / popular_locations['count'].sum()
    popular_locations['distance'] = popular_locations.apply(
        lambda row: calculate_distance(home_x, home_y, row['x'], row['y']), axis=1
    )

    # Most popular first, nearest first among equally popular ones
    sorted_locations = popular_locations.sort_values(by=['ratio', 'distance'], ascending=[False, True])
    best = sorted_locations.index[0]
    # Read the coordinates column-wise so they keep their original dtype;
    # taking them from the mixed-dtype row would coerce them to float.
    return popular_locations.at[best, 'x'], popular_locations.at[best, 'y']

# In [1]:
df = pd.read_csv('data/CityC Challenge Data.csv')
# Keep uids 17000..19999 (the last 3000 users). The explicit copy makes the
# later in-place writes target an independent frame rather than a slice.
df = df[(df['uid'] >= 17000) & (df['uid'] <= 19999)].copy()
test_df = df[df['d'].apply(is_weekend)]  # weekend rows of those users

uids = test_df['uid'].unique()

threshold = 0.5  # regularity threshold
n = 30  # life-circle radius
# Pre-compute the home location of every uid
homes_dict = calculate_all_homes(df)
for uid in uids:
    user_data = test_df[(test_df['uid'] == uid)]  # weekend rows of this user

    # Regular or irregular?
    person_type = classify_person(user_data, threshold)
    # Home location of this uid
    home = homes_dict[uid]
    home_x, home_y = home[0], home[1]

    # Rows to predict: placeholder coordinates (999, 999) on days 60..74
    rows_to_fill = user_data[
        (user_data['x'] == 999) & (user_data['y'] == 999)
        & (user_data['d'] >= 60) & (user_data['d'] <= 74)
    ]

    if person_type == "regular":

        # Time slots at which this user commutes
        commuting_periods = analyze_commuting_pattern(user_data)

        # Split them into morning, afternoon and evening commute slots
        morning_commuting, afternoon_commuting, evening_commuting = split_time_slot(commuting_periods)

        morning_earliest = min(morning_commuting, default=12)
        morning_latest = max(morning_commuting, default=12)
        afternoon_earliest = min(afternoon_commuting, default=34)
        afternoon_latest = max(afternoon_commuting, default=34)
        evening_earliest = min(evening_commuting, default=44)
        evening_latest = max(evening_commuting, default=44)

        # Frequently visited places (p1, p2, p3)
        p1, p2, p3 = find_places(user_data,
                                 morning_earliest, morning_latest,
                                 afternoon_earliest, afternoon_latest,
                                 evening_earliest, evening_latest, home_x, home_y)

        # Fill days 60-74
        for i, row in rows_to_fill.iterrows():
            t = row['t']

            # Between the morning and afternoon commutes: p1
            if morning_latest <= t < afternoon_earliest:
                df.at[i, 'x'], df.at[i, 'y'] = p1

            # Between the afternoon and evening commutes: p2
            elif afternoon_latest <= t < evening_earliest:
                df.at[i, 'x'], df.at[i, 'y'] = p2

            # From the evening commute until the morning commute: p3
            elif evening_latest <= t or t < morning_earliest:
                df.at[i, 'x'], df.at[i, 'y'] = p3

            elif morning_earliest <= t < morning_latest:
                # Commute from p3 to p1
                interpolated_location = linear_interpolate(p3, p1, t, morning_earliest, morning_latest)
                df.at[i, 'x'], df.at[i, 'y'] = interpolated_location

            elif afternoon_earliest <= t < afternoon_latest:
                # Commute from p1 to p2
                interpolated_location = linear_interpolate(p1, p2, t, afternoon_earliest, afternoon_latest)
                df.at[i, 'x'], df.at[i, 'y'] = interpolated_location

            elif evening_earliest <= t < evening_latest:
                # Commute from p2 to p3
                interpolated_location = linear_interpolate(p2, p3, t, evening_earliest, evening_latest)
                df.at[i, 'x'], df.at[i, 'y'] = interpolated_location

    else:  # irregular user
        # The popular-location lookup only reads rows with d <= 59, which this
        # loop never writes, so its result is constant per user and window.
        # Compute it lazily once instead of once per filled row.
        popular_by_window = {}
        for i, row in rows_to_fill.iterrows():
            t = row['t']
            if 24 <= t <= 34:
                window = (24, 34)  # afternoon
            elif 34 < t <= 44:
                window = (34, 44)  # evening (t == 34 belongs to the afternoon)
            else:
                window = None

            if window is None:
                # Outside both windows: fill with the home location
                df.at[i, 'x'], df.at[i, 'y'] = home_x, home_y
                continue

            if window not in popular_by_window:
                popular_by_window[window] = get_popular_locations_optimized(
                    df, home_x, home_y, n, window[0], window[1], homes_dict
                )
            df.at[i, 'x'], df.at[i, 'y'] = popular_by_window[window]


# Step 4: save the result as CSV, keeping the frame's shape unchanged
df.to_csv('cityC_weekend_fill7.csv', index=False)

print("預測結果已保存到 cityC_weekend_fill7.csv")