In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn import preprocessing


In [31]:

# Location of the cleaned measurement export, relative to the notebook
clean_csv_path = 'ds_clean/ds_clean.csv'
# Read it into the working DataFrame and display it as the cell output
df = pd.read_csv(clean_csv_path)
df

Unnamed: 0,LocationCode,DateTime,WindSpeed(m/s),Pressure(hpa),Temperature(°C),Humidity(%),Sunlight(Lux),Power(mW)
0,10,2024-03-01 17:14:06,0.0,1017.48,15.59,94.30,652.92,0.12
1,10,2024-03-01 17:14:47,0.0,1017.48,15.66,94.04,682.50,0.12
2,10,2024-03-01 17:15:47,0.0,1017.47,15.74,94.10,750.00,0.14
3,10,2024-03-01 17:16:47,0.0,1017.46,15.78,94.09,738.33,0.14
4,10,2024-03-01 17:17:47,0.0,1017.49,15.80,94.08,660.83,0.12
...,...,...,...,...,...,...,...,...
1194888,9,2024-07-23 15:50:57,0.0,994.54,30.69,72.91,2288.33,1.10
1194889,9,2024-07-23 15:51:57,0.0,994.40,30.27,73.16,3236.67,1.92
1194890,9,2024-07-23 15:52:57,0.0,994.39,29.90,72.51,4526.67,3.57
1194891,9,2024-07-23 15:53:57,0.0,994.40,29.38,73.23,4231.67,3.13


In [32]:
# Derive calendar features from the 'DateTime' column, then drop the raw
# timestamp. The guard makes the cell safe to re-run: once 'DateTime' has been
# dropped, a second execution is a no-op instead of a KeyError.
if 'DateTime' in df.columns:
    # Parse the timestamps once and reuse them for both features
    timestamps = pd.to_datetime(df['DateTime'])
    df = df.assign(
        # Day of the year of each timestamp
        DayOfYear=timestamps.dt.dayofyear,
        # Whole minutes since the start of the day (seconds are discarded)
        MinutesSinceStartOfDay=timestamps.dt.hour * 60 + timestamps.dt.minute,
    ).drop(columns=['DateTime'])
# Display the updated DataFrame
df

Unnamed: 0,LocationCode,WindSpeed(m/s),Pressure(hpa),Temperature(°C),Humidity(%),Sunlight(Lux),Power(mW),DayOfYear,MinutesSinceStartOfDay
0,10,0.0,1017.48,15.59,94.30,652.92,0.12,61,1034
1,10,0.0,1017.48,15.66,94.04,682.50,0.12,61,1034
2,10,0.0,1017.47,15.74,94.10,750.00,0.14,61,1035
3,10,0.0,1017.46,15.78,94.09,738.33,0.14,61,1036
4,10,0.0,1017.49,15.80,94.08,660.83,0.12,61,1037
...,...,...,...,...,...,...,...,...,...
1194888,9,0.0,994.54,30.69,72.91,2288.33,1.10,205,950
1194889,9,0.0,994.40,30.27,73.16,3236.67,1.92,205,951
1194890,9,0.0,994.39,29.90,72.51,4526.67,3.57,205,952
1194891,9,0.0,994.40,29.38,73.23,4231.67,3.13,205,953


In [33]:
# Read the upload file whose rows are the queries to answer; display it
upload_csv_path = 'ds_clean/upload.csv'
question = pd.read_csv(upload_csv_path)
question

Unnamed: 0,序號,答案
0,20240101090001,
1,20240101091001,
2,20240101092001,
3,20240101093001,
4,20240101094001,
...,...,...
9595,20240711161017,
9596,20240711162017,
9597,20240711163017,
9598,20240711164017,


In [34]:
# Decode the '序號' serial into lookup features. Per the slices below, its
# characters are read as YYYYMMDD (0-7), HHMM (8-11) and location code (12-13).
# The guard makes the cell safe to re-run: once '序號' has been dropped, a
# second execution is a no-op instead of a KeyError.
if '序號' in question.columns:
    # Work on a string version of the serial so it can be sliced by position
    serial = question['序號'].astype(str)
    question = question.assign(
        # Day of the year taken from the leading date part
        DayOfYear=pd.to_datetime(serial.str[:8], format='%Y%m%d').dt.dayofyear,
        # HH * 60 + MM, computed column-wise rather than with a per-row lambda
        MinutesSinceStartOfDay=(
            serial.str[8:10].astype(int) * 60 + serial.str[10:12].astype(int)
        ),
        # Two-character location code
        LocationCode=serial.str[12:14].astype(int),
    ).drop(columns=['序號'])  # the raw serial is no longer needed

# Display the updated DataFrame
question

Unnamed: 0,答案,DayOfYear,MinutesSinceStartOfDay,LocationCode
0,,1,540,1
1,,1,550,1
2,,1,560,1
3,,1,570,1
4,,1,580,1
...,...,...,...,...
9595,,193,970,17
9596,,193,980,17
9597,,193,990,17
9598,,193,1000,17


In [35]:
# Use the row at position 0 of question as the query
row = question.iloc[0]

# Read the query's day of year and minute of day from that row
day, minute = row['DayOfYear'], row['MinutesSinceStartOfDay']

# Empty DataFrame that the neighbour search below starts from
feature_offset = pd.DataFrame()

# Define a function that looks up neighbouring data
def find_nearby_data(day, minute, df, feature_offset, max_records=180, min_days=2):
    """Collect rows of ``df`` recorded near a given day of year and minute of day.

    The search widens in a fixed order and returns as soon as the stop rule
    holds:

    1. the same day at exactly ``minute``;
    2. the same day at ``minute`` - k and ``minute`` + k for k = 1..60;
    3. for each day distance d = 1..365: day - d and day + d at exactly
       ``minute``, then both of those days at ``minute`` -/+ k for k = 1..60.

    Stop rule: at least ``max_records`` rows have been collected AND the largest
    absolute ``DayOffset`` among them is at least ``min_days``. The rule is
    evaluated after steps 1, after every k in step 2, after the exact-minute
    lookups of every d in step 3, and after every k inside step 3.

    Day arithmetic is plain subtraction/addition on ``DayOfYear``; it does not
    wrap around a year boundary, so out-of-range days simply match nothing.
    Location is not used as a filter: every row of ``df`` at a matching
    day/minute is included.

    Parameters
    ----------
    day, minute : number
        Query day of year and minute of day, compared with ``==`` against the
        ``DayOfYear`` and ``MinutesSinceStartOfDay`` columns.
    df : pandas.DataFrame
        Data to search; must contain ``DayOfYear`` and
        ``MinutesSinceStartOfDay``. It is not modified.
    feature_offset : pandas.DataFrame
        Rows collected so far (may be empty). They are kept at the top of the
        result and count towards the stop rule.
    max_records : int, default 180
        Minimum number of collected rows required before stopping.
    min_days : int, default 2
        Minimum value of the largest absolute ``DayOffset`` required before
        stopping.

    Returns
    -------
    pandas.DataFrame
        ``feature_offset`` followed by the matched rows in discovery order,
        keeping their original index labels, with two added columns:
        ``DayOffset`` and ``MinuteOffset`` (signed distance from the query).
    """
    day_column = df['DayOfYear']

    # Matched frames are gathered in a list and concatenated once at the end;
    # concatenating inside the loops would re-copy all earlier rows each time.
    pieces = [feature_offset]
    total_rows = len(feature_offset)
    # Running maximum of |DayOffset|; NaN means "no row seen yet", which makes
    # the stop rule false exactly as max() of an empty column would.
    if 'DayOffset' in feature_offset.columns:
        max_abs_day = feature_offset['DayOffset'].abs().max()
    else:
        max_abs_day = float('nan')

    def rows_on(target_day):
        # All rows of df recorded on one day; done once per day so the
        # per-minute lookups only scan that day's rows.
        return df[day_column == target_day]

    def collect(day_rows, day_offset, minute_offset, keep_empty=False):
        # Tag the rows of day_rows at minute + minute_offset and remember them.
        nonlocal total_rows, max_abs_day
        hits = day_rows[day_rows['MinutesSinceStartOfDay'] == minute + minute_offset]
        if len(hits) == 0 and not keep_empty:
            return  # an empty match adds no rows to the result
        hits = hits.copy()  # never write the offset columns into df itself
        hits['DayOffset'] = day_offset
        hits['MinuteOffset'] = minute_offset
        pieces.append(hits)
        if len(hits):
            total_rows += len(hits)
            # Written as "not <=" so that it is also true while the maximum is NaN
            if not abs(day_offset) <= max_abs_day:
                max_abs_day = abs(day_offset)

    def enough():
        # Stop rule: enough rows, spread over enough days.
        return total_rows >= max_records and max_abs_day >= min_days

    def result():
        # Single concatenation of everything gathered so far.
        return pd.concat(pieces)

    # Step 1: same day, same minute. Kept even when empty so that the result
    # always carries df's columns plus the two offset columns.
    same_day = rows_on(day)
    collect(same_day, 0, 0, keep_empty=True)
    if enough():
        return result()

    # Step 2: same day, up to 60 minutes before and after
    for offset in range(1, 61):
        collect(same_day, 0, -offset)
        collect(same_day, 0, offset)
        if enough():
            return result()

    # Step 3: neighbouring days, nearest first
    for day_offset in range(1, 366):
        earlier = rows_on(day - day_offset)
        later = rows_on(day + day_offset)
        collect(earlier, -day_offset, 0)
        collect(later, day_offset, 0)
        if enough():
            return result()

        # Neither day has any data: the minute sweep cannot add rows, and the
        # stop rule cannot change without new rows.
        if len(earlier) == 0 and len(later) == 0:
            continue

        # Same two days, up to 60 minutes before and after
        for offset in range(1, 61):
            collect(earlier, -day_offset, -offset)
            collect(earlier, -day_offset, offset)
            collect(later, day_offset, -offset)
            collect(later, day_offset, offset)
            if enough():
                return result()

    return result()

# Look up the data recorded near the query's day and minute
feature_offset = find_nearby_data(day, minute, df, feature_offset)

# Keep only the offset/location columns, then append one extra row for the
# query itself: [DayOffset = 0, MinuteOffset = 0, LocationCode = row's LocationCode]
selected_features = ['DayOffset','MinuteOffset','LocationCode']
new_row = pd.DataFrame({'DayOffset': [0], 'MinuteOffset': [0], 'LocationCode': [row['LocationCode']]})
# NOTE(review): the query row is appended last, so head(500) would cut it off
# whenever the search returns 500 or more rows - confirm this is intended.
feature_offset = pd.concat(
    [feature_offset.loc[:, selected_features], new_row],
    ignore_index=True,
).head(500)  # keep only the first 500 records


# Display the updated feature_offset
feature_offset

Unnamed: 0,DayOffset,MinuteOffset,LocationCode
0,0,0,17.0
1,0,-1,17.0
2,0,-1,1.0
3,0,1,17.0
4,0,-2,17.0
...,...,...,...
420,1,60,17.0
421,1,60,1.0
422,2,0,17.0
423,2,0,1.0


In [36]:
# Gather the Power(mW) readings for every (DayOffset, MinuteOffset,
# LocationCode) entry of feature_offset. Matches are collected in a list and
# concatenated once; concatenating inside the loop would re-copy all earlier
# rows on every iteration.
power_pieces = []

# A dedicated loop-variable name is used so that the notebook-level `row`
# (the question row selected earlier) is not overwritten by this loop.
for offset_row in feature_offset.itertuples(index=False):
    # Rows of df at the offset day, offset minute and the same location
    matching_rows = df[(df['DayOfYear'] == (day + offset_row.DayOffset)) &
                       (df['MinutesSinceStartOfDay'] == (minute + offset_row.MinuteOffset)) &
                       (df['LocationCode'] == offset_row.LocationCode)]

    # Keep the Power(mW) values of any matches; entries without a match are skipped
    if not matching_rows.empty:
        power_pieces.append(matching_rows[['Power(mW)']])

# No match at all leaves train_set as an empty DataFrame
if power_pieces:
    train_set = pd.concat(power_pieces, ignore_index=True)
else:
    train_set = pd.DataFrame()

# Rename the columns to Feature_<column position>; there is a single column
# here, so the result is 'Feature_0'
train_set.columns = [f'Feature_{i}' for i in range(train_set.shape[1])]

# Display train_set
train_set

Unnamed: 0,Feature_0
0,7.08
1,7.04
2,14.30
3,7.13
4,6.51
...,...
419,460.88
420,1609.00
421,1649.86
422,86.53
