In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn import preprocessing


In [2]:

# Read the cleaned sensor readings into a DataFrame and preview them
CLEAN_CSV_PATH = 'ds_clean/ds_clean.csv'
orig_df = pd.read_csv(CLEAN_CSV_PATH)
orig_df

Unnamed: 0,LocationCode,DateTime,WindSpeed(m/s),Pressure(hpa),Temperature(°C),Humidity(%),Sunlight(Lux),Power(mW)
0,10,2024-03-01 17:14:06,0.0,1017.48,15.59,94.30,652.92,0.12
1,10,2024-03-01 17:14:47,0.0,1017.48,15.66,94.04,682.50,0.12
2,10,2024-03-01 17:15:47,0.0,1017.47,15.74,94.10,750.00,0.14
3,10,2024-03-01 17:16:47,0.0,1017.46,15.78,94.09,738.33,0.14
4,10,2024-03-01 17:17:47,0.0,1017.49,15.80,94.08,660.83,0.12
...,...,...,...,...,...,...,...,...
1194888,9,2024-07-23 15:50:57,0.0,994.54,30.69,72.91,2288.33,1.10
1194889,9,2024-07-23 15:51:57,0.0,994.40,30.27,73.16,3236.67,1.92
1194890,9,2024-07-23 15:52:57,0.0,994.39,29.90,72.51,4526.67,3.57
1194891,9,2024-07-23 15:53:57,0.0,994.40,29.38,73.23,4231.67,3.13


In [3]:
# Parse the 'DateTime' strings once; orig_df itself is left untouched
timestamps = pd.to_datetime(orig_df['DateTime'])

# Replace the timestamp with two numeric features:
#   DayOfYear              - ordinal day within the year
#   MinutesSinceStartOfDay - hour * 60 + minute (seconds are discarded)
df = (
    orig_df
    .assign(
        DayOfYear=timestamps.dt.dayofyear,
        MinutesSinceStartOfDay=timestamps.dt.hour * 60 + timestamps.dt.minute,
    )
    .drop(columns=['DateTime'])
)

# Persist the time-encoded dataset
df.to_csv('ds_clean/ds_clean_timeset.csv', index=False)

In [4]:
# Read the submission template (one row per prediction to be made) and preview it
UPLOAD_CSV_PATH = 'ds_clean/upload.csv'
question = pd.read_csv(UPLOAD_CSV_PATH)
question

Unnamed: 0,序號,答案
0,20240101090001,
1,20240101091001,
2,20240101092001,
3,20240101093001,
4,20240101094001,
...,...,...
9595,20240711161017,
9596,20240711162017,
9597,20240711163017,
9598,20240711164017,


In [5]:
# Treat the '序號' serial as text so it can be sliced by character position
serial = question['序號'].astype(str)

# Decode the serial into features, then drop the raw serial column:
#   chars 0-7   -> date (YYYYMMDD)      -> DayOfYear
#   chars 8-11  -> time of day (HHMM)   -> MinutesSinceStartOfDay
#   chars 12-13 -> location code        -> LocationCode
question = (
    question
    .assign(
        DayOfYear=pd.to_datetime(serial.str[:8], format='%Y%m%d').dt.dayofyear,
        MinutesSinceStartOfDay=serial.str[8:12].apply(
            lambda hhmm: int(hhmm[:2]) * 60 + int(hhmm[2:4])
        ),
        LocationCode=serial.str[12:14].astype(int),
    )
    .drop(columns=['序號'])
)

# Show the updated DataFrame
question.head(50)

Unnamed: 0,答案,DayOfYear,MinutesSinceStartOfDay,LocationCode
0,,1,540,1
1,,1,550,1
2,,1,560,1
3,,1,570,1
4,,1,580,1
5,,1,590,1
6,,1,600,1
7,,1,610,1
8,,1,620,1
9,,1,630,1


In [6]:
# Use the first query row as a worked example
row = question.iloc[0]

# Target day-of-year and minute-of-day for that query
day, minute = row['DayOfYear'], row['MinutesSinceStartOfDay']

# Start from an empty frame that will accumulate the search results
feature_offset = pd.DataFrame()

# Helper that searches for readings near a given day/minute
def find_nearby_data(day, minute, df, feature_offset, max_records=10, min_days=0, min_minute=10):
    """Collect rows of ``df`` recorded close to a target day and minute.

    The search widens in a fixed order and returns at the first checkpoint
    where the stopping rule holds:

    1. rows at exactly (day, minute)                          -> checkpoint
    2. same day, minute offsets -1, +1, -2, +2, ... -60, +60  -> checkpoint
       after each -/+ pair
    3. for day offsets 1..365:
       a. previous day then next day at the same minute       -> checkpoint
       b. for minute offsets 1..60: previous day (-, +) then
          next day (-, +)                                     -> checkpoint
          after each group of four lookups

    Stopping rule: the number of collected rows is >= ``max_records`` AND the
    largest absolute ``DayOffset`` collected is >= ``min_days`` AND the
    largest absolute ``MinuteOffset`` collected is >= ``min_minute``. While
    nothing has been collected the rule is never satisfied.

    Offsets are added arithmetically to ``day`` and ``minute`` with no
    wrap-around, so a target value that does not occur in ``df`` simply
    matches no rows.

    Parameters
    ----------
    day, minute
        Target values compared for equality against ``df['DayOfYear']`` and
        ``df['MinutesSinceStartOfDay']``.
    df : pandas.DataFrame
        Data to search; must contain the two columns named above.
    feature_offset : pandas.DataFrame
        Rows to start from (callers pass an empty ``pd.DataFrame()``). Its
        rows count towards ``max_records``; it is not modified.
    max_records : int
        Minimum number of rows required before the search may stop.
    min_days, min_minute
        Minimum values the largest absolute day / minute offset must reach
        before the search may stop.

    Returns
    -------
    pandas.DataFrame
        ``feature_offset`` followed by every matched slice in search order,
        keeping the index labels from ``df``. Each matched row carries two
        extra columns, ``DayOffset`` and ``MinuteOffset``.
    """
    # Slices are gathered in a list and concatenated once on return.
    # Concatenating inside the loops would re-copy every collected row on
    # each iteration.
    pieces = [feature_offset]

    def abs_max(frame, column):
        # Largest absolute value in frame[column], or None when there is no
        # such value (missing column, no rows, or only NaN).
        if column not in frame.columns or frame.empty:
            return None
        value = frame[column].abs().max()
        return None if pd.isna(value) else value

    # Running totals that replace re-scanning the accumulated frame at every
    # checkpoint. None stands for "no value yet" (NaN in a column-wide max).
    total_rows = len(feature_offset)
    max_abs_day = abs_max(feature_offset, 'DayOffset')
    max_abs_minute = abs_max(feature_offset, 'MinuteOffset')

    def rows_for_day(target_day):
        # Filter by day once so the minute lookups scan only that day's rows.
        return df[df['DayOfYear'] == target_day]

    def collect(day_rows, day_offset, minute_offset):
        # Tag the rows of day_rows at (minute + minute_offset) and store them.
        nonlocal total_rows, max_abs_day, max_abs_minute
        matched = day_rows[day_rows['MinutesSinceStartOfDay'] == minute + minute_offset]
        matched = matched.copy()  # never write the offset columns into a view of df
        matched['DayOffset'] = day_offset
        matched['MinuteOffset'] = minute_offset
        pieces.append(matched)
        if len(matched):
            total_rows += len(matched)
            if max_abs_day is None or abs(day_offset) > max_abs_day:
                max_abs_day = abs(day_offset)
            if max_abs_minute is None or abs(minute_offset) > max_abs_minute:
                max_abs_minute = abs(minute_offset)

    def satisfied():
        return bool(
            total_rows >= max_records
            and max_abs_day is not None and max_abs_day >= min_days
            and max_abs_minute is not None and max_abs_minute >= min_minute
        )

    # 1. Same day, same minute
    same_day = rows_for_day(day)
    collect(same_day, 0, 0)
    if satisfied():
        return pd.concat(pieces)

    # 2. Same day, up to 60 minutes before/after
    for offset in range(1, 61):
        collect(same_day, 0, -offset)
        collect(same_day, 0, offset)
        if satisfied():
            return pd.concat(pieces)

    # 3. Neighbouring days, nearest first
    for day_offset in range(1, 366):
        earlier_day = rows_for_day(day - day_offset)
        later_day = rows_for_day(day + day_offset)

        # 3a. Same minute on the previous / next day
        collect(earlier_day, -day_offset, 0)
        collect(later_day, day_offset, 0)
        if satisfied():
            return pd.concat(pieces)

        # 3b. Up to 60 minutes before/after on those days
        for offset in range(1, 61):
            collect(earlier_day, -day_offset, -offset)
            collect(earlier_day, -day_offset, offset)
            collect(later_day, day_offset, -offset)
            collect(later_day, day_offset, offset)
            if satisfied():
                return pd.concat(pieces)

    return pd.concat(pieces)

# Gather the readings surrounding the query's day and minute
feature_offset = find_nearby_data(day, minute, df, feature_offset)

# Keep only the offset/location columns and append the query itself as
# (DayOffset=0, MinuteOffset=0, LocationCode=<query row's LocationCode>)
selected_features = ['DayOffset', 'MinuteOffset', 'LocationCode']
new_row = pd.DataFrame({'DayOffset': [0], 'MinuteOffset': [0], 'LocationCode': [row['LocationCode']]})
feature_offset = pd.concat(
    [feature_offset[selected_features], new_row],
    ignore_index=True,
).head(500)  # cap the feature set at the first 500 records

# Show the resulting offset table
feature_offset

Unnamed: 0,DayOffset,MinuteOffset,LocationCode
0,0,0,17.0
1,0,-1,17.0
2,0,-1,1.0
3,0,1,17.0
4,0,-2,17.0
5,0,-2,1.0
6,0,2,17.0
7,0,-3,17.0
8,0,-3,1.0
9,0,3,17.0


In [9]:
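# Redefinition of find_nearby_data: same search as before, but max_records defaults to 0,
# so the search stops as soon as the day/minute offset thresholds are reached.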
def find_nearby_data(day, minute, df, feature_offset, max_records=0, min_days=0, min_minute=10):
    """Collect rows of ``df`` recorded close to a target day and minute.

    The search widens in a fixed order and returns at the first checkpoint
    where the stopping rule holds:

    1. rows at exactly (day, minute)                          -> checkpoint
    2. same day, minute offsets -1, +1, -2, +2, ... -60, +60  -> checkpoint
       after each -/+ pair
    3. for day offsets 1..365:
       a. previous day then next day at the same minute       -> checkpoint
       b. for minute offsets 1..60: previous day (-, +) then
          next day (-, +)                                     -> checkpoint
          after each group of four lookups

    Stopping rule: the number of collected rows is >= ``max_records`` AND the
    largest absolute ``DayOffset`` collected is >= ``min_days`` AND the
    largest absolute ``MinuteOffset`` collected is >= ``min_minute``. While
    nothing has been collected the rule is never satisfied, so even with the
    default ``max_records=0`` at least one row must have been found.

    Offsets are added arithmetically to ``day`` and ``minute`` with no
    wrap-around, so a target value that does not occur in ``df`` simply
    matches no rows.

    Parameters
    ----------
    day, minute
        Target values compared for equality against ``df['DayOfYear']`` and
        ``df['MinutesSinceStartOfDay']``.
    df : pandas.DataFrame
        Data to search; must contain the two columns named above.
    feature_offset : pandas.DataFrame
        Rows to start from (callers pass an empty ``pd.DataFrame()``). Its
        rows count towards ``max_records``; it is not modified.
    max_records : int
        Minimum number of rows required before the search may stop.
    min_days, min_minute
        Minimum values the largest absolute day / minute offset must reach
        before the search may stop.

    Returns
    -------
    pandas.DataFrame
        ``feature_offset`` followed by every matched slice in search order,
        keeping the index labels from ``df``. Each matched row carries two
        extra columns, ``DayOffset`` and ``MinuteOffset``.
    """
    # Slices are gathered in a list and concatenated once on return.
    # Concatenating inside the loops would re-copy every collected row on
    # each iteration.
    pieces = [feature_offset]

    def abs_max(frame, column):
        # Largest absolute value in frame[column], or None when there is no
        # such value (missing column, no rows, or only NaN).
        if column not in frame.columns or frame.empty:
            return None
        value = frame[column].abs().max()
        return None if pd.isna(value) else value

    # Running totals that replace re-scanning the accumulated frame at every
    # checkpoint. None stands for "no value yet" (NaN in a column-wide max).
    total_rows = len(feature_offset)
    max_abs_day = abs_max(feature_offset, 'DayOffset')
    max_abs_minute = abs_max(feature_offset, 'MinuteOffset')

    def rows_for_day(target_day):
        # Filter by day once so the minute lookups scan only that day's rows.
        return df[df['DayOfYear'] == target_day]

    def collect(day_rows, day_offset, minute_offset):
        # Tag the rows of day_rows at (minute + minute_offset) and store them.
        nonlocal total_rows, max_abs_day, max_abs_minute
        matched = day_rows[day_rows['MinutesSinceStartOfDay'] == minute + minute_offset]
        matched = matched.copy()  # never write the offset columns into a view of df
        matched['DayOffset'] = day_offset
        matched['MinuteOffset'] = minute_offset
        pieces.append(matched)
        if len(matched):
            total_rows += len(matched)
            if max_abs_day is None or abs(day_offset) > max_abs_day:
                max_abs_day = abs(day_offset)
            if max_abs_minute is None or abs(minute_offset) > max_abs_minute:
                max_abs_minute = abs(minute_offset)

    def satisfied():
        return bool(
            total_rows >= max_records
            and max_abs_day is not None and max_abs_day >= min_days
            and max_abs_minute is not None and max_abs_minute >= min_minute
        )

    # 1. Same day, same minute
    same_day = rows_for_day(day)
    collect(same_day, 0, 0)
    if satisfied():
        return pd.concat(pieces)

    # 2. Same day, up to 60 minutes before/after
    for offset in range(1, 61):
        collect(same_day, 0, -offset)
        collect(same_day, 0, offset)
        if satisfied():
            return pd.concat(pieces)

    # 3. Neighbouring days, nearest first
    for day_offset in range(1, 366):
        earlier_day = rows_for_day(day - day_offset)
        later_day = rows_for_day(day + day_offset)

        # 3a. Same minute on the previous / next day
        collect(earlier_day, -day_offset, 0)
        collect(later_day, day_offset, 0)
        if satisfied():
            return pd.concat(pieces)

        # 3b. Up to 60 minutes before/after on those days
        for offset in range(1, 61):
            collect(earlier_day, -day_offset, -offset)
            collect(earlier_day, -day_offset, offset)
            collect(later_day, day_offset, -offset)
            collect(later_day, day_offset, offset)
            if satisfied():
                return pd.concat(pieces)

    return pd.concat(pieces)

# For every query row, count how many distinct locations have readings
# near the requested day and minute.
number_of_nearby_locations = []
for i in range(len(question)):
    row = question.iloc[i]

    # Target day-of-year and minute-of-day for this query
    day, minute = row['DayOfYear'], row['MinutesSinceStartOfDay']

    # Each search starts from a fresh, empty frame so results do not
    # accumulate across query rows
    feature_offset = find_nearby_data(day, minute, df, pd.DataFrame())

    unique_locations = pd.unique(feature_offset['LocationCode'])
    number_of_nearby_locations.append(len(unique_locations))

# Show the per-query location counts
number_of_nearby_locations

[2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 3,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 4,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 3,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,


In [11]:
# Convert the per-query location counts into a NumPy array for numeric summaries
number_of_nearby_location_np = np.array(number_of_nearby_locations)

In [17]:
# Median number of distinct nearby locations across all query rows
np.median(number_of_nearby_location_np)

8.0

In [59]:
# data_array_np = df[['LocationCode', 'DayOfYear', 'MinutesSinceStartOfDay', 'Power(mW)']].to_numpy()
# condition = feature_offset[['LocationCode', 'DayOffset', 'MinuteOffset']].to_numpy()
# unique_day_minute_combinations = df[['DayOfYear', 'MinutesSinceStartOfDay']].drop_duplicates().to_numpy()

# condition_array = []
# for i in range(unique_day_minute_combinations.shape[0]):
#     condition_cp = condition.copy()
#     condition_cp[: , 1:3] += unique_day_minute_combinations[i]
#     condition_array.append(condition_cp)


In [None]:
# mask_array = [np.isin(data_array_np[:, :3], condition_array[i]).all(axis=1) for i in range(len(condition_array))]

In [None]:
# masked_result = [data_array_np[mask_array[i], 3] for i in range(len(mask_array))]

In [None]:

# def generate_training_data(df, feature_offset, max_records=500):
#     """
#     Generate training data by using the provided feature_offset as input.
#     - df: DataFrame, the full dataset
#     - feature_offset: DataFrame, specifying DayOffset, MinuteOffset, and LocationCode
#     - max_records: int, maximum number of records to include for each sample

#     Returns:
#     - train_set: DataFrame with generated features
#     """
#     train_set = pd.DataFrame()

#     for _, row in feature_offset.iterrows():
#         day = row['DayOffset'] + df['DayOfYear'].iloc[0]
#         minute = row['MinuteOffset'] + df['MinutesSinceStartOfDay'].iloc[0]
#         location_code = row['LocationCode']

#         # Filter matching rows in the main dataset
#         matching_rows = df[(df['DayOfYear'] == day) &
#                            (df['MinutesSinceStartOfDay'] == minute) &
#                            (df['LocationCode'] == location_code)]

#         if not matching_rows.empty:
#             # Add matching rows to the training set
#             train_set = pd.concat([train_set, matching_rows], ignore_index=True)

#         # Limit the size of the training set
#         if len(train_set) >= max_records:
#             break

#     return train_set

# # Generate training data
# train_set = generate_training_data(df, feature_offset)
# train_set

In [7]:
# # 初始化一个空的 DataFrame 来存储结果
# train_set = pd.DataFrame()

# # 遍历 feature_offset 的每一行
# for index, row in feature_offset.iterrows():
#     # 根据 DayOffset, MinuteOffset 和 LocationCode 从 df 中查找匹配的行
#     matching_rows = df[(df['DayOfYear'] == (day + row['DayOffset'])) &
#                        (df['MinutesSinceStartOfDay'] == (minute + row['MinuteOffset'])) &
#                        (df['LocationCode'] == row['LocationCode'])]
    
#     # 如果找到匹配的行，将其 Power(mW) 值添加到 train_set 中
#     if not matching_rows.empty:
#         train_set = pd.concat([train_set, matching_rows[['Power(mW)']]], ignore_index=True)

# # 将 train_set 的列名设置为 feature_offset 的行数
# train_set.columns = [f'Feature_{i}' for i in range(train_set.shape[1])]

# # 显示 train_set
# train_set

Unnamed: 0,Feature_0
0,7.08
1,7.04
2,14.3
3,7.13
4,6.51
5,16.34
6,7.25
7,6.01
8,17.92
9,7.06
