In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Import data

In [25]:
# Load the hourly readings of the eight monitoring stations from the data folder
df_Aoti = pd.read_csv('data/Aoti.csv')
df_Dingling = pd.read_csv('data/Dingling.csv')
df_Guanyuan = pd.read_csv('data/Guanyuan.csv')
df_Gucheng = pd.read_csv('data/Gucheng.csv')
df_Nongzhanguan = pd.read_csv('data/Nongzhanguan.csv')
df_Tiantan = pd.read_csv('data/Tiantan.csv')
df_Wanliu = pd.read_csv('data/Wanliu.csv')
df_Wanshouxinggong = pd.read_csv('data/Wanshouxinggong.csv')

# Preview the first 5 rows of every frame, in the same order they were loaded
for station_frame in (
    df_Aoti,
    df_Dingling,
    df_Guanyuan,
    df_Gucheng,
    df_Nongzhanguan,
    df_Tiantan,
    df_Wanliu,
    df_Wanshouxinggong,
):
    print(station_frame.head())



   No  year  month  day  hour  PM2.5  PM10   SO2   NO2     CO    O3  TEMP  \
0   1  2013      3    1     0    4.0   4.0   4.0   7.0  300.0  77.0  -0.7   
1   2  2013      3    1     1    8.0   8.0   4.0   7.0  300.0  77.0  -1.1   
2   3  2013      3    1     2    7.0   7.0   5.0  10.0  300.0  73.0  -1.1   
3   4  2013      3    1     3    6.0   6.0  11.0  11.0  300.0  72.0  -1.4   
4   5  2013      3    1     4    3.0   3.0  12.0  12.0  300.0  72.0  -2.0   

     PRES  DEWP  RAIN   wd  WSPM       station  
0  1023.0 -18.8   0.0  NNW   4.4  Aotizhongxin  
1  1023.2 -18.2   0.0    N   4.7  Aotizhongxin  
2  1023.5 -18.2   0.0  NNW   5.6  Aotizhongxin  
3  1024.5 -19.4   0.0   NW   3.1  Aotizhongxin  
4  1025.2 -19.5   0.0    N   2.0  Aotizhongxin  
   No  year  month  day  hour  PM2.5  PM10  SO2  NO2     CO    O3  TEMP  \
0   1  2013      3    1     0    4.0   4.0  3.0  NaN  200.0  82.0  -2.3   
1   2  2013      3    1     1    7.0   7.0  3.0  NaN  200.0  80.0  -2.5   
2   3  2013      3

In [26]:
# Gather the eight station DataFrames into a single list
df_list = [
    df_Aoti,
    df_Dingling,
    df_Guanyuan,
    df_Gucheng,
    df_Nongzhanguan,
    df_Tiantan,
    df_Wanliu,
    df_Wanshouxinggong,
]

# Print the item in the list


In [27]:



import math

def gaussian_distance(lat1, lon1, lat2, lon2):
    """
    Function:
    gaussian_distance(lat1, lon1, lat2, lon2)
    Description:
    Use the Haversine formula to calculate the great-circle distance between
    two points on the Earth's surface. Despite the name, nothing Gaussian is
    computed here: the result is a plain distance, which calculate_weight_list
    then feeds into a Gaussian weight.
    Input: lat1, lon1, lat2, lon2 - coordinates in decimal degrees
    Output: distance in kilometres (Earth radius taken as 6371 km)
    """

    R = 6371  # radius of Earth in km
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    delta_lat = lat2_rad - lat1_rad
    delta_lon = lon2_rad - lon1_rad

    a = math.sin(delta_lat / 2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(delta_lon / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (nearly) antipodal points; sqrt(1 - a) would then raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return R * c

def calculate_weight_list(target_station, station_list, sigma=1.0):
    """
    Function:

    calculate_weight_list(target_station, station_list, sigma=1.0)

    Description:
    Addressing spatial dependence using a Gaussian weight function: every
    station receives the weight exp(-d**2 / (2 * sigma**2)), where d is the
    great-circle distance (km) between that station and the target station.

    Input: target_station: (name, lat, lon, id) record of the target station;
                only the lat/lon at positions 1 and 2 are read
            station_list: iterable of (name, lat, lon, id) records for all stations
            sigma: standard deviation of the Gaussian kernel, in km. With the
                default of 1 km the weight collapses towards 0 for stations
                more than a few kilometres away, so pass a larger value when
                the stations are spread further apart.
    Output: weight_list: list of (station_id, station_name, weight) tuples,
            in the same order as station_list
    Raises: ValueError if sigma is not strictly positive
    """

    if sigma <= 0:
        raise ValueError(f"sigma must be positive, got {sigma!r}")

    target_lat, target_lon = target_station[1], target_station[2]
    weight_list = []

    for station_name, station_lat, station_lon, station_id in station_list:
        # great-circle (haversine) distance in km, not a Euclidean one
        distance = gaussian_distance(target_lat, target_lon, station_lat, station_lon)
        # calculate the weight based on the Gaussian function
        weight = math.exp(-distance**2 / (2 * sigma**2))
        weight_list.append((station_id, station_name, weight))

    return weight_list


In [28]:
# #Test the function
# target_station = (39.982, 116.397)
# station_list = [(39.982, 116.397), (39.983, 116.397), (39.981, 116.397), (39.982, 116.396), (39.982, 116.398)]
# weight_list = calculate_weight_list(target_station, station_list)
# print(weight_list)

# # Another testcase for the function
# target_station = (39.982, 116.397)
# station_list = [(39.982, 116.397), (39.983, 116.397), (39.989, 116.397), (37.982, 116.396), (39.982, 126.398)]
# weight_list = calculate_weight_list(target_station, station_list)
# print(weight_list)

In [29]:
# Location of each real monitoring station, keyed by station name.
# Every value is a (name, latitude, longitude, station_id) record; ids run
# from 1 to 8 in the order the stations are listed below.
# NOTE(review): the Wanshouxinggong coordinates lie well away from the other
# seven stations (its weight relative to Aoti comes out as 0.0) - confirm them.
station_location = {
    name: (name, lat, lon, station_id)
    for station_id, (name, lat, lon) in enumerate(
        [
            ("Wanshouxinggong", 40.6894930, 117.1580),
            ("Dingling", 40.2865, 116.434),
            ("Gucheng", 39.9136, 116.184),
            ("Wanliu", 39.9611, 116.2878),
            ("Aoti", 40.0031, 116.401),
            ("Nongzhanguan", 39.9716, 116.473),
            ("Guanyuan", 39.9425, 116.361),
            ("Tiantan", 39.8745, 116.434),
        ],
        start=1,
    )
}

In [30]:
def sort_by_weight(weight_list):
    """Return a new list of (station_id, station_name, weight) tuples, largest weight first."""
    ranked = list(weight_list)
    ranked.sort(key=lambda entry: entry[2], reverse=True)
    return ranked

# Rank all stations by Gaussian weight relative to each target station
_all_station_records = list(station_location.values())


def _ranked_weights_for(target_name):
    """Weights of every station relative to `target_name`, largest first."""
    target_record = station_location[target_name]
    return sort_by_weight(calculate_weight_list(target_record, _all_station_records))


weight_Dingling = _ranked_weights_for('Dingling')
weight_Guanyuan = _ranked_weights_for('Guanyuan')
weight_Nongzhanguan = _ranked_weights_for('Nongzhanguan')
weight_Wanliu = _ranked_weights_for('Wanliu')
weight_Wanshouxinggong = _ranked_weights_for('Wanshouxinggong')
weight_Aoti = _ranked_weights_for('Aoti')
weight_Gucheng = _ranked_weights_for('Gucheng')
weight_Tiantan = _ranked_weights_for('Tiantan')

# One column per target station; each cell is a (station_id, station_name, weight) tuple
weight_columns = {
    'Dingling': weight_Dingling,
    'Wanshouxinggong': weight_Wanshouxinggong,
    'Gucheng': weight_Gucheng,
    'Wanliu': weight_Wanliu,
    'Aoti': weight_Aoti,
    'Nongzhanguan': weight_Nongzhanguan,
    'Guanyuan': weight_Guanyuan,
    'Tiantan': weight_Tiantan,
}
weight_df = pd.DataFrame(weight_columns)

print(weight_df["Aoti"])


0                              (5, Aoti, 1.0)
1    (6, Nongzhanguan, 1.463036851314622e-11)
2        (7, Guanyuan, 4.142673330372584e-13)
3         (4, Wanliu, 1.1583060371836145e-25)
4         (8, Tiantan, 7.564469549484409e-47)
5        (3, Gucheng, 1.6292323680733375e-96)
6       (2, Dingling, 4.510073874439477e-218)
7                   (1, Wanshouxinggong, 0.0)
Name: Aoti, dtype: object


# Address spatial dependence

In [31]:
def filter_and_return_df(station_name, weight_df, threshold=1e-50):
    """
    Function: filter_and_return_df(station_name, weight_df, threshold=1e-50)

    Description:
    This function filters the stations by weight in the given station column and
    reads the corresponding csv file (./data/<station>.csv) for every station whose
    weight is strictly above the given threshold.
    It also adds a 'distance' column to each file's data, holding the distance in km
    between the target station and the station the rows were read from (0 for the
    target station's own rows).
    Then it concatenates all the read data into one DataFrame and returns it.

    Station coordinates are looked up in the module-level `station_location` dict.

    Input:
    station_name: the name of the station column to look at in weight_df
    weight_df: the DataFrame containing station weights; each cell is a
               (station_id, station_name, weight) tuple
    threshold: the weight threshold for filtering stations, default to 1e-50

    Output:
    df: the concatenated DataFrame containing data read from csv files of the filtered stations

    Raises:
    ValueError: if no station in the column has a weight above the threshold
    """

    # Get a list of tuples for the given station
    station_list = weight_df[station_name].tolist()

    # Filter the station_list based on the given threshold
    filtered_stations = [station for station in station_list if station[2] > threshold]
    if not filtered_stations:
        # pd.concat([]) would fail with an opaque "No objects to concatenate"
        raise ValueError(
            f"No station in column {station_name!r} has a weight above {threshold!r}"
        )

    # The target coordinates are the same for every neighbour, so look them up once
    target_record = station_location[station_name]
    target_lat, target_lon = target_record[1], target_record[2]

    # Read corresponding csv files for the filtered stations
    data_list = []
    for station in filtered_stations:
        neighbour_name = station[1]

        # Load the csv data
        data = pd.read_csv(f'./data/{neighbour_name}.csv')

        # Add the distance from the target station to this station as a constant column
        neighbour_record = station_location[neighbour_name]
        data['distance'] = gaussian_distance(
            target_lat, target_lon, neighbour_record[1], neighbour_record[2]
        )

        data_list.append(data)

    # NOTE(review): this function used to end with
    # `df[df['distance'] != station_name]`. Comparing the float 'distance'
    # column with a station name is always True, so that filter kept every
    # row. It is dropped here so the returned data stays exactly the same;
    # if the target station's own rows are meant to be excluded, filter on
    # the station instead (e.g. skip it in the loop above).
    return pd.concat(data_list, ignore_index=True)


In [32]:
import os

# Make sure the output directory exists. exist_ok=True replaces the separate
# exists() check, so there is no gap between checking and creating, and the
# call is a no-op when the directory is already there.
os.makedirs('preprocess_data', exist_ok=True)

# For each station
for station_name in station_location:
    # Build the combined frame for this station.
    # threshold=1e-100 is deliberately looser than the function default (1e-50),
    # so more distant stations are included.
    df = filter_and_return_df(station_name, weight_df, threshold=1e-100)

    # Save the df as a csv file in the 'preprocess_data' directory
    df.to_csv(f'preprocess_data/{station_name}.csv', index=False)

