In [2]:
import xarray as xr
from azure.storage.blob import BlobServiceClient
import zarr
import pandas as pd
from dotenv import load_dotenv
import os
from datetime import datetime, timedelta
import numpy as np

# Load variables from a .env file (if present) into the process environment,
# then read the Azure connection settings from it so no secret is hard-coded.
load_dotenv()
storage_account_name = os.environ.get('AZURE_STORAGE_ACCOUNT_NAME')
sas_token = os.environ.get('AZURE_STORAGE_SAS_TOKEN')
container_name = os.environ.get('CONTAINER_NAME')

azure_url = os.environ.get('AZURE_URL')

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Prefix prepended to the per-day CSV file names when they are read back.
file_path_pre = ""

#### 1.Helper Functions
1. to read data from azure blob:
    read_zarr_from_blob(account_name, container_name, blob_name, sas_token)
2. to generate a list of dates in yyyymmdd format, one per daily blob file:
    generateTimeStamp(year, start_month, end_month)


In [3]:
def read_zarr_from_blob(account_name, container_name, blob_name, sas_token) -> xr.Dataset:
    """
        Open a Zarr store kept in Azure Blob Storage as an xarray dataset.

        account_name : Azure storage account name (used to build the account URL)
        container_name: Azure container name
        blob_name: prefix of the Zarr store inside the container
        sas_token: SAS token passed as the credential
        return: the dataset returned by xr.open_zarr
    """
    account_url = f"https://{account_name}.blob.core.windows.net"
    service = BlobServiceClient(account_url=account_url, credential=sas_token)
    store = zarr.ABSStore(
        prefix=blob_name,
        client=service.get_container_client(container_name),
    )
    return xr.open_zarr(store)

def generateTimeStamp(year:int, start_month, end_month) -> list:
    """
        Build one "yyyymmdd" stamp for every calendar day of the given months.

        year: int(yyyy)
        start_month: int(m), first month to include (1-12)
        end_month: int(m), last month to include (1-12)
        return : list of str("yyyymmdd"), in chronological order; empty when
                 start_month > end_month
        raises : ValueError if a month is outside 1..12
    """
    year_number = int(year)
    stamps = []
    for month in range(start_month, end_month + 1):
        # Walk the real calendar so that month lengths (including the 31st and
        # leap-year February) and two-digit months are always correct.
        day = datetime(year_number, month, 1)
        while day.month == month:
            stamps.append(f"{year}{day.month:02d}{day.day:02d}")
            day += timedelta(days=1)
    return stamps

#### 2. Read the NLDN data

1. read the original NLDN data: 
    get_NLDN_lightning_Data(start_date, end_date)
2. add lightning_sum and lightning_mean (aggregated over latitude and longitude for each time step) to the 0.25-degree NLDN data:
    get_lightning_sum_per_day(start_date, end_date)

In [4]:
# get NLDN lightning data for CONUS
def get_NLDN_lightning_Data(start_date:str, end_date:str) -> xr.Dataset:
    """
        Open the NLDN lightning Zarr store and cut out a fixed latitude/longitude
        box for the requested time range.

        start_date: "yyyy-mm-dd"
        end_date: "yyyy-mm-dd"
        return : xarray dataset of NLDN limited to latitude slice(50, 25),
                 longitude slice(235, 295) and time slice(start_date, end_date)
    """
    blob_file_name = "data/lightning.zarr"
    nldn_ds = read_zarr_from_blob(storage_account_name, container_name, blob_file_name, sas_token)
    # NOTE(review): the latitude slice runs 50 -> 25, which presumably relies on
    # the latitude coordinate being stored in descending order — confirm.
    return nldn_ds.sel(
        latitude=slice(50, 25),
        longitude=slice(235, 295),
        time=slice(start_date, end_date),
    )

# add the lightning total and mean over the whole latitude/longitude box for the date range
def get_lightning_sum_per_day(start_date:str, end_date:str) -> xr.Dataset:
    """
        start_date: "yyyy-mm-dd"
        end_date: "yyyy-mm-dd"
        return nldn dataset with two extra variables, lightning_sum and
        lightning_mean, each reduced over the latitude and longitude dimensions
    """
    nldn = get_NLDN_lightning_Data(start_date, end_date)
    spatial_dims = ['latitude', 'longitude']
    # .compute() evaluates each reduction before it is stored on the dataset
    nldn['lightning_sum'] = nldn.lightning.sum(dim=spatial_dims).compute()
    nldn['lightning_mean'] = nldn.lightning.mean(dim=spatial_dims).compute()
    return nldn


#### 3. Read the SHSR data
1. read the original SHSR data, in 2 min time window, and 0.02 spatial degree:
    get_total_SHSR_ds(year:int, start_month:int, end_month:int)
2. resample the data into 1-Hr time window
    simplify_time_range_for_SHSR(ds:xr)
3. resample the data into 0.25 spatial degree by coarsen, modify the latitude/longitude round it to 0.25 degree with "linear":
    get_max_date_SHSR_in_hr_range_and_quarter_degree_range_result(ds:xr)

In [5]:
# get all the daily SHSR radar datasets for a month range, joined along time
def get_total_SHSR_ds(year:int, start_month:int, end_month:int) -> xr.Dataset:
    """
        year: int(yyyy)
        start_month: int(m)
        end_month: int(m)
        return : an xarray dataset of SHSR data from start_month to end_month in the same year,
                 built by concatenating the per-day "data/SHSR/<yyyymmdd>.zarr" stores along "time"
    """
    daily_datasets = [
        read_zarr_from_blob(
            storage_account_name, container_name, "data/SHSR/" + stamp + ".zarr", sas_token
        )
        for stamp in generateTimeStamp(year, start_month, end_month)
    ]
    return xr.concat(daily_datasets, dim="time")

def simplify_time_range_for_SHSR(ds: xr.Dataset) -> xr.Dataset:
    """
        Aggregate the merged SHSR dataset into 1-hour windows.

        Values where SHSR is not > 0 are masked out first, so every statistic
        below is computed from positive SHSR readings only. The result holds:
        SHSR_mean, SHSR_std, SHSR_ct (count), SHSR_diff (max - min) and the
        counts of readings above 30, 40 and 50 (SHSR_above_<n>_ct).
        An hourly sum is not produced.
    """
    positive = ds.where(ds["SHSR"] > 0)

    def hourly(dataset):
        # a fresh 1-hour resampler for each aggregation
        return dataset.resample(time="1H")

    pieces = [
        hourly(positive).mean().rename({"SHSR": "SHSR_mean"}),
        hourly(positive).std().rename({"SHSR": "SHSR_std"}),
        hourly(positive).count().rename({"SHSR": "SHSR_ct"}),
        (hourly(positive).max() - hourly(positive).min()).rename({"SHSR": "SHSR_diff"}),
    ]

    for threshold in (30, 40, 50):
        above = positive.where(positive["SHSR"] > threshold)
        pieces.append(
            hourly(above).count().rename({"SHSR": f"SHSR_above_{threshold}_ct"})
        )

    return xr.merge(pieces)


def get_total_SHSR_ds_in_hr_range(year:int, start_month:int, end_month:int) -> xr.Dataset:
    """
        Intermediate stage: load the SHSR data for the month range and
        reduce it to the hourly statistics.
    """
    return simplify_time_range_for_SHSR(
        get_total_SHSR_ds(year, start_month, end_month)
    )

def get_max_date_SHSR_in_hr_range_and_quarter_degree_range(year:int, start_month:int, end_month:int, expend_ratio:int):
    """
        Spatially aggregate the hourly SHSR statistics onto a coarser grid.

        Every block of expend_ratio x expend_ratio cells (latitude x longitude)
        is reduced with mean, sum, std, min and max; incomplete blocks at the
        edges are dropped (boundary='trim'). Each output variable is the source
        variable name prefixed with avg_/sum_/std_/min_/max_.

        expend_ratio: cells per coarse block along each axis (13 in this notebook)
    """
    hourly_ds = get_total_SHSR_ds_in_hr_range(year, start_month, end_month)
    window = {'latitude': expend_ratio, 'longitude': expend_ratio}

    reductions = (
        ("avg", "mean"),
        ("sum", "sum"),
        ("std", "std"),
        ("min", "min"),
        ("max", "max"),
    )
    reduced = []
    for prefix, reducer in reductions:
        coarse = getattr(hourly_ds.coarsen(dim=window, boundary='trim'), reducer)()
        renamed = {var_name: f"{prefix}_{var_name}" for var_name in coarse.data_vars}
        reduced.append(coarse.rename(renamed))

    return xr.merge(reduced)

def get_max_date_SHSR_in_hr_range_and_quarter_degree_range_result(ds_interp):
    """
        Move the dataset onto coordinates rounded to the nearest 0.25 degree.

        The data is linearly interpolated at the rounded latitude/longitude
        values, and the coordinates of the result are then rounded to 0.25 again.
    """
    def snap(values):
        # round to the nearest multiple of 0.25
        return np.around(values / 0.25) * 0.25

    lon_targets = snap(ds_interp["longitude"])
    lat_targets = snap(ds_interp["latitude"])

    snapped = ds_interp.interp(
        latitude=lat_targets, longitude=lon_targets, method="linear"
    )

    snapped["longitude"] = snap(snapped["longitude"])
    snapped["latitude"] = snap(snapped["latitude"])

    return snapped


#### 4. Merge the NLDN and SHSR data, prepare for training set with 3 different rules:
1. data with positive lightning event and positive SHSR value in 0.25 degree & 1hr time window:
    getLightning_positive_lightning_shsr_positive(df)
2. data with positive SHSR value but non lightning event in 0.25 degree & 1hr time window:
    getLightning_positive_shsr_0_lightning(df)
3. data with positive lightning event and no SHSR value in 0.25 degree & 1hr time window (this helper is currently commented out below):
    getLightning_positive_df_0_shsr(df)

In [7]:
def _month_range_bounds(year: int, start_month: int, end_month: int):
    """Return ("yyyy-mm-dd", "yyyy-mm-dd") for the first day of start_month and the last day of end_month."""
    first_day = datetime(year, start_month, 1)
    # The last day of end_month is one day before the first day of the following month.
    if end_month == 12:
        next_month_start = datetime(year + 1, 1, 1)
    else:
        next_month_start = datetime(year, end_month + 1, 1)
    last_day = next_month_start - timedelta(days=1)
    return first_day.strftime('%Y-%m-%d'), last_day.strftime('%Y-%m-%d')


def merge_nldn_and_SHSR_ds(year:int, start_month:int, end_month:int, expend_ratio:int):
    """
        Merge the coarsened hourly SHSR data with the NLDN lightning data.

        Only consider the NLDN and SHSR data has intersection on the same location:
        the SHSR data is restricted to the latitude/longitude values that also
        occur in the NLDN data before the two datasets are merged.

        year: int(yyyy)
        start_month: int(m), first month of the range
        end_month: int(m), last month of the range (its last day is included)
        expend_ratio: SHSR cells per coarse block along each axis
        return: the dataset produced by xr.merge([filtered SHSR, NLDN])
    """
    # NLDN must span the first day of start_month through the last day of
    # end_month, i.e. the same months the SHSR data is loaded for.
    start_date, end_date = _month_range_bounds(year, start_month, end_month)

    ds_shsr = get_max_date_SHSR_in_hr_range_and_quarter_degree_range(year, start_month, end_month, expend_ratio)
    ds_shsr_hr_large = get_max_date_SHSR_in_hr_range_and_quarter_degree_range_result(ds_shsr)
    nldn_ds = get_lightning_sum_per_day(start_date, end_date)

    # Rounding coordinates to 0.25 can create repeated values; drop them first.
    shsr_ds_unique = ds_shsr_hr_large.drop_duplicates(dim=["latitude", "longitude", 'time'])
    common_latitudes = xr.DataArray(list(set(nldn_ds.latitude.values) & set(shsr_ds_unique.latitude.values)), dims=["latitude"])
    common_longitudes = xr.DataArray(list(set(nldn_ds.longitude.values) & set(shsr_ds_unique.longitude.values)), dims=["longitude"])
    shsr_filtered = shsr_ds_unique.sel(latitude=common_latitudes, longitude=common_longitudes, method='nearest')
    merged_ds = xr.merge([shsr_filtered, nldn_ds])
    return merged_ds
    
    
def getLightning_positive_lightning_shsr_positive(df):
    """Return the rows of df where both 'lightning' and 'avg_SHSR_mean' are greater than 0."""
    has_lightning = df['lightning'] > 0
    has_shsr = df["avg_SHSR_mean"] > 0
    return df[has_lightning & has_shsr]

# def getLightning_positive_df_0_shsr(df):
#     df_lightning_positive = df[df['lightning'] > 0]
#     df_lightning_positive['avg_SHSR_mean'] = df_lightning_positive['avg_SHSR_mean'].fillna(value=0) 
#     df_return = df_lightning_positive.dropna(subset=['avg_SHSR_mean'])
#     return df_return    

def getLightning_positive_shsr_0_lightning(df):
    """
        Return the rows of df with a positive 'avg_SHSR_mean', with missing
        'lightning' values replaced by 0.

        The input frame is left untouched: the filtered rows are copied before
        the 'lightning' column is rewritten, so no assignment is made on a slice.
    """
    shsr_positive = df[df['avg_SHSR_mean'] > 0].copy()
    # After fillna(0) no missing 'lightning' value remains, so no dropna is needed.
    shsr_positive['lightning'] = shsr_positive['lightning'].fillna(value=0)
    return shsr_positive

#### 5. Merge the NLDN data and SHSR data together, generate a dataframe, and save each day's information into a csv file


In [28]:
def generate_date_between(start, end):
    """
        List every date from start to end, both included.

        start: "yyyy-mm-dd"
        end: "yyyy-mm-dd"
        return: list of "yyyy-mm-dd" strings (empty when end is before start)
    """
    fmt = '%Y-%m-%d'
    first = datetime.strptime(start, fmt)
    span_days = (datetime.strptime(end, fmt) - first).days
    dates = []
    for offset in range(span_days + 1):
        dates.append((first + timedelta(days=offset)).strftime(fmt))
    return dates

def save_date_csv(single_date, merged_ds):
    """
        Write the positive samples (lightning > 0 and avg_SHSR_mean > 0) of one
        day to 'total_merged_df_positive_<date>.csv' in the working directory.

        single_date: "yyyy-mm-dd"
        merged_ds: merged SHSR + NLDN dataset with a time dimension
    """
    day_ds = merged_ds.sel(time=slice(single_date, single_date))
    day_ds.attrs['date'] = single_date
    day_df = day_ds.load().to_dataframe()
    positives = getLightning_positive_lightning_shsr_positive(day_df)
    positives.to_csv('total_merged_df_positive_' + single_date + '.csv')
    print(single_date, "is completed")
    

def save_total_csv(start_date:str, end_date:str, merged_ds):
    """
        Export one positive-sample CSV per day, for every day from
        start_date to end_date (both included).

        start_date: "yyyy-mm-dd"
        end_date: "yyyy-mm-dd"
    """
    for day in generate_date_between(start_date, end_date):
        save_date_csv(day, merged_ds)
    


def read_csv_to_df(start_date:str, end_date:str, file_path_pre):
    """
        Read the per-day positive-sample CSV files back into one dataframe.

        start_date: "yyyy-mm-dd"
        end_date: "yyyy-mm-dd"
        file_path_pre: prefix put in front of each file name
        return: the concatenated dataframe, with 'time' parsed as datetime
    """
    frames = []
    for day in generate_date_between(start_date, end_date):
        csv_path = file_path_pre + "total_merged_df_positive_" + day + ".csv"
        print(csv_path)
        frames.append(pd.read_csv(csv_path))
    combined = pd.concat(frames, axis=0)
    combined['time'] = pd.to_datetime(combined['time'])
    return combined

def save_date_csv_negative(single_date, merged_ds):
    """
        Write the negative samples of one day (avg_SHSR_mean > 0, missing
        lightning replaced by 0) to 'negative_total_merged_df_<date>.csv'
        in the working directory.

        single_date: "yyyy-mm-dd"
        merged_ds: merged SHSR + NLDN dataset with a time dimension
    """
    day_ds = merged_ds.sel(time=slice(single_date, single_date))
    day_ds.attrs['date'] = single_date
    day_df = day_ds.load().to_dataframe()
    negatives = getLightning_positive_shsr_0_lightning(day_df)
    negatives.to_csv('negative_total_merged_df_' + single_date + '.csv')
    print(single_date, "is completed")

def save_total_csv_negative(start_date:str, end_date:str, merged_ds):
    """
        Export one negative-sample CSV per day, for every day from
        start_date to end_date (both included).

        start_date: "yyyy-mm-dd"
        end_date: "yyyy-mm-dd"
    """
    for day in generate_date_between(start_date, end_date):
        save_date_csv_negative(day, merged_ds)

def read_csv_to_df_negative(start_date:str, end_date:str, file_path_pre):
    """
        Read the per-day negative-sample CSV files back into one dataframe.

        start_date: "yyyy-mm-dd"
        end_date: "yyyy-mm-dd"
        file_path_pre: prefix put in front of each file name
        return: the concatenated dataframe, with 'time' parsed as datetime
    """
    frames = []
    for day in generate_date_between(start_date, end_date):
        csv_path = file_path_pre + "negative_total_merged_df_" + day + ".csv"
        frames.append(pd.read_csv(csv_path))
    combined = pd.concat(frames, axis=0)
    combined['time'] = pd.to_datetime(combined['time'])
    return combined

        

In [None]:
# Get data for 2021-07 to 2021-08: build the merged SHSR + NLDN dataset
# (13 is the coarsening ratio), then write the per-day positive and negative CSVs.
# NOTE(review): the export range ends on 08-30, so no CSV is written for
# August 31 — confirm this is intended.
merged_ds_2021_7_8 = merge_nldn_and_SHSR_ds(2021, 7, 8, 13)
save_total_csv("2021-07-01",'2021-08-30',merged_ds_2021_7_8)
save_total_csv_negative("2021-07-01",'2021-08-30',merged_ds_2021_7_8)

# Get data for 2022-07 to 2022-08 (same steps as above)
merged_ds_2022_7_8 = merge_nldn_and_SHSR_ds(2022, 7, 8, 13)
save_total_csv("2022-07-01",'2022-08-30',merged_ds_2022_7_8)
save_total_csv_negative("2022-07-01",'2022-08-30',merged_ds_2022_7_8)

#### 6. Read the weather data and merged with NLDN+SHSR data

In [37]:

# get one year of a single ERA5 weather variable
def get_weather_at_year(weatherType:str, year:int) -> "xr.Dataset":
    """
        Open the yearly ERA5 Zarr store for one weather variable.

        weatherType: str, case-insensitive, one of
            "cloud", "temperature", "dewpoint", "sea_pressure", "wind_u", "wind_v"
        year: int(yyyy)
        return: the dataset stored at "data/ERA5/<variable name>_<year>.zarr"
        raises: ValueError if weatherType is not one of the supported names
    """
    file_prefixes = {
        "cloud": "total_cloud_cover_",
        "temperature": "2m_temperature_",
        "dewpoint": "2m_dewpoint_temperature_",
        "sea_pressure": "mean_sea_level_pressure_",
        "wind_u": "10m_u_component_of_wind_",
        "wind_v": "10m_v_component_of_wind_",
    }
    key = weatherType.lower()
    if key not in file_prefixes:
        # Fail with a clear message instead of hitting an unbound local variable.
        raise ValueError(
            f"Unknown weatherType {weatherType!r}; expected one of {sorted(file_prefixes)}"
        )

    blob_file_name = "data/ERA5/" + file_prefixes[key] + str(year) + ".zarr"
    return read_zarr_from_blob(
        storage_account_name, container_name, blob_file_name, sas_token
    )

def get_weather_ds_in_daterange(start_date:str, end_date:str, year:int) -> xr.Dataset:
    """
        Load five ERA5 variables for the given year, cut each to the date
        range, and merge them into one dataset.

        start_date: "yyyy-mm-dd"
        end_date: "yyyy-mm-dd"
        year: int(yyyy), the year whose yearly stores are opened
    """
    # cloud cover (tcc), wind v (v10), wind u (u10), temperature (t2m), dewpoint (d2m)
    weather_types = ("cloud", "wind_v", "wind_u", "temperature", "dewpoint")
    yearly = [get_weather_at_year(kind, year) for kind in weather_types]
    window = slice(start_date, end_date)
    return xr.merge([ds.sel(time=window) for ds in yearly])

def get_weather_df(ds: xr.Dataset) -> pd.DataFrame:
    """
        Flatten the weather dataset into a dataframe holding the columns
        tcc, valid_time, v10, u10, t2m and d2m, with the former index levels
        turned into ordinary columns and duplicate rows removed.
    """
    wanted = ['tcc', 'valid_time', 'v10', 'u10', 't2m', 'd2m']
    return ds.to_dataframe()[wanted].reset_index().drop_duplicates()




#### 7. adding more features, previous 4 hours' info, and future 4 hours' info, also static info

In [23]:
def calculate_geographic_local_time(longitude, utc_datetime):
    """
        Shift a UTC datetime by the longitude-based offset of one hour per
        15 degrees (longitude / 15 hours; negative longitudes shift backwards).
    """
    hours_from_utc = longitude / 15
    return utc_datetime + timedelta(hours=hours_from_utc)

def floor_to_nearest_hour(dt):
    """Truncate dt to the start of its hour (minutes, seconds and microseconds set to 0)."""
    truncated = dt.replace(microsecond=0, second=0, minute=0)
    return truncated

def get_season(latitude, local_time):
    """
        Name the meteorological season for a latitude and a datetime.

        Months 3-5, 6-8, 9-11 and 12-2 map to Spring, Summer, Autumn and Winter
        when latitude >= 0, and to the opposite season otherwise.
    """
    northern = ('Winter', 'Spring', 'Summer', 'Autumn')
    # 12, 1, 2 -> 0; 3-5 -> 1; 6-8 -> 2; 9-11 -> 3
    quarter = (local_time.month % 12) // 3
    if latitude >= 0:
        return northern[quarter]
    # Southern Hemisphere: seasons are half a year out of phase
    return northern[(quarter + 2) % 4]
        
def add_static_var(df):
    """
        Add time-of-day and season features derived from the index to df.

        df must be indexed by (time, latitude, longitude). The frame is
        modified in place (new columns, then sorted by local_time) and also
        returned. Columns added: local_time (floored to the hour),
        day_of_year, hour_of_day, season, isSummer, isSpring, isWinter, isAutumn.
    """
    local_times = []
    local_seasons = []
    # Only the index is needed, so iterate over it directly instead of iterrows().
    for time, latitude, longitude in df.index:
        # Bring longitudes given in 0..360 into -180..180 before computing the offset.
        if longitude > 180: longitude -= 360
        # NOTE(review): latitudes above 90 are shifted by 180 — confirm which
        # coordinate convention this is meant to handle.
        if latitude > 90: latitude -= 180
        local_time = calculate_geographic_local_time(longitude, time)
        local_times.append(local_time)
        # The season is a local feature, so derive it from the local time
        # rather than from the UTC timestamp.
        local_seasons.append(get_season(latitude, local_time))

    # 1. get local time
    df['local_time'] = local_times
    df['local_time'] = df['local_time'].apply(floor_to_nearest_hour)
    # 2. add day of year
    df['day_of_year'] = df['local_time'].dt.dayofyear
    # 3. add time of day
    df['hour_of_day'] = df['local_time'].apply(lambda x: x.hour)
    # 4. add local season and its one-hot flags
    df['season'] = local_seasons
    for season in ('Summer', 'Spring', 'Winter', 'Autumn'):
        df[f'is{season}'] = df['season'] == season
    df.sort_values(by='local_time', inplace=True)

    return df


## deal with weather data
def add_weather_feature(df):
    """
        Add wind speed and row-shifted weather aggregates to df (in place) and return it.

        'ws' is sqrt(u10**2 + v10**2). For each of tcc, d2m, t2m and ws the
        function adds the sum and std of the previous 4 rows and the sums of
        the next 2 and next 4 rows.

        NOTE(review): "previous"/"after" are taken by row position over the
        whole frame, not per location — confirm the frame is ordered so that
        neighbouring rows are consecutive hours of the same grid cell.
    """
    def lead_sum(series, hours):
        # series.shift(-1) + series.shift(-2) + ... + series.shift(-hours)
        total = series.shift(-1)
        for step in range(2, hours + 1):
            total = total + series.shift(-step)
        return total

    df['ws'] = np.sqrt(df['u10']**2 + df['v10']**2)
    for name in ('tcc', 'd2m', 't2m', 'ws'):
        # prev 4 hr
        trailing = df[name].shift(1).rolling(window=4, min_periods=1)
        df[f'{name}_prev_4hr_sum'] = trailing.sum()
        df[f'{name}_prev_4hr_std'] = trailing.std()
        # after 2hr
        df[f'{name}_after_2hr_sum'] = lead_sum(df[name], 2)
        # after 4hr
        df[f'{name}_after_4hr_sum'] = lead_sum(df[name], 4)
    return df


def add_SHSR_feature(df, variables, total_variables):
    """
        Add row-shifted SHSR / lightning features to df and return a NaN-free copy.

        df: dataframe holding every column named in variables and total_variables;
            the new columns are added to it in place
        variables: list of column names that get the aggregate features
        total_variables: list of column names that get the aggregate features
            and, in addition, the individual lag / lead columns
        return: a new dataframe equal to df with every missing value replaced by 0

        NOTE(review): shifts are taken by row position over the whole frame,
        not per location — confirm the row order makes that meaningful.
        NOTE(review): 'lightning' is skipped for the '_at+Nhr' columns but
        still receives '_after_2hr_sum' / '_after_4hr_sum' — confirm this is intended.
    """
    def lead_sum(series, hours):
        # series.shift(-1) + series.shift(-2) + ... + series.shift(-hours)
        total = series.shift(-1)
        for step in range(2, hours + 1):
            total = total + series.shift(-step)
        return total

    # A column may be listed in both inputs; compute its aggregates only once
    # while keeping the first-seen order (and therefore the column order).
    feature_names = list(dict.fromkeys(variables + total_variables))
    for name in feature_names:
        # prev 4 hr
        df[f'{name}_prev_4hr_sum'] = df[name].shift(1).rolling(window=4, min_periods=1).sum()
        # after 2hr
        df[f'{name}_after_2hr_sum'] = lead_sum(df[name], 2)
        # after 4hr
        df[f'{name}_after_4hr_sum'] = lead_sum(df[name], 4)

    for name in total_variables:
        # one column per lag, from 1 hour before to 4 hours before
        for i in range(1, 5):
            df[f'{name}_at-{i}hr'] = df[name].shift(i)

        # one column per lead, from 1 hour after to 3 hours after (not for lightning)
        if name == "lightning":
            continue
        for i in range(1, 4):
            df[f'{name}_at+{i}hr'] = df[name].shift(-i)

    return df.fillna(0)




In [None]:
def get_final_dataframe(year:int,start_date:str,end_date:str,isPositive:bool,file_path_pre:str):
    """
        Join the saved SHSR + NLDN samples with the weather data.

        year: int(yyyy), year of the weather stores to open
        start_date: "yyyy-mm-dd"
        end_date: "yyyy-mm-dd"
        isPositive: read the positive-sample CSVs when true, the negative ones otherwise
        file_path_pre: prefix put in front of each CSV file name
        return: inner join of both frames on (time, latitude, longitude)
    """
    reader = read_csv_to_df if isPositive else read_csv_to_df_negative
    shsr_nldn_df = reader(start_date, end_date, file_path_pre)
    weather_df = get_weather_df(get_weather_ds_in_daterange(start_date, end_date, year))

    keys = ['time', 'latitude', 'longitude']
    return pd.merge(
        shsr_nldn_df.set_index(keys),
        weather_df.set_index(keys),
        left_index=True,
        right_index=True,
        how='inner',
    )

        
def save_modeling_dataframe_to_csv(pre_df, isPositve, year):
    """
        Add every engineered feature to pre_df and write the result to
        '<year>_positive_total.csv' or '<year>_negative_total.csv'.

        pre_df: joined SHSR + NLDN + weather dataframe indexed by
                (time, latitude, longitude); it is modified in place by the
                feature helpers
        isPositve: selects the "positive" file name when true, "negative" otherwise
        year: value used as the file-name prefix
    """
    # every SHSR-derived column gets the aggregate features
    variables = [col for col in pre_df.columns if 'SHSR' in col]
    # these columns additionally get the individual lag / lead features
    total_variables = [
        'avg_SHSR_mean', 'avg_SHSR_std',
        'avg_SHSR_ct', 'avg_SHSR_diff', 'avg_SHSR_above_30_ct',
        'avg_SHSR_above_40_ct', 'avg_SHSR_above_50_ct',
        "max_SHSR_mean", "max_SHSR_ct", "max_SHSR_diff", "min_SHSR_mean", 'lightning'
    ]

    model = add_static_var(pre_df)
    model = add_weather_feature(model)
    model = add_SHSR_feature(model, variables, total_variables)
    # `inplace` must be a bool; passing the frame itself makes pandas raise,
    # so rebind the result instead.
    model = model.dropna()

    label = "positive" if isPositve else "negative"
    model.to_csv(str(year) + "_" + label + "_total.csv")

In [None]:
# create dataframe before adding features in different time-windows:
# one joined (SHSR + NLDN + weather) frame per year and per sample kind,
# read from the per-day CSV files found under file_path_pre
model_df_2021_positive = get_final_dataframe(2021,"2021-07-01", "2021-08-30",True,file_path_pre)
model_df_2022_positve = get_final_dataframe(2022,"2022-07-01", "2022-08-30",True,file_path_pre)
model_df_2021_negative = get_final_dataframe(2021,"2021-07-01", "2021-08-30",False,file_path_pre)
model_df_2022_negative  = get_final_dataframe(2022,"2022-07-01", "2022-08-30",False,file_path_pre)

# save the final modeling dataframe with 305 columns
# (written as <year>_positive_total.csv / <year>_negative_total.csv)
save_modeling_dataframe_to_csv(model_df_2021_positive, True, 2021)
save_modeling_dataframe_to_csv(model_df_2022_positve, True, 2022)
save_modeling_dataframe_to_csv(model_df_2021_negative, False, 2021)
save_modeling_dataframe_to_csv(model_df_2022_negative, False, 2022)