In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the three processed inputs, in order: aggregated HARP records, the
# X-ray flux table, and the flare list.
aggregated_harp_data, flux_data, flare_data = (
    pd.read_parquet(path)
    for path in (
        "../harp_data/data/processed/aggregated_high-qual_near-center-70.parquet",
        "../xray_fluxes/data/processed/1m_data.parquet",
        "../flare_data/flare_data.parquet",
    )
)

In [48]:
# Report (rows, columns) for each input frame, one per line, in load order.
for frame in (aggregated_harp_data, flux_data, flare_data):
    print(frame.shape)

(571216, 23)
(13847040, 10)
(31281, 17)


In [49]:
# Preview the first five flare records to check the column layout.
flare_data.head(n=5)

Unnamed: 0,start time,end time,peak time,flare_class,peak_intensity,noaa_ar_5min,noaa_ar_5s,hg1,hg2,car1,car2,rtheta1,rtheta2,xy1,xy2,solar_p_angle,solar_radius
0,2010-01-01 06:02:00+00:00,2010-01-01 06:13:00+00:00,2010-01-01 06:09:00+00:00,B,1.1e-07,,,,,,,,,,,,
1,2010-01-01 12:00:00+00:00,2010-01-01 12:19:00+00:00,2010-01-01 12:09:00+00:00,B,2.7e-07,0.0,0.0,,,,,,,,,,
2,2010-01-01 12:27:00+00:00,2010-01-01 13:09:00+00:00,2010-01-01 12:43:00+00:00,B,3.3e-07,0.0,0.0,,,,,,,,,,
3,2010-01-01 15:58:00+00:00,2010-01-01 16:31:00+00:00,2010-01-01 16:20:00+00:00,B,2.5e-07,,,,,,,,,,,,
4,2010-01-01 18:20:00+00:00,2010-01-01 18:31:00+00:00,2010-01-01 18:27:00+00:00,B,1.3e-07,,,,,,,,,,,,


In [5]:
# Show the dtypes of the two timestamp columns, HARP "T_REC" first and then
# flux "time", one per line.
print(aggregated_harp_data["T_REC"].dtype, flux_data["time"].dtype, sep="\n")

datetime64[ns, UTC]
datetime64[us, UTC]


In [6]:
# Preview the first five aggregated HARP records.
aggregated_harp_data.head(n=5)

Unnamed: 0,T_REC,USFLUX,MEANGAM,MEANGBT,MEANGBZ,MEANGBH,MEANJZD,TOTUSJZ,MEANALP,MEANJZH,...,MEANPOT,TOTPOT,MEANSHR,SHRGT45,NPIX,SIZE,AREA,NACR,SIZE_ACR,AREA_ACR
0,2010-05-01 00:00:00+00:00,,,,,,,,,,...,,,,,,,,,,
1,2010-05-01 00:12:00+00:00,1.439345e+21,32.159885,137.783195,137.832428,59.383249,0.528793,2065877000000.0,0.001307,-0.000677,...,2969.938683,9.237042e+21,24.980931,7.166919,21385.121136,1908.233367,1124.079598,1636.651525,146.041405,86.29848
2,2010-05-01 00:24:00+00:00,1.488903e+21,32.006564,138.032814,137.932202,58.588763,0.355526,2096785000000.0,0.001291,-0.000588,...,2818.961328,9.199543e+21,24.549219,6.112795,22631.582458,2019.510773,1188.056979,1754.132156,156.528544,92.311965
3,2010-05-01 00:36:00+00:00,1.495132e+21,32.489844,136.413847,136.048735,59.049505,0.317466,2190448000000.0,-0.001002,-0.001128,...,2872.564258,9.535298e+21,25.10304,7.290995,23156.134658,2066.389478,1214.339188,1786.976723,159.464861,93.905701
4,2010-05-01 00:48:00+00:00,1.566065e+21,31.843583,136.656334,136.326496,58.237536,0.342064,2305668000000.0,-0.000914,-0.000872,...,2840.389563,9.85197e+21,24.838363,6.565928,24186.736691,2158.422429,1267.802766,1847.651398,164.884269,97.054212


In [7]:
# Preview the first five flux samples.
flux_data.head(n=5)

Unnamed: 0,time,satellite_secondary,satellite_primary,flux_secondary,flux_primary,good_data_secondary,good_data_primary,satellite,flux,good_data
0,1998-07-27 00:00:00+00:00,10,,3.042857e-07,,True,,10,3.042857e-07,True
1,1998-07-27 00:01:00+00:00,10,,3e-07,,True,,10,3e-07,True
2,1998-07-27 00:02:00+00:00,10,,2.971429e-07,,True,,10,2.971429e-07,True
3,1998-07-27 00:03:00+00:00,10,,2.985715e-07,,True,,10,2.985715e-07,True
4,1998-07-27 00:04:00+00:00,10,,2.971429e-07,,True,,10,2.971429e-07,True


In [8]:
# Take the first HARP record time as a sample window start and print it next
# to the instant 24 hours later.
# .iloc[0] selects by position; the previous ['T_REC'][0] was a label lookup
# and only works while the frame's index contains the label 0.
t = aggregated_harp_data["T_REC"].iloc[0]
print((t, t + pd.Timedelta(hours=24)))

(Timestamp('2010-05-01 00:00:00+0000', tz='UTC'), Timestamp('2010-05-02 00:00:00+0000', tz='UTC'))


In [9]:
# Prototype for one window starting at `t`: record the largest flux sample and
# the strongest flare (by peak intensity) whose peak time falls in the window.
max_fluxes, max_flare_classes, max_peak_intensities = [], [], []

# The window is inclusive on both ends and spans t .. t + 23h59m.
window_start = t
window_end = window_start + pd.Timedelta(hours=23, minutes=59)

# Largest flux sample whose timestamp lies inside the window.
flux_times = flux_data["time"]
is_in_window = (flux_times >= window_start) & (flux_times <= window_end)
max_flux = flux_data.loc[is_in_window, "flux"].max()
max_fluxes.append(max_flux)

# Strongest flare peaking inside the window, if any flare peaked there.
peak_times = flare_data["peak time"]
is_in_window = (peak_times >= window_start) & (peak_times <= window_end)
if not is_in_window.any():
    # No flare in this window: record missing values for both labels.
    max_flare_class, max_peak_intensity = pd.NA, np.nan
else:
    flares_in_window = flare_data.loc[is_in_window, ["flare_class", "peak_intensity"]]
    # Index label of the row with the highest peak intensity in the window.
    i = flares_in_window["peak_intensity"].idxmax()
    max_flare_class = flares_in_window.loc[i, "flare_class"]
    max_peak_intensity = flares_in_window.loc[i, "peak_intensity"]
max_flare_classes.append(max_flare_class)
max_peak_intensities.append(max_peak_intensity)

for labels in (max_fluxes, max_flare_classes, max_peak_intensities):
    print(labels)

[8.111459464998916e-06]
['C']
[8.099999999999999e-06]


In [10]:
# Attach the flux sample taken at each HARP record time. The inner join keeps
# only HARP records that have a flux row with time equal to T_REC.
combined_data = pd.merge(
    aggregated_harp_data,
    flux_data[["time", "flux"]],
    how="inner",
    left_on="T_REC",
    right_on="time",
).drop(columns="time")

# Keep only records whose window (T_REC .. T_REC + 23h59m) ends no later than
# the latest flux sample. .max() is used instead of .iloc[-1] so the cutoff
# does not depend on flux_data being sorted by time.
last_flux_time = flux_data["time"].max()
should_keep = combined_data["T_REC"] + pd.Timedelta(hours=23, minutes=59) <= last_flux_time

# Take an explicit copy: new columns are assigned to this frame later, and
# assigning onto a boolean-filtered slice can trigger SettingWithCopyWarning.
combined_data = combined_data.loc[should_keep].copy()
combined_data

Unnamed: 0,T_REC,USFLUX,MEANGAM,MEANGBT,MEANGBZ,MEANGBH,MEANJZD,TOTUSJZ,MEANALP,MEANJZH,...,TOTPOT,MEANSHR,SHRGT45,NPIX,SIZE,AREA,NACR,SIZE_ACR,AREA_ACR,flux
0,2010-05-01 00:00:00+00:00,,,,,,,,,,...,,,,,,,,,,5.615019e-08
1,2010-05-01 00:12:00+00:00,1.439345e+21,32.159885,137.783195,137.832428,59.383249,0.528793,2.065877e+12,0.001307,-0.000677,...,9.237042e+21,24.980931,7.166919,21385.121136,1908.233367,1124.079598,1636.651525,146.041405,86.298480,6.331756e-08
2,2010-05-01 00:24:00+00:00,1.488903e+21,32.006564,138.032814,137.932202,58.588763,0.355526,2.096785e+12,0.001291,-0.000588,...,9.199543e+21,24.549219,6.112795,22631.582458,2019.510773,1188.056979,1754.132156,156.528544,92.311965,6.583243e-08
3,2010-05-01 00:36:00+00:00,1.495132e+21,32.489844,136.413847,136.048735,59.049505,0.317466,2.190448e+12,-0.001002,-0.001128,...,9.535298e+21,25.103040,7.290995,23156.134658,2066.389478,1214.339188,1786.976723,159.464861,93.905701,7.199385e-08
4,2010-05-01 00:48:00+00:00,1.566065e+21,31.843583,136.656334,136.326496,58.237536,0.342064,2.305668e+12,-0.000914,-0.000872,...,9.851970e+21,24.838363,6.565928,24186.736691,2158.422429,1267.802766,1847.651398,164.884269,97.054212,6.824671e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
571211,2024-08-21 23:00:00+00:00,5.555699e+22,44.493295,91.386968,94.154896,49.025126,-0.016395,7.615394e+13,0.012944,0.006413,...,1.227597e+24,38.092000,33.078717,311849.108684,28007.656779,17542.208126,41837.122195,3757.457601,2337.351041,4.116458e-06
571212,2024-08-21 23:12:00+00:00,5.544437e+22,44.508059,91.128024,93.791001,48.946077,-0.005535,7.609429e+13,0.012937,0.006417,...,1.226458e+24,38.122575,33.168761,312174.184838,28037.470321,17540.792603,41922.444216,3765.203258,2339.563178,3.103156e-06
571213,2024-08-21 23:24:00+00:00,5.539495e+22,44.507347,91.014640,93.646220,48.916111,-0.031197,7.632120e+13,0.012649,0.006323,...,1.227736e+24,38.082282,33.030732,312734.513848,28088.429327,17559.179701,41981.686953,3770.609223,2339.675770,2.699496e-06
571214,2024-08-21 23:36:00+00:00,5.547667e+22,44.474201,91.104216,93.844954,48.798387,-0.004176,7.698746e+13,0.012982,0.006429,...,1.231906e+24,38.154950,33.173974,313043.922425,28116.981354,17561.064916,42009.908578,3773.246331,2341.689287,2.528340e-06


In [11]:
# Label each record with the largest flux sample and the strongest flare (by
# peak intensity) found in the window T_REC .. T_REC + 23h59m, both ends
# inclusive.
# Only the first `n_rows_to_label` records are processed here; the remaining
# rows are padded with missing values so the new columns match the frame length.
n_rows_to_label = 100
window_length = pd.Timedelta(hours=23, minutes=59)

max_fluxes = []
max_flare_classes = []
max_peak_intensities = []
for t in combined_data["T_REC"].iloc[:n_rows_to_label]:
    window_start = t
    window_end = t + window_length

    # Largest flux sample whose timestamp lies inside the window.
    is_in_window = (flux_data["time"] >= window_start) & (flux_data["time"] <= window_end)
    max_flux = flux_data.loc[is_in_window, "flux"].max()
    max_fluxes.append(max_flux)

    # Strongest flare whose peak time lies inside the window.
    is_in_window = (flare_data["peak time"] >= window_start) & (flare_data["peak time"] <= window_end)
    if is_in_window.any():
        flare_classes = flare_data.loc[is_in_window, "flare_class"]
        peak_intensities = flare_data.loc[is_in_window, "peak_intensity"]
        # Index label of the flare with the highest peak intensity.
        i = peak_intensities.idxmax()
        max_flare_classes.append(flare_classes[i])
        max_peak_intensities.append(peak_intensities[i])
    else:
        # No flare peaked in this window. idxmax() raises on an empty
        # selection, so record missing values instead of calling it.
        max_flare_classes.append(pd.NA)
        max_peak_intensities.append(np.nan)

# Pad the rows that were not processed so each list has one entry per record.
n_unlabelled = len(combined_data) - len(max_fluxes)
max_fluxes.extend([np.nan] * n_unlabelled)
max_flare_classes.extend([pd.NA] * n_unlabelled)
max_peak_intensities.extend([np.nan] * n_unlabelled)

combined_data["max_flux_next_24h"] = max_fluxes
combined_data["max_flare_class_next_24h"] = max_flare_classes
combined_data["max_peak_intensity_next_24h"] = max_peak_intensities

In [12]:
# Preview the first five rows, including the newly added label columns.
combined_data.head(n=5)

Unnamed: 0,T_REC,USFLUX,MEANGAM,MEANGBT,MEANGBZ,MEANGBH,MEANJZD,TOTUSJZ,MEANALP,MEANJZH,...,NPIX,SIZE,AREA,NACR,SIZE_ACR,AREA_ACR,flux,max_flux_next_24h,max_flare_class_next_24h,max_peak_intensity_next_24h
0,2010-05-01 00:00:00+00:00,,,,,,,,,,...,,,,,,,5.615019e-08,8e-06,C,8e-06
1,2010-05-01 00:12:00+00:00,1.439345e+21,32.159885,137.783195,137.832428,59.383249,0.528793,2065877000000.0,0.001307,-0.000677,...,21385.121136,1908.233367,1124.079598,1636.651525,146.041405,86.29848,6.331756e-08,8e-06,C,8e-06
2,2010-05-01 00:24:00+00:00,1.488903e+21,32.006564,138.032814,137.932202,58.588763,0.355526,2096785000000.0,0.001291,-0.000588,...,22631.582458,2019.510773,1188.056979,1754.132156,156.528544,92.311965,6.583243e-08,8e-06,C,8e-06
3,2010-05-01 00:36:00+00:00,1.495132e+21,32.489844,136.413847,136.048735,59.049505,0.317466,2190448000000.0,-0.001002,-0.001128,...,23156.134658,2066.389478,1214.339188,1786.976723,159.464861,93.905701,7.199385e-08,8e-06,C,8e-06
4,2010-05-01 00:48:00+00:00,1.566065e+21,31.843583,136.656334,136.326496,58.237536,0.342064,2305668000000.0,-0.000914,-0.000872,...,24186.736691,2158.422429,1267.802766,1847.651398,164.884269,97.054212,6.824671e-08,8e-06,C,8e-06


In [2]:
# Read the combined dataset back from parquet; this rebinds `combined_data`.
combined_data_path = "../combined_data/combined_data.parquet"
combined_data = pd.read_parquet(combined_data_path)

In [5]:
# Share of records per next-24h flare class; missing labels are counted as
# their own category.
next_24h_classes = combined_data["max_flare_class_next_24h"]
next_24h_classes.value_counts(normalize=True, dropna=False)

max_flare_class_next_24h
C        0.39227
B        0.21973
<NA>    0.192584
M       0.174328
X       0.021085
A       0.000004
Name: proportion, dtype: Float64