In [1]:
import numpy as np
import pandas as pd

from utils.data import *

In [26]:
df = pd.read_csv('../data/xray.csv')
df

Unnamed: 0,Date,Region,Flux,Start,Maximum,End
0,1996-07-31,-,B1.71,01:32,01:36,01:38
1,1996-07-31,-,B1.7,02:22,02:25,02:27
2,1996-07-31,-,B1.6,07:00,07:03,07:05
3,1996-07-31,7981,B2.7,08:29,08:35,08:40
4,1996-07-31,-,B1.96,11:14,11:17,11:24
...,...,...,...,...,...,...
48859,2024-06-30,13731,C2,04:28,04:34,04:39
48860,2024-06-30,13734,C3.8,05:48,06:18,06:39
48861,2024-06-30,13727,C2.8,20:56,21:03,21:10
48862,2024-06-30,13729,C2.3,22:19,22:28,22:37


In [3]:
df['Region'].unique()

array(['-', '7981', '7982', ..., 'XX09:32', '13734', '13731'],
      dtype=object)

In [4]:
# non numeric values
df[~df['Region'].str.isnumeric()]['Region'].unique()

array(['-', 'XX13:25', 'XX10:28', ..., 'XX21:14', 'XX01:54', 'XX09:32'],
      dtype=object)

In [27]:
df.loc[df['Region'].str.startswith('XX'), 'Region'] = np.nan  # Replace all XX... region with Nan
df.loc[df['Region'] == '-', 'Region'] = np.nan  # Replace all '-' region with Nan
df['Region'].unique()

array([nan, '7981', '7982', ..., '13730', '13734', '13731'], dtype=object)

In [28]:
time_format_pattern = r'^\d{2}:\d{2}$'

In [29]:
df[~df['Start'].str.match(time_format_pattern)]

Unnamed: 0,Date,Region,Flux,Start,Maximum,End
4519,1999-05-17,8541.0,C2.34,B1:31,13:19,13:20
5166,1999-08-11,,C2.19,B2:14,22:29,23:04
21692,2006-09-08,909.0,B8.14,B1:01,10:58,11:46


In [30]:
df.loc[df['Start'] == 'B1:31', 'Start'] = df['Start'].str.replace('B1', '12')
df.loc[df['Start'] == 'B2:14', 'Start'] = df['Start'].str.replace('B', '2')
df.loc[df['Start'] == 'B1:01', 'Start'] = df['Start'].str.replace('B1', '10')
df[~df['Start'].str.match(time_format_pattern)]

Unnamed: 0,Date,Region,Flux,Start,Maximum,End


In [9]:
print(get_unique_types_of_col(df['Start']))
print(get_unique_types_of_col(df['End']))
print(get_unique_types_of_col(df['Maximum']))

{<class 'str'>}
{<class 'str'>}
{<class 'str'>}


In [31]:
# Convert Date to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Convert Start and End to datetime with today's date
df['Start'] = pd.to_datetime(df['Date'].dt.strftime('%Y-%m-%d') + ' ' + df['Start'])
df['End'] = pd.to_datetime(df['Date'].dt.strftime('%Y-%m-%d') + ' ' + df['End'])
df['Maximum'] = pd.to_datetime(df['Date'].dt.strftime('%Y-%m-%d') + ' ' + df['Maximum'])

# Adjust End time if it is earlier than Start time
df.loc[df['End'] < df['Start'], 'End'] += pd.Timedelta(days=1)
df.loc[df['Maximum'] < df['Start'], 'Maximum'] += pd.Timedelta(days=1)
df

Unnamed: 0,Date,Region,Flux,Start,Maximum,End
0,1996-07-31,,B1.71,1996-07-31 01:32:00,1996-07-31 01:36:00,1996-07-31 01:38:00
1,1996-07-31,,B1.7,1996-07-31 02:22:00,1996-07-31 02:25:00,1996-07-31 02:27:00
2,1996-07-31,,B1.6,1996-07-31 07:00:00,1996-07-31 07:03:00,1996-07-31 07:05:00
3,1996-07-31,7981,B2.7,1996-07-31 08:29:00,1996-07-31 08:35:00,1996-07-31 08:40:00
4,1996-07-31,,B1.96,1996-07-31 11:14:00,1996-07-31 11:17:00,1996-07-31 11:24:00
...,...,...,...,...,...,...
48859,2024-06-30,13731,C2,2024-06-30 04:28:00,2024-06-30 04:34:00,2024-06-30 04:39:00
48860,2024-06-30,13734,C3.8,2024-06-30 05:48:00,2024-06-30 06:18:00,2024-06-30 06:39:00
48861,2024-06-30,13727,C2.8,2024-06-30 20:56:00,2024-06-30 21:03:00,2024-06-30 21:10:00
48862,2024-06-30,13729,C2.3,2024-06-30 22:19:00,2024-06-30 22:28:00,2024-06-30 22:37:00


In [32]:
df[['X-ray class', 'X-ray intensity']] = df['Flux'].str.extract(r'([A-Za-z]+)([0-9.]+)')
df['X-ray intensity'] = df['X-ray intensity'].astype(float)
df.drop(columns=['Flux'], inplace=True)

df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df.drop(columns=['Date'], inplace=True)

df

Unnamed: 0,Region,Start,Maximum,End,X-ray class,X-ray intensity,Year,Month,Day
0,,1996-07-31 01:32:00,1996-07-31 01:36:00,1996-07-31 01:38:00,B,1.71,1996,7,31
1,,1996-07-31 02:22:00,1996-07-31 02:25:00,1996-07-31 02:27:00,B,1.70,1996,7,31
2,,1996-07-31 07:00:00,1996-07-31 07:03:00,1996-07-31 07:05:00,B,1.60,1996,7,31
3,7981,1996-07-31 08:29:00,1996-07-31 08:35:00,1996-07-31 08:40:00,B,2.70,1996,7,31
4,,1996-07-31 11:14:00,1996-07-31 11:17:00,1996-07-31 11:24:00,B,1.96,1996,7,31
...,...,...,...,...,...,...,...,...,...
48859,13731,2024-06-30 04:28:00,2024-06-30 04:34:00,2024-06-30 04:39:00,C,2.00,2024,6,30
48860,13734,2024-06-30 05:48:00,2024-06-30 06:18:00,2024-06-30 06:39:00,C,3.80,2024,6,30
48861,13727,2024-06-30 20:56:00,2024-06-30 21:03:00,2024-06-30 21:10:00,C,2.80,2024,6,30
48862,13729,2024-06-30 22:19:00,2024-06-30 22:28:00,2024-06-30 22:37:00,C,2.30,2024,6,30


In [24]:
max_intense_idx = df.groupby(['Year', 'Month', 'Day',
                              'Region', 'X-ray class', 'X-ray intensity'])['Year'].idxmax()
(df.loc[df.index.difference(max_intense_idx)]).dropna()

Unnamed: 0,Region,Start,Maximum,End,X-ray class,X-ray intensity,Year,Month,Day
367,8026,1997-04-02 08:22:00,1997-04-02 08:35:00,1997-04-02 08:42:00,B,5.01,1997,4,2
629,8071,1997-08-13 18:31:00,1997-08-13 18:39:00,1997-08-13 18:51:00,B,4.71,1997,8,13
676,8076,1997-08-30 15:11:00,1997-08-30 15:16:00,1997-08-30 15:20:00,B,8.16,1997,8,30
3745,8440,1999-01-15 15:34:00,1999-01-15 15:38:00,1999-01-15 15:41:00,C,2.19,1999,1,15
4130,8485,1999-03-12 18:20:00,1999-03-12 18:24:00,1999-03-12 18:30:00,C,1.50,1999,3,12
...,...,...,...,...,...,...,...,...,...
48731,13712,2024-06-16 14:32:00,2024-06-16 14:39:00,2024-06-16 14:49:00,C,3.20,2024,6,16
48741,13712,2024-06-16 20:36:00,2024-06-16 20:44:00,2024-06-16 20:49:00,C,2.30,2024,6,16
48753,13712,2024-06-17 10:42:00,2024-06-17 10:46:00,2024-06-17 10:50:00,M,1.50,2024,6,17
48765,13712,2024-06-18 13:17:00,2024-06-18 13:25:00,2024-06-18 13:30:00,C,3.00,2024,6,18


In [34]:
df = df.drop_duplicates(['Year', 'Month', 'Day', 'Region', 'X-ray class', 'X-ray intensity'])
df.shape

(47863, 9)

# Old data

In [35]:
flare_cols_and_widths = {
  "Data Code": 2, # always 31 for x-ray events
  "Station Code": 3,
  "Year": 2,
  "Month": 2,
  "Day": 2,
  "blank1": 2,
  "Start Time HHmm": 4, # Start time of x-ray event - SEE NOTE 1
  "blank2": 1,
  "End Time HHmm": 4,
  "blank3": 1,
  "Max Time HHmm": 4, # Max time
  "blank4": 1,
  
   # latitude of xray flare if known
  "Region": 6,

  "SXI": 3, # SXI if data are from SXI imagery, blank otherwise
  "blank5": 22,
  "X-ray class": 1,  # X-ray class: C,M,X code - SEE NOTE 2 (Flare Type)
  "blank6": 1,
  "X-ray intensity": 2, # X-ray intensity 10-99 for 1.0-9.9 x xray class
  "blank7": 4,
  "Station": 4, # Station ame abbreviation - "Gxx " for GOES
  "blank8": 1,
  "Integrated flux": 8, # units = J/m**2
  "NOAA Sunspot Group Number": 5, # NOAA/USAF sunspot region number
  "blank9": 1,
  "Central Meridian Passage Date": 8, # YYMM D.???
  "blank10": 1,
  "Total Region Area": 7, # squared arc seconds
  "blank11": 1,
  "Total Intensity": 7  # (units - TBD) from SXI, if available
}

df_old_data = pd.read_fwf("../flares1981-2017.txt", header=None,
                 widths=list(flare_cols_and_widths.values()),
                 names=list(flare_cols_and_widths.keys()))

df_old_data.shape

(71984, 29)

In [36]:
df_old_data.drop(['Data Code', 'Station Code', 'Station', 'Integrated flux',
                  'Central Meridian Passage Date', 'Total Region Area',
                  'Total Intensity', 'SXI'], axis=1, inplace=True)
df_old_data.drop(columns=df_old_data.filter(like='blank').columns.to_list(), inplace=True)

df_old_data['Year'] = df_old_data['Year'].astype(int).apply(lambda x: 1900 + x if x > 50 else 2000 + x)
df_old_data['Month'] = df_old_data['Month'].astype(int)
df_old_data['Day'] = df_old_data['Day'].astype(int)

df_old_data

Unnamed: 0,Year,Month,Day,Start Time HHmm,End Time HHmm,Max Time HHmm,Region,X-ray class,X-ray intensity,NOAA Sunspot Group Number
0,1981,1,2,441,446,0444,,C,18.0,
1,1981,1,2,955,1004,1000,,C,18.0,
2,1981,1,3,1025,1036,1030,,C,51.0,
3,1981,1,3,1549,1558,1553,,C,22.0,2857.0
4,1981,1,3,1837,1845,1842,,C,22.0,
...,...,...,...,...,...,...,...,...,...,...
71979,2017,6,26,723,733,0728,,B,13.0,
71980,2017,6,27,729,741,0735,,B,11.0,
71981,2017,6,27,2049,2056,2053,,B,11.0,
71982,2017,6,27,2109,2119,2115,N19W18,B,26.0,12664.0


In [14]:
# df_old_data['Start Time HHmm'] = df_old_data['Start Time HHmm'].astype(str).str.zfill(4)
# df_old_data['Max Time HHmm'] = df_old_data['Max Time HHmm'].astype(str).str.zfill(4)
# df_old_data['End Time HHmm'] = df_old_data['End Time HHmm'].astype(str).str.zfill(4)

In [15]:
get_unique_types_of_col(df_old_data['Max Time HHmm'])

{float, str}

In [37]:
df_old_data['Max Time HHmm'] = (
  df_old_data['Max Time HHmm'].apply(
    lambda x: int(x) if isinstance(x, int) or (isinstance(x, str) and x.isdigit()) else None
  )
)
df_old_data['Max Time HHmm'] = df_old_data.apply(
  lambda row: (row['Start Time HHmm'] + row['End Time HHmm']) // 2 if pd.isnull(row['Max Time HHmm']) else row['Max Time HHmm'], axis=1)
# df_old_data['Max Time HHmm'] = pd.to_numeric(df_old_data['Max Time HHmm'], errors='coerce')
df_old_data['Max Time HHmm'] = df_old_data['Max Time HHmm'].astype(int)
df_old_data['Max Time HHmm'].isnull().sum()

0

In [17]:
get_unique_types_of_col(df_old_data['Max Time HHmm'])

{int}

In [18]:
get_unique_types_of_col(df_old_data['Day'])

{int}

In [38]:
def zfill2(_item):
  return str(_item).zfill(2)

# Returns a function to format the time column to 'YYmmdd HHMM'
def format_time_column(_t):
  def format_row_time(row):  # Formats a single row's time to 'YYmmdd HHMM'
    item = row[f'{_t} Time HHmm']
    h = zfill2(min(item//100, 23))
    m = zfill2(min(item%100, 59))
    return f"{row['Year']}{zfill2(row['Month'])}{zfill2(row['Day'])} {h}{m}"
  return format_row_time

In [39]:
for t in ['End', 'Start', 'Max']:
  df_old_data[f'{t} Time HHmm'] = df_old_data.apply(format_time_column(t), axis=1)
  df_old_data[f'{t} Time'] = pd.to_datetime(df_old_data[f'{t} Time HHmm'], format='%Y%m%d %H%M')

df_old_data.drop(['Start Time HHmm', 'End Time HHmm', 'Max Time HHmm'], axis=1, inplace=True)
df_old_data

Unnamed: 0,Year,Month,Day,Region,X-ray class,X-ray intensity,NOAA Sunspot Group Number,End Time,Start Time,Max Time
0,1981,1,2,,C,18.0,,1981-01-02 04:46:00,1981-01-02 04:41:00,1981-01-02 04:44:00
1,1981,1,2,,C,18.0,,1981-01-02 10:04:00,1981-01-02 09:55:00,1981-01-02 10:00:00
2,1981,1,3,,C,51.0,,1981-01-03 10:36:00,1981-01-03 10:25:00,1981-01-03 10:30:00
3,1981,1,3,,C,22.0,2857.0,1981-01-03 15:58:00,1981-01-03 15:49:00,1981-01-03 15:53:00
4,1981,1,3,,C,22.0,,1981-01-03 18:45:00,1981-01-03 18:37:00,1981-01-03 18:42:00
...,...,...,...,...,...,...,...,...,...,...
71979,2017,6,26,,B,13.0,,2017-06-26 07:33:00,2017-06-26 07:23:00,2017-06-26 07:28:00
71980,2017,6,27,,B,11.0,,2017-06-27 07:41:00,2017-06-27 07:29:00,2017-06-27 07:35:00
71981,2017,6,27,,B,11.0,,2017-06-27 20:56:00,2017-06-27 20:49:00,2017-06-27 20:53:00
71982,2017,6,27,N19W18,B,26.0,12664.0,2017-06-27 21:19:00,2017-06-27 21:09:00,2017-06-27 21:15:00


In [40]:
df_old_data['X-ray intensity'] = df_old_data['X-ray intensity'].apply(
  lambda x: x / 10 if pd.notna(x) else x)

In [41]:
df_old_data.drop(['Region'], axis=1, inplace=True)
df_old_data = df_old_data.rename(columns={'NOAA Sunspot Group Number': 'Region',
                                          'Start Time': 'Start',
                                          'End Time': 'End',
                                          'Max Time': 'Maximum',})
df_old_data

Unnamed: 0,Year,Month,Day,X-ray class,X-ray intensity,Region,End,Start,Maximum
0,1981,1,2,C,1.8,,1981-01-02 04:46:00,1981-01-02 04:41:00,1981-01-02 04:44:00
1,1981,1,2,C,1.8,,1981-01-02 10:04:00,1981-01-02 09:55:00,1981-01-02 10:00:00
2,1981,1,3,C,5.1,,1981-01-03 10:36:00,1981-01-03 10:25:00,1981-01-03 10:30:00
3,1981,1,3,C,2.2,2857.0,1981-01-03 15:58:00,1981-01-03 15:49:00,1981-01-03 15:53:00
4,1981,1,3,C,2.2,,1981-01-03 18:45:00,1981-01-03 18:37:00,1981-01-03 18:42:00
...,...,...,...,...,...,...,...,...,...
71979,2017,6,26,B,1.3,,2017-06-26 07:33:00,2017-06-26 07:23:00,2017-06-26 07:28:00
71980,2017,6,27,B,1.1,,2017-06-27 07:41:00,2017-06-27 07:29:00,2017-06-27 07:35:00
71981,2017,6,27,B,1.1,,2017-06-27 20:56:00,2017-06-27 20:49:00,2017-06-27 20:53:00
71982,2017,6,27,B,2.6,12664.0,2017-06-27 21:19:00,2017-06-27 21:09:00,2017-06-27 21:15:00


In [42]:
updated_columns = ['Year', 'Month', 'Day',
                           'Region', 'X-ray class', 'X-ray intensity',
                           'Start', 'End', 'Maximum']
df_old_data = df_old_data[updated_columns]
df = df[updated_columns]

In [24]:
get_all_unique_types(df)

{'Year': {int},
 'Month': {int},
 'Day': {int},
 'Region': {float, str},
 'X-ray class': {str},
 'X-ray intensity': {float},
 'Start': {pandas._libs.tslibs.timestamps.Timestamp},
 'End': {pandas._libs.tslibs.timestamps.Timestamp},
 'Maximum': {pandas._libs.tslibs.timestamps.Timestamp}}

In [25]:
get_all_unique_types(df_old_data)

{'Year': {int},
 'Month': {int},
 'Day': {int},
 'Region': {float},
 'X-ray class': {float, str},
 'X-ray intensity': {float},
 'Start': {pandas._libs.tslibs.timestamps.Timestamp},
 'End': {pandas._libs.tslibs.timestamps.Timestamp},
 'Maximum': {pandas._libs.tslibs.timestamps.Timestamp}}

In [45]:
df_old_data.shape

(71984, 9)

In [48]:
df_old_data = df_old_data.drop_duplicates(['Year', 'Month', 'Day',
                              'Region', 'X-ray class',
                              'X-ray intensity'], keep='first')
df_old_data.shape

(68163, 9)

In [49]:
# take all from year 2017 to 2023
new_data_after_2017_06 = df[(df['Year'] >= 2017) & (df['Year'] <= 2023)]

# Remove all before July 2017
new_data_after_2017_06 = new_data_after_2017_06[~((new_data_after_2017_06['Year'] == 2017) & (new_data_after_2017_06['Month'] < 7)) ]
new_data_after_2017_06

Unnamed: 0,Year,Month,Day,Region,X-ray class,X-ray intensity,Start,End,Maximum
36050,2017,7,2,12664,B,4.14,2017-07-02 12:23:00,2017-07-02 12:59:00,2017-07-02 12:39:00
36051,2017,7,3,12664,B,4.64,2017-07-03 06:17:00,2017-07-03 06:25:00,2017-07-03 06:21:00
36052,2017,7,3,12664,C,1.10,2017-07-03 06:33:00,2017-07-03 06:39:00,2017-07-03 06:37:00
36053,2017,7,3,12664,B,2.40,2017-07-03 07:06:00,2017-07-03 07:12:00,2017-07-03 07:10:00
36054,2017,7,3,12664,B,1.55,2017-07-03 07:28:00,2017-07-03 07:34:00,2017-07-03 07:31:00
...,...,...,...,...,...,...,...,...,...
46895,2023,12,31,13536,C,5.50,2023-12-31 12:37:00,2023-12-31 13:14:00,2023-12-31 12:55:00
46897,2023,12,31,13536,C,3.80,2023-12-31 18:02:00,2023-12-31 18:28:00,2023-12-31 18:16:00
46898,2023,12,31,13536,C,4.20,2023-12-31 18:28:00,2023-12-31 18:44:00,2023-12-31 18:39:00
46899,2023,12,31,13536,M,1.00,2023-12-31 18:44:00,2023-12-31 19:40:00,2023-12-31 19:12:00


In [50]:
df_merged = pd.concat([df_old_data, new_data_after_2017_06], ignore_index=True)
df_merged

Unnamed: 0,Year,Month,Day,Region,X-ray class,X-ray intensity,Start,End,Maximum
0,1981,1,2,,C,1.8,1981-01-02 04:41:00,1981-01-02 04:46:00,1981-01-02 04:44:00
1,1981,1,3,,C,5.1,1981-01-03 10:25:00,1981-01-03 10:36:00,1981-01-03 10:30:00
2,1981,1,3,2857.0,C,2.2,1981-01-03 15:49:00,1981-01-03 15:58:00,1981-01-03 15:53:00
3,1981,1,3,,C,2.2,1981-01-03 18:37:00,1981-01-03 18:45:00,1981-01-03 18:42:00
4,1981,1,3,,C,1.9,1981-01-03 20:24:00,1981-01-03 20:39:00,1981-01-03 20:27:00
...,...,...,...,...,...,...,...,...,...
78356,2023,12,31,13536,C,5.5,2023-12-31 12:37:00,2023-12-31 13:14:00,2023-12-31 12:55:00
78357,2023,12,31,13536,C,3.8,2023-12-31 18:02:00,2023-12-31 18:28:00,2023-12-31 18:16:00
78358,2023,12,31,13536,C,4.2,2023-12-31 18:28:00,2023-12-31 18:44:00,2023-12-31 18:39:00
78359,2023,12,31,13536,M,1.0,2023-12-31 18:44:00,2023-12-31 19:40:00,2023-12-31 19:12:00


# Remove 1981 (because it's also removed from sunspot)

In [51]:
df_merged = df_merged[df_merged['Year'] != 1981]

In [52]:
df_merged.shape

(74580, 9)

In [53]:
df_merged.to_csv('../data/flares-processed.csv', index=False)