In [1]:
import pandas as pd
import numpy as np

from utils.data import get_unique_types_of_col, get_null_counts, get_all_unique_types

In [2]:
df = pd.read_csv('../data/xray.csv')
df

Unnamed: 0,Date,Region,Flux,Start,Maximum,End
0,1996-07-31,-,B1.71,01:32,01:36,01:38
1,1996-07-31,-,B1.7,02:22,02:25,02:27
2,1996-07-31,-,B1.6,07:00,07:03,07:05
3,1996-07-31,7981,B2.7,08:29,08:35,08:40
4,1996-07-31,-,B1.96,11:14,11:17,11:24
...,...,...,...,...,...,...
48859,2024-06-30,13731,C2,04:28,04:34,04:39
48860,2024-06-30,13734,C3.8,05:48,06:18,06:39
48861,2024-06-30,13727,C2.8,20:56,21:03,21:10
48862,2024-06-30,13729,C2.3,22:19,22:28,22:37


In [3]:
df['Region'].unique()

array(['-', '7981', '7982', ..., 'XX09:32', '13734', '13731'],
      dtype=object)

In [4]:
# non numeric values
df[~df['Region'].str.isnumeric()]['Region'].unique()

array(['-', 'XX13:25', 'XX10:28', ..., 'XX21:14', 'XX01:54', 'XX09:32'],
      dtype=object)

In [5]:
df.loc[df['Region'].str.startswith('XX'), 'Region'] = '-'
df[~df['Region'].str.isnumeric()]['Region'].unique()

array(['-'], dtype=object)

In [6]:
time_format_pattern = r'^\d{2}:\d{2}$'

In [7]:
df[~df['Start'].str.match(time_format_pattern)]

Unnamed: 0,Date,Region,Flux,Start,Maximum,End
4519,1999-05-17,8541,C2.34,B1:31,13:19,13:20
5166,1999-08-11,-,C2.19,B2:14,22:29,23:04
21692,2006-09-08,0909,B8.14,B1:01,10:58,11:46


In [8]:
df.loc[df['Start'] == 'B1:31', 'Start'] = df['Start'].str.replace('B1', '12')
df.loc[df['Start'] == 'B2:14', 'Start'] = df['Start'].str.replace('B', '2')
df.loc[df['Start'] == 'B1:01', 'Start'] = df['Start'].str.replace('B1', '10')
df[~df['Start'].str.match(time_format_pattern)]

Unnamed: 0,Date,Region,Flux,Start,Maximum,End


In [9]:
print(get_unique_types_of_col(df['Start']))
print(get_unique_types_of_col(df['End']))
print(get_unique_types_of_col(df['Maximum']))

{<class 'str'>}
{<class 'str'>}
{<class 'str'>}


In [10]:
# Convert Date to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Convert Start and End to datetime with today's date
df['Start'] = pd.to_datetime(df['Date'].dt.strftime('%Y-%m-%d') + ' ' + df['Start'])
df['End'] = pd.to_datetime(df['Date'].dt.strftime('%Y-%m-%d') + ' ' + df['End'])
df['Maximum'] = pd.to_datetime(df['Date'].dt.strftime('%Y-%m-%d') + ' ' + df['Maximum'])

# Adjust End time if it is earlier than Start time
df.loc[df['End'] < df['Start'], 'End'] += pd.Timedelta(days=1)
df.loc[df['Maximum'] < df['Start'], 'Maximum'] += pd.Timedelta(days=1)
df

Unnamed: 0,Date,Region,Flux,Start,Maximum,End
0,1996-07-31,-,B1.71,1996-07-31 01:32:00,1996-07-31 01:36:00,1996-07-31 01:38:00
1,1996-07-31,-,B1.7,1996-07-31 02:22:00,1996-07-31 02:25:00,1996-07-31 02:27:00
2,1996-07-31,-,B1.6,1996-07-31 07:00:00,1996-07-31 07:03:00,1996-07-31 07:05:00
3,1996-07-31,7981,B2.7,1996-07-31 08:29:00,1996-07-31 08:35:00,1996-07-31 08:40:00
4,1996-07-31,-,B1.96,1996-07-31 11:14:00,1996-07-31 11:17:00,1996-07-31 11:24:00
...,...,...,...,...,...,...
48859,2024-06-30,13731,C2,2024-06-30 04:28:00,2024-06-30 04:34:00,2024-06-30 04:39:00
48860,2024-06-30,13734,C3.8,2024-06-30 05:48:00,2024-06-30 06:18:00,2024-06-30 06:39:00
48861,2024-06-30,13727,C2.8,2024-06-30 20:56:00,2024-06-30 21:03:00,2024-06-30 21:10:00
48862,2024-06-30,13729,C2.3,2024-06-30 22:19:00,2024-06-30 22:28:00,2024-06-30 22:37:00


In [11]:
flare_cols_and_widths = {
  "Data Code": 2, # always 31 for x-ray events
  "Station Code": 3,
  "Year": 2,
  "Month": 2,
  "Day": 2,
  "blank1": 2,
  "Start Time HHmm": 4, # Start time of x-ray event - SEE NOTE 1
  "blank2": 1,
  "End Time HHmm": 4,
  "blank3": 1,
  "Max Time HHmm": 4, # Max time
  "blank4": 1,
  
   # latitude of xray flare if known
  "Region": 6,

  "SXI": 3, # SXI if data are from SXI imagery, blank otherwise
  "blank5": 22,
  "X-ray class": 1,  # X-ray class: C,M,X code - SEE NOTE 2 (Flare Type)
  "blank6": 1,
  "X-ray intensity": 2, # X-ray intensity 10-99 for 1.0-9.9 x xray class
  "blank7": 4,
  "Station": 4, # Station ame abbreviation - "Gxx " for GOES
  "blank8": 1,
  "Integrated flux": 8, # units = J/m**2
  "NOAA Sunspot Group Number": 5, # NOAA/USAF sunspot region number
  "blank9": 1,
  "Central Meridian Passage Date": 8, # YYMM D.???
  "blank10": 1,
  "Total Region Area": 7, # squared arc seconds
  "blank11": 1,
  "Total Intensity": 7  # (units - TBD) from SXI, if available
}

df_old_data = pd.read_fwf("../flares1981-2017.txt", header=None,
                 widths=list(flare_cols_and_widths.values()),
                 names=list(flare_cols_and_widths.keys()))

df_old_data.shape

(71984, 29)

In [12]:
df_old_data.drop(['Data Code', 'Station Code', 'Station', 'Integrated flux',
                  'Central Meridian Passage Date', 'Total Region Area',
                  'Total Intensity', 'SXI'], axis=1, inplace=True)
df_old_data.drop(columns=df_old_data.filter(like='blank').columns.to_list(), inplace=True)

df_old_data['Year'] = df_old_data['Year'].astype(int).apply(lambda x: 1900 + x if x > 50 else 2000 + x)
df_old_data['Month'] = df_old_data['Month'].astype(int)
df_old_data['Day'] = df_old_data['Day'].astype(int)
df_old_data['Date'] = pd.to_datetime(df_old_data[['Year', 'Month', 'Day']])

# Move the 'Date' column to the first position
cols = ['Date'] + [col for col in df_old_data.columns if col != 'Date']
df_old_data = df_old_data[cols]
df_old_data

Unnamed: 0,Date,Year,Month,Day,Start Time HHmm,End Time HHmm,Max Time HHmm,Region,X-ray class,X-ray intensity,NOAA Sunspot Group Number
0,1981-01-02,1981,1,2,441,446,0444,,C,18.0,
1,1981-01-02,1981,1,2,955,1004,1000,,C,18.0,
2,1981-01-03,1981,1,3,1025,1036,1030,,C,51.0,
3,1981-01-03,1981,1,3,1549,1558,1553,,C,22.0,2857.0
4,1981-01-03,1981,1,3,1837,1845,1842,,C,22.0,
...,...,...,...,...,...,...,...,...,...,...,...
71979,2017-06-26,2017,6,26,723,733,0728,,B,13.0,
71980,2017-06-27,2017,6,27,729,741,0735,,B,11.0,
71981,2017-06-27,2017,6,27,2049,2056,2053,,B,11.0,
71982,2017-06-27,2017,6,27,2109,2119,2115,N19W18,B,26.0,12664.0


In [13]:
df_old_data['Max Time HHmm'].isnull().sum()

68

In [14]:
df_old_data['Max Time HHmm'] = (
  df_old_data['Max Time HHmm'].apply(
    lambda x: x if isinstance(x, int) or (isinstance(x, str) and x.isdigit()) else None
  )
)
df_old_data['Max Time HHmm'] = df_old_data.apply(
  lambda row: (row['Start Time HHmm'] + row['End Time HHmm']) // 2 if pd.isnull(row['Max Time HHmm']) else row['Max Time HHmm'], axis=1)
df_old_data['Max Time HHmm'] = pd.to_numeric(df_old_data['Max Time HHmm'], errors='coerce')
df_old_data['Max Time HHmm'].isnull().sum()

0

In [15]:
get_unique_types_of_col(df_old_data['Max Time HHmm'])

{int}

In [16]:
# Convert to int/float
# df_old_data['X-ray intensity'] = pd.to_numeric(df_old_data['X-ray intensity'], errors='coerce').astype(float).astype('Int64')
df_old_data['NOAA Sunspot Group Number'] = pd.to_numeric(df_old_data['NOAA Sunspot Group Number'], errors='coerce').astype(float).astype('Int64')

In [17]:
# Returns a function to format the time column to 'YYmmdd HHMM'
def format_time_column(_t):
  def format_row_time(row):  # Formats a single row's time to 'YYmmdd HHMM'
    item = row[f'{_t} Time HHmm']
    h = str(min(item//100, 23)).zfill(2)
    m = str(min(item%100, 59)).zfill(2)
    return f"{row['Date'].year}{row['Date'].month}{row['Date'].day} {h}{m}"
  return format_row_time

In [18]:
for t in ['End', 'Start', 'Max']:
  df_old_data[f'{t} Time HHmm'] = df_old_data.apply(format_time_column(t), axis=1)
  df_old_data[f'{t} Time'] = pd.to_datetime(df_old_data[f'{t} Time HHmm'], format='%Y%m%d %H%M')
# df_old_data.drop(['End Time HHmm', 'Start Time HHmm', 'Max Time HHmm', 'Year', 'Month', 'Day'], axis=1, inplace=True)
df_old_data

Unnamed: 0,Date,Year,Month,Day,Start Time HHmm,End Time HHmm,Max Time HHmm,Region,X-ray class,X-ray intensity,NOAA Sunspot Group Number,End Time,Start Time,Max Time
0,1981-01-02,1981,1,2,198112 0441,198112 0446,198112 0444,,C,18.0,,1981-01-02 04:46:00,1981-01-02 04:41:00,1981-01-02 04:44:00
1,1981-01-02,1981,1,2,198112 0955,198112 1004,198112 1000,,C,18.0,,1981-01-02 10:04:00,1981-01-02 09:55:00,1981-01-02 10:00:00
2,1981-01-03,1981,1,3,198113 1025,198113 1036,198113 1030,,C,51.0,,1981-01-03 10:36:00,1981-01-03 10:25:00,1981-01-03 10:30:00
3,1981-01-03,1981,1,3,198113 1549,198113 1558,198113 1553,,C,22.0,2857,1981-01-03 15:58:00,1981-01-03 15:49:00,1981-01-03 15:53:00
4,1981-01-03,1981,1,3,198113 1837,198113 1845,198113 1842,,C,22.0,,1981-01-03 18:45:00,1981-01-03 18:37:00,1981-01-03 18:42:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71979,2017-06-26,2017,6,26,2017626 0723,2017626 0733,2017626 0728,,B,13.0,,2017-06-26 07:33:00,2017-06-26 07:23:00,2017-06-26 07:28:00
71980,2017-06-27,2017,6,27,2017627 0729,2017627 0741,2017627 0735,,B,11.0,,2017-06-27 07:41:00,2017-06-27 07:29:00,2017-06-27 07:35:00
71981,2017-06-27,2017,6,27,2017627 2049,2017627 2056,2017627 2053,,B,11.0,,2017-06-27 20:56:00,2017-06-27 20:49:00,2017-06-27 20:53:00
71982,2017-06-27,2017,6,27,2017627 2109,2017627 2119,2017627 2115,N19W18,B,26.0,12664,2017-06-27 21:19:00,2017-06-27 21:09:00,2017-06-27 21:15:00


In [19]:
df_old_data.drop(['Start Time HHmm', 'End Time HHmm', 'Max Time HHmm',
                  'Year', 'Month', 'Day'], axis=1, inplace=True)
df_old_data

Unnamed: 0,Date,Region,X-ray class,X-ray intensity,NOAA Sunspot Group Number,End Time,Start Time,Max Time
0,1981-01-02,,C,18.0,,1981-01-02 04:46:00,1981-01-02 04:41:00,1981-01-02 04:44:00
1,1981-01-02,,C,18.0,,1981-01-02 10:04:00,1981-01-02 09:55:00,1981-01-02 10:00:00
2,1981-01-03,,C,51.0,,1981-01-03 10:36:00,1981-01-03 10:25:00,1981-01-03 10:30:00
3,1981-01-03,,C,22.0,2857,1981-01-03 15:58:00,1981-01-03 15:49:00,1981-01-03 15:53:00
4,1981-01-03,,C,22.0,,1981-01-03 18:45:00,1981-01-03 18:37:00,1981-01-03 18:42:00
...,...,...,...,...,...,...,...,...
71979,2017-06-26,,B,13.0,,2017-06-26 07:33:00,2017-06-26 07:23:00,2017-06-26 07:28:00
71980,2017-06-27,,B,11.0,,2017-06-27 07:41:00,2017-06-27 07:29:00,2017-06-27 07:35:00
71981,2017-06-27,,B,11.0,,2017-06-27 20:56:00,2017-06-27 20:49:00,2017-06-27 20:53:00
71982,2017-06-27,N19W18,B,26.0,12664,2017-06-27 21:19:00,2017-06-27 21:09:00,2017-06-27 21:15:00


In [21]:
_dt = '01-04-2017'

In [150]:
df_old_data[df_old_data['Date'] == pd.to_datetime(_dt)]

Unnamed: 0,Date,Region,X-ray class,X-ray intensity,NOAA Sunspot Group Number,End Time,Start Time,Max Time
71474,2017-01-04,,B,18.0,,2017-01-04 02:50:00,2017-01-04 02:34:00,2017-01-04 02:39:00


In [151]:
df[df['Date'] == pd.to_datetime(_dt)]

Unnamed: 0,Date,Region,Flux,Start,Maximum,End
35522,2017-01-04,12624,B1.93,02:34,02:39,02:50


In [22]:
df_old_data['X-ray intensity'] = df_old_data['X-ray intensity'].apply(
  lambda x: x / 10 if pd.notna(x) else x)
df_old_data

Unnamed: 0,Date,Region,X-ray class,X-ray intensity,NOAA Sunspot Group Number,End Time,Start Time,Max Time
0,1981-01-02,,C,1.8,,1981-01-02 04:46:00,1981-01-02 04:41:00,1981-01-02 04:44:00
1,1981-01-02,,C,1.8,,1981-01-02 10:04:00,1981-01-02 09:55:00,1981-01-02 10:00:00
2,1981-01-03,,C,5.1,,1981-01-03 10:36:00,1981-01-03 10:25:00,1981-01-03 10:30:00
3,1981-01-03,,C,2.2,2857,1981-01-03 15:58:00,1981-01-03 15:49:00,1981-01-03 15:53:00
4,1981-01-03,,C,2.2,,1981-01-03 18:45:00,1981-01-03 18:37:00,1981-01-03 18:42:00
...,...,...,...,...,...,...,...,...
71979,2017-06-26,,B,1.3,,2017-06-26 07:33:00,2017-06-26 07:23:00,2017-06-26 07:28:00
71980,2017-06-27,,B,1.1,,2017-06-27 07:41:00,2017-06-27 07:29:00,2017-06-27 07:35:00
71981,2017-06-27,,B,1.1,,2017-06-27 20:56:00,2017-06-27 20:49:00,2017-06-27 20:53:00
71982,2017-06-27,N19W18,B,2.6,12664,2017-06-27 21:19:00,2017-06-27 21:09:00,2017-06-27 21:15:00


In [23]:
df_old_data.drop(['Region'], axis=1, inplace=True)
df_old_data = df_old_data.rename(columns={'NOAA Sunspot Group Number': 'Region',
                                          'Start Time': 'Start',
                                          'End Time': 'End',
                                          'Max Time': 'Maximum',})
df_old_data

Unnamed: 0,Date,X-ray class,X-ray intensity,Region,End,Start,Maximum
0,1981-01-02,C,1.8,,1981-01-02 04:46:00,1981-01-02 04:41:00,1981-01-02 04:44:00
1,1981-01-02,C,1.8,,1981-01-02 10:04:00,1981-01-02 09:55:00,1981-01-02 10:00:00
2,1981-01-03,C,5.1,,1981-01-03 10:36:00,1981-01-03 10:25:00,1981-01-03 10:30:00
3,1981-01-03,C,2.2,2857,1981-01-03 15:58:00,1981-01-03 15:49:00,1981-01-03 15:53:00
4,1981-01-03,C,2.2,,1981-01-03 18:45:00,1981-01-03 18:37:00,1981-01-03 18:42:00
...,...,...,...,...,...,...,...
71979,2017-06-26,B,1.3,,2017-06-26 07:33:00,2017-06-26 07:23:00,2017-06-26 07:28:00
71980,2017-06-27,B,1.1,,2017-06-27 07:41:00,2017-06-27 07:29:00,2017-06-27 07:35:00
71981,2017-06-27,B,1.1,,2017-06-27 20:56:00,2017-06-27 20:49:00,2017-06-27 20:53:00
71982,2017-06-27,B,2.6,12664,2017-06-27 21:19:00,2017-06-27 21:09:00,2017-06-27 21:15:00


In [24]:
df[['X-ray class', 'X-ray intensity']] = df['Flux'].str.extract(r'([A-Za-z]+)([0-9.]+)')
df['X-ray intensity'] = df['X-ray intensity'].astype(float)
df.drop(columns=['Flux'], inplace=True)
df

Unnamed: 0,Date,Region,Start,Maximum,End,X-ray class,X-ray intensity
0,1996-07-31,-,1996-07-31 01:32:00,1996-07-31 01:36:00,1996-07-31 01:38:00,B,1.71
1,1996-07-31,-,1996-07-31 02:22:00,1996-07-31 02:25:00,1996-07-31 02:27:00,B,1.70
2,1996-07-31,-,1996-07-31 07:00:00,1996-07-31 07:03:00,1996-07-31 07:05:00,B,1.60
3,1996-07-31,7981,1996-07-31 08:29:00,1996-07-31 08:35:00,1996-07-31 08:40:00,B,2.70
4,1996-07-31,-,1996-07-31 11:14:00,1996-07-31 11:17:00,1996-07-31 11:24:00,B,1.96
...,...,...,...,...,...,...,...
48859,2024-06-30,13731,2024-06-30 04:28:00,2024-06-30 04:34:00,2024-06-30 04:39:00,C,2.00
48860,2024-06-30,13734,2024-06-30 05:48:00,2024-06-30 06:18:00,2024-06-30 06:39:00,C,3.80
48861,2024-06-30,13727,2024-06-30 20:56:00,2024-06-30 21:03:00,2024-06-30 21:10:00,C,2.80
48862,2024-06-30,13729,2024-06-30 22:19:00,2024-06-30 22:28:00,2024-06-30 22:37:00,C,2.30


In [25]:
df_old_data = df_old_data[['Date', 'Region', 'X-ray class', 'X-ray intensity', 'Start', 'End', 'Maximum']]
df = df[['Date', 'Region', 'X-ray class', 'X-ray intensity', 'Start', 'End', 'Maximum']]

In [26]:
get_all_unique_types(df)

{'Date': {pandas._libs.tslibs.timestamps.Timestamp},
 'Region': {str},
 'X-ray class': {str},
 'X-ray intensity': {float},
 'Start': {pandas._libs.tslibs.timestamps.Timestamp},
 'End': {pandas._libs.tslibs.timestamps.Timestamp},
 'Maximum': {pandas._libs.tslibs.timestamps.Timestamp}}

In [27]:
get_all_unique_types(df_old_data)

{'Date': {pandas._libs.tslibs.timestamps.Timestamp},
 'Region': {float},
 'X-ray class': {float, str},
 'X-ray intensity': {float},
 'Start': {pandas._libs.tslibs.timestamps.Timestamp},
 'End': {pandas._libs.tslibs.timestamps.Timestamp},
 'Maximum': {pandas._libs.tslibs.timestamps.Timestamp}}

In [31]:
df_old_data['Region'].unique()

<IntegerArray>
[ <NA>,  2857,  2866,  2872,  2875,  2868,  2874,  2880,  2883,  2882,
 ...
 12651, 12653, 12652, 12655, 12658, 12660, 12659, 12661, 12663, 12664]
Length: 4569, dtype: Int64

In [44]:
df_old_data['Region'] = df_old_data['Region'].apply(lambda x: str(int(x)) if pd.notna(x) else '-')
get_unique_types_of_col(df_old_data['Region'])

{str}

In [46]:
old_data_before_1996 = df_old_data[df_old_data['Date'] < '1996-07-31']
# old_data_before_1996['Region'] = old_data_before_1996['Region'].astype('string').fillna('-')
old_data_before_1996

Unnamed: 0,Date,Region,X-ray class,X-ray intensity,Start,End,Maximum
0,1981-01-02,-,C,1.8,1981-01-02 04:41:00,1981-01-02 04:46:00,1981-01-02 04:44:00
1,1981-01-02,-,C,1.8,1981-01-02 09:55:00,1981-01-02 10:04:00,1981-01-02 10:00:00
2,1981-01-03,-,C,5.1,1981-01-03 10:25:00,1981-01-03 10:36:00,1981-01-03 10:30:00
3,1981-01-03,2857,C,2.2,1981-01-03 15:49:00,1981-01-03 15:58:00,1981-01-03 15:53:00
4,1981-01-03,-,C,2.2,1981-01-03 18:37:00,1981-01-03 18:45:00,1981-01-03 18:42:00
...,...,...,...,...,...,...,...
35215,1996-07-13,-,B,1.1,1996-07-13 20:45:00,1996-07-13 20:56:00,1996-07-13 20:49:00
35216,1996-07-14,-,B,2.3,1996-07-14 07:10:00,1996-07-14 07:29:00,1996-07-14 07:19:00
35217,1996-07-14,-,B,2.5,1996-07-14 08:31:00,1996-07-14 08:53:00,1996-07-14 08:42:00
35218,1996-07-14,-,B,1.0,1996-07-14 11:56:00,1996-07-14 12:10:00,1996-07-14 11:59:00


In [47]:
df = pd.concat([old_data_before_1996, df], ignore_index=True)
df

Unnamed: 0,Date,Region,X-ray class,X-ray intensity,Start,End,Maximum
0,1981-01-02,-,C,1.8,1981-01-02 04:41:00,1981-01-02 04:46:00,1981-01-02 04:44:00
1,1981-01-02,-,C,1.8,1981-01-02 09:55:00,1981-01-02 10:04:00,1981-01-02 10:00:00
2,1981-01-03,-,C,5.1,1981-01-03 10:25:00,1981-01-03 10:36:00,1981-01-03 10:30:00
3,1981-01-03,2857,C,2.2,1981-01-03 15:49:00,1981-01-03 15:58:00,1981-01-03 15:53:00
4,1981-01-03,-,C,2.2,1981-01-03 18:37:00,1981-01-03 18:45:00,1981-01-03 18:42:00
...,...,...,...,...,...,...,...
84079,2024-06-30,13731,C,2.0,2024-06-30 04:28:00,2024-06-30 04:39:00,2024-06-30 04:34:00
84080,2024-06-30,13734,C,3.8,2024-06-30 05:48:00,2024-06-30 06:39:00,2024-06-30 06:18:00
84081,2024-06-30,13727,C,2.8,2024-06-30 20:56:00,2024-06-30 21:10:00,2024-06-30 21:03:00
84082,2024-06-30,13729,C,2.3,2024-06-30 22:19:00,2024-06-30 22:37:00,2024-06-30 22:28:00


In [48]:
df.to_csv('../data/flares-processed.csv', index=False)