In [1]:
import numpy as np
import pandas as pd
import re

In [68]:
# Load the daily observation table from CSV and display a preview of it.
dayobs_csv = '../data/dayobs.csv'
df = pd.read_csv(dayobs_csv)
df

Unnamed: 0,Date,Region,Sunspot Number,Size,Magnetic Classification,Sunspot Classification,Location
0,1996-06-04,7968,3,0,region_mag A,AXX,N02E41
1,1996-06-05,7968,2,10,region_mag B,BXO,N02E28
2,1996-06-05,7969,1,0,region_mag A,AXX,N09W33
3,1996-06-06,7968,4,20,region_mag B,CRO,N03E12
4,1996-06-06,7969,4,10,region_mag B,BXO,N08W44
...,...,...,...,...,...,...,...
39856,2024-06-30,13731,1,10,region_mag A,AXX,S16E21
39857,2024-06-30,13732,4,30,region_mag B,CAO,S18W14
39858,2024-06-30,13733,4,40,region_mag B,DAO,N05E13
39859,2024-06-30,13734,4,60,region_mag B,DAO,N07E53


In [69]:
# Distinct raw magnetic-classification labels, in order of first appearance.
df.loc[:, 'Magnetic Classification'].unique()

array(['region_mag A', 'region_mag B', 'region_mag BGD', 'region_mag BG',
       'region_mag BD', 'region_mag GD', 'region_mag G', 'region_mag'],
      dtype=object)

In [70]:
def update_magnetic_classification(_old):
  """Strip the 'region_mag' marker from a raw magnetic-classification label.

  The marker followed by a space is removed first, then any bare marker that
  is left, so a label consisting only of the marker becomes an empty string.
  """
  without_spaced_marker = _old.replace('region_mag ', '')
  return without_spaced_marker.replace('region_mag', '')

# Replace the raw labels with their cleaned form, then list the distinct results.
cleaned_magnetic_labels = df['Magnetic Classification'].map(update_magnetic_classification)
df['Magnetic Classification'] = cleaned_magnetic_labels
cleaned_magnetic_labels.unique()

array(['A', 'B', 'BGD', 'BG', 'BD', 'GD', 'G', ''], dtype=object)

In [71]:
# Convert the date column to datetimes and list the years that occur in it.
observation_dates = pd.to_datetime(df['Date'])
df['Date'] = observation_dates
observation_dates.dt.year.unique()

array([1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
       2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017,
       2018, 2019, 2020, 2021, 2022, 2023, 2024], dtype=int32)

In [72]:
# Split the combined classification code by character position into three
# separate columns, then discard the combined column.
component_columns = ('Zurich Class', 'Penumbra Class', 'Compactness Class')
for position, component in enumerate(component_columns):
  df[component] = df['Sunspot Classification'].str.get(position)
df = df.drop(columns=['Sunspot Classification'])
df

Unnamed: 0,Date,Region,Sunspot Number,Size,Magnetic Classification,Location,Zurich Class,Penumbra Class,Compactness Class
0,1996-06-04,7968,3,0,A,N02E41,A,X,X
1,1996-06-05,7968,2,10,B,N02E28,B,X,O
2,1996-06-05,7969,1,0,A,N09W33,A,X,X
3,1996-06-06,7968,4,20,B,N03E12,C,R,O
4,1996-06-06,7969,4,10,B,N08W44,B,X,O
...,...,...,...,...,...,...,...,...,...
39856,2024-06-30,13731,1,10,A,S16E21,A,X,X
39857,2024-06-30,13732,4,30,B,S18W14,C,A,O
39858,2024-06-30,13733,4,40,B,N05E13,D,A,O
39859,2024-06-30,13734,4,60,B,N07E53,D,A,O


In [73]:
# Distinct values of the first classification letter in the new dataset.
df.loc[:, 'Zurich Class'].unique()

array(['A', 'B', 'C', 'D', 'H', 'E', 'F'], dtype=object)

In [74]:
# Distinct values of the second classification letter in the new dataset.
df.loc[:, 'Penumbra Class'].unique()

array(['X', 'R', 'S', 'A', 'H', 'K'], dtype=object)

In [75]:
# Distinct values of the third classification letter in the new dataset.
df.loc[:, 'Compactness Class'].unique()

array(['X', 'O', 'I', 'C'], dtype=object)

## NOAA dataset

In [49]:
# Fixed-width record layout of the historical sunspot file: column name -> field
# width in characters. The insertion order is the field order in each record,
# because the keys and values are passed pairwise to pd.read_fwf as names/widths.
# The widths add up to 80 characters per record. "blankN" entries are separator
# columns that are dropped right after loading.
sunspot_cols_and_widths = {
  "Data Code": 2,
  "Year": 2,  # two-digit year
  "Month": 2,
  "Day": 2,
  "blank1": 1,
  "Universal Time of Observation": 4,
  "blank2": 1,
  "Location": 6,
  "Mt Wilson Magnetic Classification": 4,
  "blank3": 1,
  "Max Magnetic Field Strength": 1,
  "blank4": 1,
  "Mount Wilson Spot Group Number": 5,
  "Subscript": 1,
  "NOAA Sunspot Group Number": 5, # NOAA/USAF sunspot group number
  "Subscript NOAA": 1,  # Used by WDC to subscript the NOAA/USAF sunspot group number, for example 4013, 4013A.
  "Zurich Class": 1,  # USAF only
  "Penumbra Class": 1,  # USAF only
  "Compactness Class": 1, # USAF only
  "blank5": 1,
  "Number of Spots": 2, # USAF only
  "blank6": 1,
  "Longitudinal Extent": 2, # In degrees | USAF only
  "Area in millionths of solar hemisphere": 4, # USAF only
  "blank7": 1,
  "Individual CMP Year": 2,
  "Individual CMP Month": 2,
  "Individual CMP Day": 4,
  "blank8": 1,
  "Regional CMP Year": 2,
  "Regional CMP Month": 2,
  "Regional CMP Day": 4,
  "blank9": 1,
  "Station Serial Number": 3,
  "blank10": 1,
  "Quality": 1, # 1=very poor, 2=poor, 3=fair, 4=good, 5=excellent
  "Station": 4, # 4-letter station abbreviation
}

In [79]:
# Parse the fixed-width file; column names and widths come pairwise from the
# layout dictionary, in its insertion order.
fwf_names, fwf_widths = map(list, zip(*sunspot_cols_and_widths.items()))
df_old_data = pd.read_fwf("../spots1981-2017.txt", header=None, widths=fwf_widths, names=fwf_names)

In [80]:
# Remove the separator columns (every column whose name contains 'blank')
# together with the fields that are not carried over into the merged dataset.
unused_columns = [
  *df_old_data.filter(like='blank').columns,
  'Data Code',
  'Max Magnetic Field Strength',
  'Subscript',
  'Individual CMP Year',
  'Individual CMP Month',
  'Individual CMP Day',
  'Regional CMP Year',
  'Regional CMP Month',
  'Regional CMP Day',
  'Station Serial Number',
  'Station',
  'Universal Time of Observation',
  'Mount Wilson Spot Group Number',
  'Subscript NOAA',
  'Quality',
  'Longitudinal Extent',
]
df_old_data = df_old_data.drop(columns=unused_columns)
df_old_data

Unnamed: 0,Year,Month,Day,Location,Mt Wilson Magnetic Classification,NOAA Sunspot Group Number,Zurich Class,Penumbra Class,Compactness Class,Number of Spots,Area in millionths of solar hemisphere
0,81,12,1,N13W46,BG,3478,E,K,I,32.0,600.0
1,81,12,1,N13W46,,3478,E,H,I,16.0,530.0
2,81,12,1,N12W47,BG,3478,E,H,O,22.0,710.0
3,81,12,1,N15W56,BG,3478,E,H,I,31.0,670.0
4,81,12,1,N13W55,BG,3478,E,H,I,17.0,430.0
...,...,...,...,...,...,...,...,...,...,...,...
271878,17,6,29,N18W35,B,12664,C,S,O,2.0,40.0
271879,17,6,29,N19W42,B,12664,C,S,O,3.0,60.0
271880,17,6,30,N15W45,B,12664,C,S,O,2.0,60.0
271881,17,6,30,N18W47,A,12664,H,S,X,1.0,30.0


In [81]:
# Distinct raw magnetic-classification strings in the historical data.
df_old_data.loc[:, 'Mt Wilson Magnetic Classification'].unique()

array(['BG', nan, '( B)', 'B', 'A', '(AP)', '(AF)', 'AP', '(BG)', '( D)',
       'BGD', '(BP)', 'AP)', 'AF', '(BF)', 'X', 'BD', 'BP', 'D', 'GD',
       '(B+)', '( S)', '( F)', '(FG)', 'G', '( X)', '(BA)', '( 3)',
       '( P)', 'BF', 'S', '( G)', '(B )', '(D )', '(AP', 'B)', '( B',
       'B )', '(B', 'BG)', '(BP', '(D)', '(B)', '(BG', 'D *', '(BD)',
       '(D)*', 'AF)', 'BF+', '((B', '(BF', 'B+', 'D*)', 'D*', '(DB)',
       'D %', 'D )', '(AF', '(X )', '(G )', '(BTG', '(D*)', 'BF)', '(D *',
       '(X', 'BP+', '(B +', 'X1)', 'D8)', 'B +', '(A )', 'X )', '(A',
       '(BP+', 'AP?', 'BF2+', '(R )', 'B2', '(B1)', '(B2)', '(D2)',
       '(D+)', 'BG*', 'D #', 'B('], dtype=object)

In [82]:
def remove_special_characters(input_string):
  """Reduce a magnetic-classification string to the letters A, B, G and D.

  Every other character is removed. Returns NaN when the input is missing or
  when no character is left after the removal.
  """
  if pd.isna(input_string):
    return np.nan
  kept_letters = re.sub(r'[^ABGD]', '', input_string)
  # An empty result carries no classification, so it is reported as missing.
  return kept_letters if kept_letters else np.nan

In [83]:
# Normalize the raw magnetic labels, then list the distinct values that remain.
normalized_magnetic = df_old_data['Mt Wilson Magnetic Classification'].map(remove_special_characters)
df_old_data['Mt Wilson Magnetic Classification'] = normalized_magnetic
normalized_magnetic.unique()

array(['BG', nan, 'B', 'A', 'D', 'BGD', 'BD', 'GD', 'G', 'BA', 'DB'],
      dtype=object)

In [84]:
# Build a proper 'Date' column from the separate year/month/day fields.
date_parts = df_old_data[['Year', 'Month', 'Day']].astype(int)
# Two-digit years: values above 50 are placed in the 1900s, the rest in the 2000s.
date_parts['Year'] = date_parts['Year'] + np.where(date_parts['Year'] > 50, 1900, 2000)
observation_date = pd.to_datetime(date_parts)

# Swap the three component columns for the combined date, placed first.
df_old_data = df_old_data.drop(columns=['Year', 'Month', 'Day'])
df_old_data.insert(0, 'Date', observation_date)
df_old_data

Unnamed: 0,Date,Location,Mt Wilson Magnetic Classification,NOAA Sunspot Group Number,Zurich Class,Penumbra Class,Compactness Class,Number of Spots,Area in millionths of solar hemisphere
0,1981-12-01,N13W46,BG,3478,E,K,I,32.0,600.0
1,1981-12-01,N13W46,,3478,E,H,I,16.0,530.0
2,1981-12-01,N12W47,BG,3478,E,H,O,22.0,710.0
3,1981-12-01,N15W56,BG,3478,E,H,I,31.0,670.0
4,1981-12-01,N13W55,BG,3478,E,H,I,17.0,430.0
...,...,...,...,...,...,...,...,...,...
271878,2017-06-29,N18W35,B,12664,C,S,O,2.0,40.0
271879,2017-06-29,N19W42,B,12664,C,S,O,3.0,60.0
271880,2017-06-30,N15W45,B,12664,C,S,O,2.0,60.0
271881,2017-06-30,N18W47,A,12664,H,S,X,1.0,30.0


In [85]:
# df_old_data['Concatenated'] = df_old_data['Column1'].astype(str) + df_old_data['Column2'].astype(str) + df_old_data['Column3'].astype(str)


In [86]:
# Distinct values of the first classification letter in the historical data.
df_old_data.loc[:, 'Zurich Class'].unique()

array(['E', nan, 'C', 'B', 'A', 'D', 'H', 'F', 'X', '?'], dtype=object)

In [87]:
# Distinct values of the second classification letter in the historical data.
df_old_data.loc[:, 'Penumbra Class'].unique()

array(['K', 'H', 'S', nan, 'A', 'X', 'R', 'O'], dtype=object)

In [88]:
# Distinct values of the third classification letter in the historical data.
df_old_data.loc[:, 'Compactness Class'].unique()

array(['I', 'O', nan, 'X', 'C', '?', '0'], dtype=object)

In [92]:
# Show the rows whose compactness class is the digit '0' rather than a letter.
has_zero_compactness = df_old_data['Compactness Class'] == '0'
df_old_data.loc[has_zero_compactness, ['Date', 'Zurich Class', 'Penumbra Class', 'Compactness Class']]

Unnamed: 0,Date,Zurich Class,Penumbra Class,Compactness Class
159667,1995-09-27,B,X,0


In [95]:
def mcintosh_class_fix(input_string):
  """Clean a single classification value.

  The digit '0' becomes the letter 'O', '?' becomes NaN, and any other value
  is returned unchanged.
  """
  if input_string == '?':
    return np.nan
  return 'O' if input_string == '0' else input_string

In [96]:
# Apply the same clean-up to each of the three classification columns,
# then re-check the distinct compactness values.
for class_column in ('Zurich Class', 'Penumbra Class', 'Compactness Class'):
  df_old_data[class_column] = df_old_data[class_column].map(mcintosh_class_fix)
df_old_data['Compactness Class'].unique()

array(['I', 'O', nan, 'X', 'C'], dtype=object)

In [98]:
# Column names of the new dataset, for comparison with the historical one.
df.columns

Index(['Date', 'Region', 'Sunspot Number', 'Size', 'Magnetic Classification',
       'Location', 'Zurich Class', 'Penumbra Class', 'Compactness Class'],
      dtype='object')

In [117]:
# Column names of the historical dataset.
# NOTE: the output saved with this cell already shows the renamed columns, so it
# was last executed after the rename cell that follows (execution count 117 vs 99).
df_old_data.columns

Index(['Date', 'Location', 'Magnetic Classification', 'Region', 'Zurich Class',
       'Penumbra Class', 'Compactness Class', 'Sunspot Number', 'Size'],
      dtype='object')

### Items that need to be cross-checked:
| New Data | Old Data                               |
|----------|----------------------------------------|
| Region   | NOAA Sunspot Group Number              |
| Size     | Area in millionths of solar hemisphere |

### Column name updates:
| New Data                    | Old Data                               |
|-----------------------------|----------------------------------------|
| [x] Region                  | NOAA Sunspot Group Number              |
| [x] Size                    | Area in millionths of solar hemisphere |
| [x] Sunspot Number          | Number of Spots                        |
| [x] Magnetic Classification | Mt Wilson Magnetic Classification      |

In [99]:
# Give the historical columns the names used by the new dataset.
old_to_new_names = {
  'NOAA Sunspot Group Number': 'Region',
  'Area in millionths of solar hemisphere': 'Size',
  'Number of Spots': 'Sunspot Number',
  'Mt Wilson Magnetic Classification': 'Magnetic Classification',
}
df_old_data = df_old_data.rename(columns=old_to_new_names)
df_old_data

Unnamed: 0,Date,Location,Magnetic Classification,Region,Zurich Class,Penumbra Class,Compactness Class,Sunspot Number,Size
0,1981-12-01,N13W46,BG,3478,E,K,I,32.0,600.0
1,1981-12-01,N13W46,,3478,E,H,I,16.0,530.0
2,1981-12-01,N12W47,BG,3478,E,H,O,22.0,710.0
3,1981-12-01,N15W56,BG,3478,E,H,I,31.0,670.0
4,1981-12-01,N13W55,BG,3478,E,H,I,17.0,430.0
...,...,...,...,...,...,...,...,...,...
271878,2017-06-29,N18W35,B,12664,C,S,O,2.0,40.0
271879,2017-06-29,N19W42,B,12664,C,S,O,3.0,60.0
271880,2017-06-30,N15W45,B,12664,C,S,O,2.0,60.0
271881,2017-06-30,N18W47,A,12664,H,S,X,1.0,30.0


In [118]:
# (rows, columns) of the historical data before rows without a spot count are removed.
df_old_data.shape

(271883, 9)

In [119]:
# Keep only the rows that have a spot count, then report the new shape.
df_old_data = df_old_data.dropna(subset=['Sunspot Number'])
df_old_data.shape

(235038, 9)

# Cross checking

In [125]:
# Put the historical columns in the same order as the new dataset's columns.
df_old_data = df_old_data.loc[:, df.columns]

In [106]:
# First and last year covered by the new dataset.
df['Date'].dt.year.describe().loc[['min', 'max']]

min    1996.0
max    2024.0
Name: Date, dtype: float64

In [105]:
# First and last year covered by the historical dataset.
df_old_data['Date'].dt.year.describe().loc[['min', 'max']]

min    1981.0
max    2017.0
Name: Date, dtype: float64

In [130]:
# Rows of the new dataset observed on 4 June 1996. The date is written in ISO
# form (YYYY-MM-DD) so the match does not depend on how an ambiguous
# day/month order would be parsed.
df[df['Date'] == pd.to_datetime('1996-06-04')]

Unnamed: 0,Date,Region,Sunspot Number,Size,Magnetic Classification,Location,Zurich Class,Penumbra Class,Compactness Class
0,1996-06-04,7968,3,0,A,N02E41,A,X,X


In [131]:
# Rows of the historical dataset observed on 4 June 1996. The date is written in
# ISO form (YYYY-MM-DD) so the match does not depend on how an ambiguous
# day/month order would be parsed.
df_old_data[df_old_data['Date'] == pd.to_datetime('1996-06-04')]

Unnamed: 0,Date,Region,Sunspot Number,Size,Magnetic Classification,Location,Zurich Class,Penumbra Class,Compactness Class
160885,1996-06-04,7969,1.0,,A,N09W28,A,X,
160900,1996-06-04,7968,2.0,10.0,B,N02E40,C,S,O
160901,1996-06-04,7968,3.0,10.0,B,N02E37,C,R,O
160903,1996-06-04,7968,2.0,10.0,B,N03E32,B,X,O
160904,1996-06-04,7968,3.0,10.0,B,N03E31,B,X,O
160905,1996-06-04,7968,1.0,10.0,A,N01E30,A,X,


In [127]:
# Day-of-month of the historical rows matching '02-01-2016'; presumably a check
# of which part of the ambiguous literal is taken as the day.
matches_literal = df_old_data['Date'] == pd.to_datetime('02-01-2016')
df_old_data.loc[matches_literal, 'Date'].dt.day

268887    1
268888    1
268889    1
268890    1
268891    1
268892    1
268893    1
268894    1
268895    1
Name: Date, dtype: int32

In [132]:
# Month of the new-dataset rows matching '06-04-1996'; presumably a check of
# which part of the ambiguous literal is taken as the month.
matches_literal = df['Date'] == pd.to_datetime('06-04-1996')
df.loc[matches_literal, 'Date'].dt.month

0    6
Name: Date, dtype: int32

In [138]:
# Distinct size values that occur in the new dataset.
df.loc[:, 'Size'].unique()

array([   0,   10,   20,   50,   80,   90,   60,   40,   70,  100,   30,
        280,  250,  400,  420,  390,  300,  200,  340,  360,  410,  320,
        290,  220,  140,  130,  120,  170,  210,  880,  790,  830,  690,
        580,  570,  450,  180,  110,  150,  160,  230,  460,  540,  650,
        710,  270,  610,  600,  240,  470,  380,  330,  260,  660, 1000,
        900,  760,  630,  480,  350,  680,  940,  550,  190,  590,  310,
        520,  370,  670,  800,  780,  720,  770,  510,  530,  430,  440,
        620,  500,  560,  490,  740,  890, 1230, 1430, 1460, 1400, 1180,
        970,  640,  730,  870, 1370, 1080,  950,  700,  850, 1220,  980,
        920,  820,  930, 1190, 1280, 1120,  910,  750, 1040, 1050, 1100,
        810,  990, 1030,  840, 1010, 1210,  860, 1060, 1090, 1020, 1140,
       1240, 2140, 1940, 1910, 1880, 1970, 1930, 1450, 1590, 2240, 2440,
       2000, 1700, 1810, 1070, 1110,    1,  960, 1130, 1410, 1420, 1150,
          5,  175, 1350, 1170, 1520, 1750, 1950, 19

# Take the NOAA data from before 1996-06-04 (the first date covered by the new dataset)

In [147]:
# Historical rows dated strictly before 1996-06-04.
is_before_cutoff = df_old_data['Date'] < '1996-06-04'
old_data_before_1996 = df_old_data.loc[is_before_cutoff]
old_data_before_1996

Unnamed: 0,Date,Region,Sunspot Number,Size,Magnetic Classification,Location,Zurich Class,Penumbra Class,Compactness Class
0,1981-12-01,3478,32.0,600.0,BG,N13W46,E,K,I
1,1981-12-01,3478,16.0,530.0,,N13W46,E,H,I
2,1981-12-01,3478,22.0,710.0,BG,N12W47,E,H,O
3,1981-12-01,3478,31.0,670.0,BG,N15W56,E,H,I
4,1981-12-01,3478,17.0,430.0,BG,N13W55,E,H,I
...,...,...,...,...,...,...,...,...,...
160892,1996-06-01,7967,1.0,,A,N36E30,A,X,
160893,1996-06-01,7967,1.0,,A,N33E32,A,X,
160894,1996-06-02,7967,1.0,,A,N36E25,A,X,
160895,1996-06-02,7967,1.0,,A,N37E22,A,X,


In [148]:
# Collapse the historical rows to one row per (Date, Region):
#   - spot count: the largest reported value
#   - size: the mean of the reported values, counting a missing value as 0
#   - every other field: the first non-missing value in the group
per_region_rules = {
  'Sunspot Number': 'max',
  'Size': lambda sizes: sizes.fillna(0).mean(),
  'Magnetic Classification': 'first',
  'Location': 'first',
  'Zurich Class': 'first',
  'Penumbra Class': 'first',
  'Compactness Class': 'first',
}
old_data_before_1996_aggregated = (
  old_data_before_1996
  .groupby(['Date', 'Region'])
  .agg(per_region_rules)
  .reset_index()
)
old_data_before_1996_aggregated

Unnamed: 0,Date,Region,Sunspot Number,Size,Magnetic Classification,Location,Zurich Class,Penumbra Class,Compactness Class
0,1981-12-01,3477,70.0,541.666667,BG,S11E06,F,K,I
1,1981-12-01,3478,32.0,535.000000,BG,N13W46,E,K,I
2,1981-12-01,3480,4.0,26.000000,B,S22W37,C,A,O
3,1981-12-01,3481,8.0,33.333333,B,S15E28,C,R,O
4,1981-12-01,3484,35.0,203.333333,BG,N16W10,D,A,I
...,...,...,...,...,...,...,...,...,...
32453,1996-05-22,7964,1.0,0.000000,A,S37E58,A,X,
32454,1996-05-23,7964,2.0,10.000000,B,S11E66,B,X,O
32455,1996-06-01,7967,1.0,0.000000,A,N35E33,A,X,
32456,1996-06-02,7967,1.0,0.000000,A,N36E25,A,X,


In [150]:
# Prepend the aggregated historical rows to the new dataset.
# Only rows of `df` dated on or after 1996-06-04 are kept before concatenating:
# the historical subset covers the dates before that day, so on a first run this
# keeps every row, and on a re-run (when `df` already contains the historical
# rows) it prevents them from being appended a second time.
new_data_only = df[df['Date'] >= '1996-06-04']
df = pd.concat([old_data_before_1996_aggregated, new_data_only], ignore_index=True)
df

Unnamed: 0,Date,Region,Sunspot Number,Size,Magnetic Classification,Location,Zurich Class,Penumbra Class,Compactness Class
0,1981-12-01,3477,70.0,541.666667,BG,S11E06,F,K,I
1,1981-12-01,3478,32.0,535.000000,BG,N13W46,E,K,I
2,1981-12-01,3480,4.0,26.000000,B,S22W37,C,A,O
3,1981-12-01,3481,8.0,33.333333,B,S15E28,C,R,O
4,1981-12-01,3484,35.0,203.333333,BG,N16W10,D,A,I
...,...,...,...,...,...,...,...,...,...
72314,2024-06-30,13731,1.0,10.000000,A,S16E21,A,X,X
72315,2024-06-30,13732,4.0,30.000000,B,S18W14,C,A,O
72316,2024-06-30,13733,4.0,40.000000,B,N05E13,D,A,O
72317,2024-06-30,13734,4.0,60.000000,B,N07E53,D,A,O


In [151]:
# Round the two numeric columns to whole numbers and store them as integers.
for numeric_column in ('Sunspot Number', 'Size'):
  df[numeric_column] = df[numeric_column].round().astype(int)
df

Unnamed: 0,Date,Region,Sunspot Number,Size,Magnetic Classification,Location,Zurich Class,Penumbra Class,Compactness Class
0,1981-12-01,3477,70,542,BG,S11E06,F,K,I
1,1981-12-01,3478,32,535,BG,N13W46,E,K,I
2,1981-12-01,3480,4,26,B,S22W37,C,A,O
3,1981-12-01,3481,8,33,B,S15E28,C,R,O
4,1981-12-01,3484,35,203,BG,N16W10,D,A,I
...,...,...,...,...,...,...,...,...,...
72314,2024-06-30,13731,1,10,A,S16E21,A,X,X
72315,2024-06-30,13732,4,30,B,S18W14,C,A,O
72316,2024-06-30,13733,4,40,B,N05E13,D,A,O
72317,2024-06-30,13734,4,60,B,N07E53,D,A,O


In [152]:
# Write the merged dataset to CSV without the row index.
processed_csv = '../data/sunspot-processed.csv'
df.to_csv(processed_csv, index=False)