In [11]:
import numpy as np
import pandas as pd

from utils.data import *
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score

In [32]:
df_sunspot = pd.read_csv("data/sunspot-processed-nonull-R.csv")
df_flares = pd.read_csv("data/flares-processed-nonull-R.csv")

In [33]:
df_flares['Date'] = pd.to_datetime(df_flares[['Year', 'Month', 'Day']])

# df_flares['Start'] = pd.to_datetime(df_flares['Start'])
# df_flares['End'] = pd.to_datetime(df_flares['End'])
df_flares['Maximum'] = pd.to_datetime(df_flares['Maximum'])

# df_flares['Start_hour'] = df_flares['Start'].dt.hour
# df_flares['Start_minute'] = df_flares['Start'].dt.minute
df_flares['Maximum_hour'] = df_flares['Maximum'].dt.hour
df_flares['Maximum_minute'] = df_flares['Maximum'].dt.minute

df_flares.drop(columns=['Start', 'End',
                        'Year', 'Month', 'Day'], inplace=True)
df_flares.columns

Index(['Region', 'X-ray class', 'X-ray intensity', 'Maximum', 'Date',
       'Maximum_hour', 'Maximum_minute'],
      dtype='object')

In [34]:
get_null_counts(df_sunspot)

Size                 13624
Zurich Class           121
Penumbra Class         183
Compactness Class    39266
dtype: int64

In [35]:
df_flares['X-ray class'].unique()

array(['M', 'C', 'X', 'B', 'A'], dtype=object)

In [36]:
def convert_log_intensity(_df, isLog=True):
  _df['X-ray class'] = _df['X-ray class'].map({
    'A': 1e-7,
    'B': 1e-6,
    'C': 1e-5,
    'M': 1e-4,
    'X': 1e-3,
  })
  _df['Intensity'] = _df['X-ray class'] * _df['X-ray intensity']
  if isLog:
    # Adding a small constant to avoid log(0)
    _df['Log_intensity'] = np.log10(_df['Intensity'] + 1e-12)
  return _df

In [37]:
df_flares = convert_log_intensity(df_flares)
df_flares.columns

Index(['Region', 'X-ray class', 'X-ray intensity', 'Maximum', 'Date',
       'Maximum_hour', 'Maximum_minute', 'Intensity', 'Log_intensity'],
      dtype='object')

In [38]:
X = df_flares['Date']
y_int = df_flares['Log_intensity']
y_mx_hr = df_flares['Maximum_hour']
y_mx_mn = df_flares['Maximum_minute']
y_mx = df_flares['Maximum']

X_train_int, X_test_int, y_train_int, y_test_int = train_test_split(X, y_int,
                                                                    test_size=0.2, random_state=42)
X_train_mxhr, X_test_mxhr, y_train_mxhr, y_test_mxhr = train_test_split(X, y_mx_hr,
                                                                        test_size=0.2, random_state=42)
X_train_mxmn, X_test_mxmn, y_train_mxmn, y_test_mxmn = train_test_split(X, y_mx_mn,
                                                                        test_size=0.2, random_state=42)
X_train_mx, X_test_mx, y_train_mx, y_test_mx = train_test_split(X, y_mx,
                                                                test_size=0.2, random_state=42)

In [39]:
X_train_int

25285   2004-02-25
22732   2002-07-06
23293   2002-10-21
38997   2021-10-27
41970   2022-12-22
           ...    
11284   1991-02-05
44732   2023-12-06
38158   2021-05-30
860     1982-05-30
15795   1993-12-24
Name: Date, Length: 35987, dtype: datetime64[ns]

In [43]:
model_int_r = RandomForestRegressor()
# model_int_f = RandomForestClassifier()

model_int_r.fit(np.array(X_train_int).reshape(-1,1), y_train_int)
# model_int_f.fit(np.array(X_train_int).reshape(-1,1), y_train_int)

In [45]:
y_pred_int = model_int_r.predict(np.array(X_test_int).reshape(-1, 1))

In [48]:
mean_squared_error(y_test_int, y_pred_int)

0.17486792340719878

In [49]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer

In [60]:
minmax_scaler = MinMaxScaler()
std_scaler = StandardScaler()
norm_scaler = Normalizer()

_tmp_flares = df_flares[['Date', 'Log_intensity', 'Maximum_hour', 'Maximum_minute', 'Maximum']]
_tmp_flares['Date'] = _tmp_flares['Date'].apply(lambda x: x.timestamp())
_tmp_flares['Maximum'] = _tmp_flares['Maximum'].apply(lambda x: x.timestamp())
df_norm_minmax = pd.DataFrame(minmax_scaler.fit_transform(_tmp_flares), columns=_tmp_flares.columns)
df_std_minmax = pd.DataFrame(std_scaler.fit_transform(_tmp_flares), columns=_tmp_flares.columns)
df_norm = pd.DataFrame(norm_scaler.fit_transform(_tmp_flares), columns=_tmp_flares.columns)

X1 = df_norm_minmax['Date']
y_int1 = df_norm_minmax['Log_intensity']

X2 = df_std_minmax['Date']
y_int2 = df_std_minmax['Log_intensity']

X3 = df_norm['Date']
y_int3 = df_norm['Log_intensity']

X_tr_int1, X_ts_int1, y_tr_int1, y_ts_int1 = train_test_split(X1, y_int1,
                                                          test_size=0.2, random_state=42)
X_tr_int2, X_ts_int2, y_tr_int2, y_ts_int2 = train_test_split(X2, y_int2,
                                                          test_size=0.2, random_state=42)
X_tr_int3, X_ts_int3, y_tr_int3, y_ts_int3 = train_test_split(X3, y_int3,
                                                          test_size=0.2, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _tmp_flares['Date'] = _tmp_flares['Date'].apply(lambda x: x.timestamp())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _tmp_flares['Maximum'] = _tmp_flares['Maximum'].apply(lambda x: x.timestamp())


### StandardScaler has 50% accuracy

In [61]:
model_int_norm_r2 = RandomForestRegressor()
model_int_norm_r2.fit(np.array(X_tr_int2).reshape(-1,1), y_tr_int2)
y_prd_int_norm2 = model_int_norm_r2.predict(np.array(X_ts_int2).reshape(-1, 1))
mean_squared_error(y_ts_int2, y_prd_int_norm2)

0.5091529937896666