In [1]:
import pandas as pd
import sklearn 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Load the solar flare catalogue from the CSV next to the notebook and
# preview its first five rows.
df = pd.read_csv(filepath_or_buffer='solar_flare.csv')
df.head(n=5)

Unnamed: 0,flare,start.date,start.time,peak,end,duration.s,peak.c/s,total.counts,energy.kev,x.pos.asec,y.pos.asec,radial,active.region.ar,flag.1,flag.2,flag.3,flag.4,flag.5
0,2021213,2002-02-12,21:29:56,21:33:38,21:41:48,712,136,167304.0,12-25,592,-358,692,0,A1,P1,,,
1,2021228,2002-02-12,21:44:08,21:45:06,21:48:56,288,7,9504.0,6-12,604,-341,694,9811,A1,P1,PE,Q1,
2,2021332,2002-02-13,00:53:24,00:54:54,00:57:00,216,15,11448.0,6-12,-310,375,487,9825,A1,P1,,,
3,2021308,2002-02-13,04:22:52,04:23:50,04:26:56,244,20,17400.0,12-25,-277,378,469,9822,A1,P1,,,
4,2021310,2002-02-13,07:03:52,07:05:14,07:07:48,236,336,313392.0,25-50,-272,390,476,9825,A1,GS,P1,PE,Q2


In [3]:
# Print each column's dtype and non-null count for the frame as loaded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116143 entries, 0 to 116142
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   flare             116143 non-null  int64  
 1   start.date        116143 non-null  object 
 2   start.time        116143 non-null  object 
 3   peak              116143 non-null  object 
 4   end               116143 non-null  object 
 5   duration.s        116143 non-null  int64  
 6   peak.c/s          116143 non-null  int64  
 7   total.counts      116143 non-null  float64
 8   energy.kev        116143 non-null  object 
 9   x.pos.asec        116143 non-null  int64  
 10  y.pos.asec        116143 non-null  int64  
 11  radial            116143 non-null  int64  
 12  active.region.ar  116143 non-null  int64  
 13  flag.1            116143 non-null  object 
 14  flag.2            116143 non-null  object 
 15  flag.3            96236 non-null   object 
 16  flag.4            95

In [4]:
# Count the missing values in every column.
df.isna().sum()

flare                   0
start.date              0
start.time              0
peak                    0
end                     0
duration.s              0
peak.c/s                0
total.counts            0
energy.kev              0
x.pos.asec              0
y.pos.asec              0
radial                  0
active.region.ar        0
flag.1                  0
flag.2                  0
flag.3              19907
flag.4              20202
flag.5              61180
dtype: int64

In [5]:
# Discard the flare identifier and the five flag columns.
unused_columns = ['flare', 'flag.1', 'flag.2', 'flag.3', 'flag.4', 'flag.5']
df = df.drop(columns=unused_columns)

In [6]:
# Re-check the schema after dropping the flare id and flag columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116143 entries, 0 to 116142
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   start.date        116143 non-null  object 
 1   start.time        116143 non-null  object 
 2   peak              116143 non-null  object 
 3   end               116143 non-null  object 
 4   duration.s        116143 non-null  int64  
 5   peak.c/s          116143 non-null  int64  
 6   total.counts      116143 non-null  float64
 7   energy.kev        116143 non-null  object 
 8   x.pos.asec        116143 non-null  int64  
 9   y.pos.asec        116143 non-null  int64  
 10  radial            116143 non-null  int64  
 11  active.region.ar  116143 non-null  int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 10.6+ MB


In [7]:
# Turn the time-of-day columns into full timestamps by prefixing the date on
# which the flare started.
flare_start = pd.to_datetime(df['start.date'] + ' ' + df['start.time'])
df['start.time'] = flare_start

# Only the start date is recorded, so for a flare that runs past midnight the
# peak/end time-of-day belongs to the following day. Stamping it with
# start.date would place it *before* the start; roll such values forward by
# one day. (Assumes no flare lasts 24 hours or more.)
for time_col in ('peak', 'end'):
    stamped = pd.to_datetime(df['start.date'] + ' ' + df[time_col])
    df[time_col] = stamped.where(stamped >= flare_start,
                                 stamped + pd.Timedelta(days=1))

df.head()

Unnamed: 0,start.date,start.time,peak,end,duration.s,peak.c/s,total.counts,energy.kev,x.pos.asec,y.pos.asec,radial,active.region.ar
0,2002-02-12,2002-02-12 21:29:56,2002-02-12 21:33:38,2002-02-12 21:41:48,712,136,167304.0,12-25,592,-358,692,0
1,2002-02-12,2002-02-12 21:44:08,2002-02-12 21:45:06,2002-02-12 21:48:56,288,7,9504.0,6-12,604,-341,694,9811
2,2002-02-13,2002-02-13 00:53:24,2002-02-13 00:54:54,2002-02-13 00:57:00,216,15,11448.0,6-12,-310,375,487,9825
3,2002-02-13,2002-02-13 04:22:52,2002-02-13 04:23:50,2002-02-13 04:26:56,244,20,17400.0,12-25,-277,378,469,9822
4,2002-02-13,2002-02-13 07:03:52,2002-02-13 07:05:14,2002-02-13 07:07:48,236,336,313392.0,25-50,-272,390,476,9825


In [8]:
# The date is now embedded in the timestamp columns, so the separate
# start.date column is removed.
df = df.drop(columns=['start.date'])

In [9]:
# Restrict the analysis to flares that started on or after 1 January 2010.
# .copy() yields an independent frame, so the column assignments performed
# further down never operate on a slice of the original frame
# (avoids SettingWithCopyWarning).
df = df[df['start.time'] >= pd.Timestamp('2010-01-01')].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 66565 entries, 49578 to 116142
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   start.time        66565 non-null  datetime64[ns]
 1   peak              66565 non-null  datetime64[ns]
 2   end               66565 non-null  datetime64[ns]
 3   duration.s        66565 non-null  int64         
 4   peak.c/s          66565 non-null  int64         
 5   total.counts      66565 non-null  float64       
 6   energy.kev        66565 non-null  object        
 7   x.pos.asec        66565 non-null  int64         
 8   y.pos.asec        66565 non-null  int64         
 9   radial            66565 non-null  int64         
 10  active.region.ar  66565 non-null  int64         
dtypes: datetime64[ns](3), float64(1), int64(6), object(1)
memory usage: 6.1+ MB


In [10]:
# energy.kev holds a "low-high" band in keV; break it into its two bounds.
kev_bounds = df['energy.kev'].str.split('-', expand=True)

# keV -> eV, stored as whole numbers.
ev_bounds = (kev_bounds.astype(float) * 1000).astype(int)
df[['energy_low_ev', 'energy_high_ev']] = ev_bounds

# The textual band is redundant once both numeric bounds exist.
df = df.drop(columns=['energy.kev'])

In [11]:
# Preview the frame with the new energy_low_ev / energy_high_ev columns.
df.head()

Unnamed: 0,start.time,peak,end,duration.s,peak.c/s,total.counts,x.pos.asec,y.pos.asec,radial,active.region.ar,energy_low_ev,energy_high_ev
49578,2010-01-01 09:47:20,2010-01-01 09:48:06,2010-01-01 09:52:48,328,136,123696.0,0,0,0,0,3000,6000
49579,2010-01-01 15:49:04,2010-01-01 15:50:34,2010-01-01 15:50:40,96,88,27360.0,0,0,0,0,3000,6000
49580,2010-01-02 00:19:36,2010-01-02 00:20:50,2010-01-02 00:20:56,80,92,24192.0,0,0,0,0,3000,6000
49581,2010-01-02 07:26:04,2010-01-02 07:26:30,2010-01-02 07:32:32,388,144,154632.0,532,-399,666,1039,6000,12000
49582,2010-01-02 07:32:32,2010-01-02 07:35:50,2010-01-02 07:42:04,572,76,153600.0,518,-402,656,1039,6000,12000


In [12]:
# Print dtypes and non-null counts again to check the derived energy columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 66565 entries, 49578 to 116142
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   start.time        66565 non-null  datetime64[ns]
 1   peak              66565 non-null  datetime64[ns]
 2   end               66565 non-null  datetime64[ns]
 3   duration.s        66565 non-null  int64         
 4   peak.c/s          66565 non-null  int64         
 5   total.counts      66565 non-null  float64       
 6   x.pos.asec        66565 non-null  int64         
 7   y.pos.asec        66565 non-null  int64         
 8   radial            66565 non-null  int64         
 9   active.region.ar  66565 non-null  int64         
 10  energy_low_ev     66565 non-null  int64         
 11  energy_high_ev    66565 non-null  int64         
dtypes: datetime64[ns](3), float64(1), int64(8)
memory usage: 6.6 MB


In [13]:
# Standardize every remaining column; the three timestamp columns are left
# out of the matrix handed to the scaler.
# NOTE: despite its name, scaled_data holds the *unscaled* inputs; the
# standardized values live in scaled_array.
timestamp_cols = ['start.time', 'peak', 'end']
scaled_data = df.drop(columns=timestamp_cols)

scaler = StandardScaler()
scaled_array = scaler.fit_transform(scaled_data)

print(scaled_array)

[[-0.37724476 -0.09456046 -0.11506527 ... -1.3224924  -0.66563931
  -0.53156433]
 [-0.91710013 -0.16972581 -0.17968162 ... -1.3224924  -0.66563931
  -0.53156433]
 [-0.95433153 -0.16346203 -0.18180652 ... -1.3224924  -0.66563931
  -0.53156433]
 ...
 [-0.79609806 -0.28403978 -0.19654132 ...  1.67727614 -0.23060891
  -0.199153  ]
 [-0.59132534 -0.28247383 -0.19581156 ...  1.67838757 -0.23060891
  -0.199153  ]
 [ 0.00437713 -0.24175927 -0.18742463 ...  1.67838757 -0.23060891
  -0.199153  ]]
