### Part 1: Step 1

In [4]:
import datetime
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [6]:
# Download the "top 50 solar flares" page and keep its first <table> element.
# NOTE: pd.read_html(<url>) would probably be the fastest way to do this, but
# it gave an error when tried, so the page is fetched with requests instead.
TOP_FLARES_URL = 'https://www.spaceweatherlive.com/en/solar-activity/top-50-solar-flares'

# A timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(TOP_FLARES_URL, timeout=30)
# Stop here with the HTTP status rather than parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
table = soup.find('table')
if table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError in a later cell.
    raise ValueError('No <table> element found at ' + TOP_FLARES_URL)

In [8]:
def souptable(table):
    """Yield one list of cell texts for every ``<tr>`` found in ``table``.

    Only ``<td>`` cells are collected, so a row that has no ``<td>``
    elements yields an empty list.
    """
    for tr in table.find_all('tr'):
        cell_texts = []
        for td in tr.find_all('td'):
            cell_texts.append(td.text)
        yield cell_texts

# Materialise every row yielded by souptable into a list of lists.
tables = list(souptable(table))

In [29]:
# Build the frame from the scraped rows and leave out the first row
# (positional slice, so the remaining index labels start at 1).
df = pd.DataFrame(tables).iloc[1:]
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7
1,1,X28.0,2003/11/04,486,19:29,19:53,20:06,MovieView archive
2,2,X20.0,2001/04/02,9393,21:32,21:51,22:03,MovieView archive
3,3,X17.2,2003/10/28,486,09:51,11:10,11:24,MovieView archive
4,4,X17.0,2005/09/07,808,17:17,17:40,18:03,MovieView archive
5,5,X14.4,2001/04/15,9415,13:19,13:50,13:55,MovieView archive


In [30]:
# Give the eight positional columns descriptive names, in table order.
flare_column_names = [
    'rank',
    'x_classification',
    'date',
    'region',
    'start_time',
    'maximum_time',
    'end_time',
    'movie',
]
df.columns = flare_column_names
df.head()

Unnamed: 0,rank,x_classification,date,region,start_time,maximum_time,end_time,movie
1,1,X28.0,2003/11/04,486,19:29,19:53,20:06,MovieView archive
2,2,X20.0,2001/04/02,9393,21:32,21:51,22:03,MovieView archive
3,3,X17.2,2003/10/28,486,09:51,11:10,11:24,MovieView archive
4,4,X17.0,2005/09/07,808,17:17,17:40,18:03,MovieView archive
5,5,X14.4,2001/04/15,9415,13:19,13:50,13:55,MovieView archive


In [35]:
# The 'movie' column only carries link text (e.g. 'MovieView archive'), so it
# is discarded. Reassigning with errors='ignore' (instead of inplace=True)
# lets this cell be re-run without a KeyError once the column is gone.
df = df.drop('movie', axis=1, errors='ignore')
df.head()

Unnamed: 0,rank,x_classification,date,region,start_time,maximum_time,end_time
1,1,X28.0,2003/11/04,486,19:29,19:53,20:06
2,2,X20.0,2001/04/02,9393,21:32,21:51,22:03
3,3,X17.2,2003/10/28,486,09:51,11:10,11:24
4,4,X17.0,2005/09/07,808,17:17,17:40,18:03
5,5,X14.4,2001/04/15,9415,13:19,13:50,13:55


In [37]:
# NOTE(review): this rebinds the name `datetime` from the module imported at
# the top of the notebook to the class of the same name. Nothing in this cell
# uses it; it is kept in case a later cell does.
from datetime import datetime

# Each row has a single date plus three clock times; combine them pairwise.
date_prefix = df['date'] + " "
start_dt = pd.to_datetime(date_prefix + df['start_time'])
maximum_dt = pd.to_datetime(date_prefix + df['maximum_time'])
end_dt = pd.to_datetime(date_prefix + df['end_time'])

# All three times reuse the same date, so an event that runs past midnight
# would get a peak/end earlier than its start. Roll those forward one day so
# start <= maximum <= end always holds.
# NOTE(review): assumes the listed date is the date of the start time - confirm.
one_day = pd.Timedelta(days=1)
maximum_dt = maximum_dt.where(maximum_dt >= start_dt, maximum_dt + one_day)
end_dt = end_dt.where(end_dt >= maximum_dt, end_dt + one_day)

df['Start_datetime'] = start_dt
df['Maximum_datetime'] = maximum_dt
df['End_datetime'] = end_dt

# The separate date/time strings are now redundant.
df = df.drop(['date', 'start_time', 'maximum_time', 'end_time'], axis=1)
df.head()

Unnamed: 0,rank,x_classification,region,Start_datetime,Maximum_datetime,End_datetime
1,1,X28.0,486,2003-11-04 19:29:00,2003-11-04 19:53:00,2003-11-04 20:06:00
2,2,X20.0,9393,2001-04-02 21:32:00,2001-04-02 21:51:00,2001-04-02 22:03:00
3,3,X17.2,486,2003-10-28 09:51:00,2003-10-28 11:10:00,2003-10-28 11:24:00
4,4,X17.0,808,2005-09-07 17:17:00,2005-09-07 17:40:00,2005-09-07 18:03:00
5,5,X14.4,9415,2001-04-15 13:19:00,2001-04-15 13:50:00,2001-04-15 13:55:00


### Step 3: Scrape the NASA Data

In [104]:
# Download the NASA page and reduce it to plain text lines.
NASA_URL = 'https://cdaw.gsfc.nasa.gov/CME_list/radio/waves_type2.html'
nasa_response = requests.get(NASA_URL, timeout=30)
# Stop here with the HTTP status rather than parsing an error page.
nasa_response.raise_for_status()
text = nasa_response.text
bs = BeautifulSoup(text, "html.parser")
lines = bs.text.split('\n')

# Data rows begin with "YYYY/MM/DD HH:MM MM/DD"; every other line is skipped.
# Raw string: '\d' inside a normal string literal is an invalid escape sequence.
regExpression = re.compile(r'^\d{4}/\d{2}/\d{2} \d{2}:\d{2} \d{2}/\d{2}')
linesWithData = [line for line in lines if regExpression.match(line)]
nasaDF = pd.DataFrame([line.split() for line in linesWithData])

# Only the first 15 whitespace-separated fields are named and kept. Selecting
# them by position works however many trailing tokens the longest row has,
# unlike dropping a hard-coded range of column labels.
nasa_column_names = ['start_date', 'start_time', 'end_date', 'end_time', 'start_freq', 'end_freq',
                     'loc', 'region', 'importance', 'cme_date', 'cme_time', 'cpa',
                     'cme_width', 'cme_speed', 'phtx']
# .copy() so that later cells can add/modify columns on an independent frame.
nasaDF2 = nasaDF.iloc[:, :len(nasa_column_names)].copy()
nasaDF2.columns = nasa_column_names
nasaDF2.head()

Unnamed: 0,start_date,start_time,end_date,end_time,start_freq,end_freq,loc,region,importance,cme_date,cme_time,cpa,cme_width,cme_speed,phtx
0,1997/04/01,14:00,04/01,14:15,8000,4000,S25E16,8026,M1.3,04/01,15:18,74,79,312,PHTX
1,1997/04/07,14:30,04/07,17:30,11000,1000,S28E19,8027,C6.8,04/07,14:27,Halo,360,878,PHTX
2,1997/05/12,05:15,05/14,16:00,12000,80,N21W08,8038,C1.3,05/12,05:30,Halo,360,464,PHTX
3,1997/05/21,20:20,05/21,22:00,5000,500,N05W12,8040,M1.3,05/21,21:00,263,165,296,PHTX
4,1997/09/23,21:53,09/23,22:16,6000,2000,S29E25,8088,C1.4,09/23,22:02,133,155,712,PHTX


In [105]:
def setNaN(x):
    """Replace a dash-only placeholder with the string 'NaN'.

    The placeholders recognised are '----', '-----', '--/--' and '--:--'.
    Any other value is returned unchanged. Note that the replacement is the
    text 'NaN', not a floating-point NaN.
    """
    placeholders = ('----', '-----', '--/--', '--:--')
    return 'NaN' if x in placeholders else x
# Run setNaN over every individual cell of the frame (element-wise).
nasaDF2 = nasaDF2.applymap(setNaN)
nasaDF2.head()

Unnamed: 0,start_date,start_time,end_date,end_time,start_freq,end_freq,loc,region,importance,cme_date,cme_time,cpa,cme_width,cme_speed,phtx
0,1997/04/01,14:00,04/01,14:15,8000,4000,S25E16,8026,M1.3,04/01,15:18,74,79,312,PHTX
1,1997/04/07,14:30,04/07,17:30,11000,1000,S28E19,8027,C6.8,04/07,14:27,Halo,360,878,PHTX
2,1997/05/12,05:15,05/14,16:00,12000,80,N21W08,8038,C1.3,05/12,05:30,Halo,360,464,PHTX
3,1997/05/21,20:20,05/21,22:00,5000,500,N05W12,8040,M1.3,05/21,21:00,263,165,296,PHTX
4,1997/09/23,21:53,09/23,22:16,6000,2000,S29E25,8088,C1.4,09/23,22:02,133,155,712,PHTX


In [87]:
# Flag the rows whose 'cpa' field reads 'Halo', then replace that text with
# the same 'NaN' placeholder string used for other missing fields.
halo_mask = nasaDF2['cpa'] == 'Halo'
if 'is_halo' in nasaDF2.columns:
    # On a re-run 'Halo' has already been overwritten, so the comparison above
    # is all False; keep the flags recorded by the earlier run.
    halo_mask = halo_mask | nasaDF2['is_halo'].astype(bool)
nasaDF2['is_halo'] = halo_mask
nasaDF2.loc[halo_mask, 'cpa'] = 'NaN'
nasaDF2.head()

Unnamed: 0,start_date,start_time,end_date,end_time,start_freq,end_freq,loc,region,importance,cme_date,cme_time,cpa,cme_width,cme_speed,phtx,is_halo
0,1997/04/01,14:00,04/01,14:15,8000,4000,S25E16,8026,M1.3,04/01,15:18,74.0,79,312,PHTX,False
1,1997/04/07,14:30,04/07,17:30,11000,1000,S28E19,8027,C6.8,04/07,14:27,,360,878,PHTX,True
2,1997/05/12,05:15,05/14,16:00,12000,80,N21W08,8038,C1.3,05/12,05:30,,360,464,PHTX,True
3,1997/05/21,20:20,05/21,22:00,5000,500,N05W12,8040,M1.3,05/21,21:00,263.0,165,296,PHTX,False
4,1997/09/23,21:53,09/23,22:16,6000,2000,S29E25,8088,C1.4,09/23,22:02,133.0,155,712,PHTX,False


In [106]:
# Record which widths are written with a leading '>' and strip that marker
# from the stored value.
width_text = nasaDF2['cme_width']
# na=False: a null cell counts as "no marker" instead of raising.
has_marker = width_text.str.startswith('>', na=False)
stripped_width = width_text.where(~has_marker, width_text.str[1:])
if 'cme_width_lower_bound' in nasaDF2.columns:
    # On a re-run the marker has already been stripped; keep the flags
    # recorded by the earlier run instead of resetting them to False.
    has_marker = has_marker | nasaDF2['cme_width_lower_bound'].astype(bool)
nasaDF2['cme_width_lower_bound'] = has_marker
nasaDF2['cme_width'] = stripped_width
nasaDF2.head()

Unnamed: 0,start_date,start_time,end_date,end_time,start_freq,end_freq,loc,region,importance,cme_date,cme_time,cpa,cme_width,cme_speed,phtx,cme_width_lower_bound
0,1997/04/01,14:00,04/01,14:15,8000,4000,S25E16,8026,M1.3,04/01,15:18,74,79,312,PHTX,False
1,1997/04/07,14:30,04/07,17:30,11000,1000,S28E19,8027,C6.8,04/07,14:27,Halo,360,878,PHTX,False
2,1997/05/12,05:15,05/14,16:00,12000,80,N21W08,8038,C1.3,05/12,05:30,Halo,360,464,PHTX,False
3,1997/05/21,20:20,05/21,22:00,5000,500,N05W12,8040,M1.3,05/21,21:00,263,165,296,PHTX,False
4,1997/09/23,21:53,09/23,22:16,6000,2000,S29E25,8088,C1.4,09/23,22:02,133,155,712,PHTX,False


In [117]:
# String spellings (lower-cased) that stand for a missing value.
_MISSING_MARKERS = ('', 'nan', 'none', 'nat')


def _is_missing(value):
    """Return True for null values and for their text spellings such as 'NaN'."""
    return bool(pd.isnull(value)) or str(value).strip().lower() in _MISSING_MARKERS


def _to_timestamp(day_text, time_text):
    """Combine a 'YYYY/MM/DD' date and an 'HH:MM' time into a Timestamp.

    '24:00' is mapped to midnight at the start of the following day.
    """
    if time_text == '24:00':
        return pd.to_datetime(day_text) + pd.Timedelta(days=1)
    return pd.to_datetime(day_text + " " + time_text)


def _relative_to_start(row, date_col, time_col):
    """Build a Timestamp from an 'MM/DD' date whose year comes from 'start_date'.

    Returns pd.NaT when any of the needed fields is missing. If the month is
    more than six months away from the start month, the date is taken to lie
    across a New Year boundary and the year is shifted by one accordingly.
    """
    start_day, month_day, clock = row['start_date'], row[date_col], row[time_col]
    if _is_missing(start_day) or _is_missing(month_day) or _is_missing(clock):
        return pd.NaT
    start_day, month_day, clock = str(start_day), str(month_day), str(clock)

    year = int(start_day[:4])
    month_gap = int(month_day.split('/')[0]) - int(start_day.split('/')[1])
    if month_gap < -6:
        # e.g. start in December, event in January -> following year
        year += 1
    elif month_gap > 6:
        # e.g. start in January, event in December -> previous year
        year -= 1
    return _to_timestamp('%d/%s' % (year, month_day), clock)


def get_startdatetime(row):
    """Return the start Timestamp built from 'start_date' and 'start_time'."""
    return _to_timestamp(str(row['start_date']), str(row['start_time']))


def get_enddatetime(row):
    """Return the end Timestamp built from 'end_date' and 'end_time' (NaT if missing)."""
    return _relative_to_start(row, 'end_date', 'end_time')


def get_cmedatetime(row):
    """Return the CME Timestamp built from 'cme_date' and 'cme_time' (NaT if missing)."""
    return _relative_to_start(row, 'cme_date', 'cme_time')

# Build the three combined datetime columns, one row-wise pass per column.
datetime_builders = [
    ('full_start_time', get_startdatetime),
    ('full_end_time', get_enddatetime),
    ('full_cme_time', get_cmedatetime),
]
for column_name, builder in datetime_builders:
    nasaDF2[column_name] = nasaDF2.apply(builder, axis=1)

# The separate date/time pieces are redundant once combined.
split_columns = ['start_time', 'start_date', 'end_time', 'end_date', 'cme_time', 'cme_date']
nasaDF2.drop(split_columns, axis=1, inplace=True)

In [118]:
nasaDF2.head()

Unnamed: 0,start_freq,end_freq,loc,region,importance,cpa,cme_width,cme_speed,phtx,cme_width_lower_bound,full_start_time,full_end_time,full_cme_time
0,8000,4000,S25E16,8026,M1.3,74,79,312,PHTX,False,1997-04-01 14:00:00,1997-04-01 14:15:00,1997-04-01 14:15:00
1,11000,1000,S28E19,8027,C6.8,Halo,360,878,PHTX,False,1997-04-07 14:30:00,1997-04-07 17:30:00,1997-04-07 17:30:00
2,12000,80,N21W08,8038,C1.3,Halo,360,464,PHTX,False,1997-05-12 05:15:00,1997-05-14 16:00:00,1997-05-14 16:00:00
3,5000,500,N05W12,8040,M1.3,263,165,296,PHTX,False,1997-05-21 20:20:00,1997-05-21 22:00:00,1997-05-21 22:00:00
4,6000,2000,S29E25,8088,C1.4,133,155,712,PHTX,False,1997-09-23 21:53:00,1997-09-23 22:16:00,1997-09-23 22:16:00


### Analysis Question 1

In [123]:
def is_X_class(s):
    """Return True when ``s`` is non-null and begins with the letter 'X'."""
    if pd.isnull(s):
        return False
    return re.match("X", s) is not None


def class_to_float(s):
    """Return the numeric part of a classification string.

    The first character (the class letter) is skipped and the remainder is
    converted with ``float``.
    """
    magnitude_text = s[1:]
    return float(magnitude_text)

# Boolean mask marking the rows whose importance is an X class; only those are
# ranked, since all top 50 are X classes.
is_x = nasaDF2['importance'].apply(is_X_class)
is_x_df = nasaDF2[is_x]

# Turn each X classification into its float magnitude, order the index labels
# from largest to smallest, and pull the first 50 of them out of the full frame.
x_magnitudes = is_x_df['importance'].apply(class_to_float)
ranked_labels = x_magnitudes.sort_values(ascending=False).index
top_50_df = nasaDF2.loc[ranked_labels[:50]]
top_50_df

Unnamed: 0,start_freq,end_freq,loc,region,importance,cpa,cme_width,cme_speed,phtx,cme_width_lower_bound,full_start_time,full_end_time,full_cme_time
242,10000,200,S19W83,10486,X28.,Halo,360.0,2657.0,PHTX,False,2003-11-04 20:00:00,2003-11-05 00:00:00,2003-11-05 00:00:00
119,14000,250,N19W72,9393,X20.,261,244.0,2505.0,PHTX,False,2001-04-02 22:05:00,2001-04-03 02:30:00,2001-04-03 02:30:00
234,14000,40,S16E08,10486,X17.,Halo,360.0,2459.0,PHTX,False,2003-10-28 11:10:00,2003-10-30 00:00:00,2003-10-30 00:00:00
128,14000,40,S20W85,9415,X14.,245,167.0,1199.0,PHTX,False,2001-04-15 14:05:00,2001-04-16 13:00:00,2001-04-16 13:00:00
235,11000,500,S15W02,10486,X10.,Halo,360.0,2029.0,PHTX,False,2003-10-29 20:55:00,2003-10-30 00:00:00,2003-10-30 00:00:00
8,14000,100,S18W63,8100,X9.4,Halo,360.0,1556.0,PHTX,False,1997-11-06 12:20:00,1997-11-07 08:30:00,1997-11-07 08:30:00
330,14000,250,S07E68,10930,X9.0,,,,PHTX,False,2006-12-05 10:50:00,2006-12-05 20:00:00,2006-12-05 20:00:00
238,12000,250,S14W56,10486,X8.3,Halo,360.0,2598.0,PHTX,False,2003-11-02 17:30:00,2003-11-03 01:00:00,2003-11-03 01:00:00
290,14000,25,N14W61,10720,X7.1,Halo,360.0,882.0,PHTX,False,2005-01-20 07:15:00,2005-01-20 16:30:00,2005-01-20 16:30:00
360,16000,4000,N17W69,11263,X6.9,Halo,360.0,1610.0,PHTX,False,2011-08-09 08:20:00,2011-08-09 08:35:00,2011-08-09 08:35:00


In [121]:
# Display the boolean mask built from the 'importance' column (True = X class).
is_x

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7       True
8       True
9       True
10     False
11     False
12     False
13     False
14      True
15     False
16      True
17     False
18      True
19      True
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
452    False
453    False
454    False
455    False
456    False
457    False
458     True
459    False
460    False
461    False
462    False
463    False
464    False
465    False
466    False
467    False
468     True
469    False
470    False
471    False
472    False
473    False
474    False
475    False
476    False
477    False
478    False
479    False
480    False
481    False
Name: importance, dtype: bool