### Import Required Libraries and Set Up Environment Variables

In [306]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json
from datetime import datetime

## Load the NASA_API_KEY from the env file
load_dotenv()
#Converts the input JSON to a DataFrame

def convertToDF(dfJSON):
    return(pd.json_normalize(dfJSON))

### CME Data

In [307]:
NASA_API_KEY = os.getenv('NASA_API_KEY')


# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI"

specifier = "CME"

# Set the specifier for CMEs:
base_url = base_url
# Search for CMEs published between a begin and end date
startDate = "2024-01-01"
endDate   = "2024-05-01"

# Build URL for CME
cme_url = (f"{base_url}/{specifier}/")

In [308]:
# Make a "GET" request for the CME URL and store it in a variable named cme_response
cme_response = requests.get(cme_url, params={"api_key": NASA_API_KEY,"startDate": startDate,"endDate": endDate})

In [309]:
# Convert the response variable to json and store it as a variable named cme_json
cme_json= cme_response.json()


In [310]:
# First element - Preview the first result in JSON format
#first_elem = json.dumps(cme_json[0], indent=4)
#print(first_elem)
# All elements - Use json.dumps with argument indent=4 to format data
print(json.dumps(cme_json, indent=5))


[
     {
          "activityID": "2024-01-01T17:00:00-CME-001",
          "catalog": "M2M_CATALOG",
          "startTime": "2024-01-01T17:00Z",
          "instruments": [
               {
                    "displayName": "SOHO: LASCO/C2"
               },
               {
                    "displayName": "SOHO: LASCO/C3"
               }
          ],
          "sourceLocation": "",
          "activeRegionNum": null,
          "note": "Faint CME with the source is likely the minor movement of field lines behind the limb in SE in AIA 171 starting around 2024-01-01T16:30Z. Fully covered by data gap in STEREO A.",
          "submissionTime": "2024-01-02T13:40Z",
          "versionId": 1,
          "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/CME/28435/-1",
          "cmeAnalyses": [
               {
                    "isMostAccurate": true,
                    "time21_5": "2024-01-02T01:01Z",
                    "latitude": -64.0,
                    "longitude": null,
   

In [311]:
# Convert cme_json to a Pandas DataFrame 
# Keep only the columns: activityID, startTime, linkedEvents
cme_df = convertToDF(cme_json)
cme_df = cme_df[['activityID', 'startTime','linkedEvents']]
display(cme_df)


Unnamed: 0,activityID,startTime,linkedEvents
0,2024-01-01T17:00:00-CME-001,2024-01-01T17:00Z,
1,2024-01-02T11:09:00-CME-001,2024-01-02T11:09Z,
2,2024-01-02T19:00:00-CME-001,2024-01-02T19:00Z,
3,2024-01-02T19:36:00-CME-001,2024-01-02T19:36Z,[{'activityID': '2024-01-02T18:02:00-FLR-001'}]
4,2024-01-03T03:24:00-CME-001,2024-01-03T03:24Z,
...,...,...,...
429,2024-05-01T06:36:00-CME-001,2024-05-01T06:36Z,
430,2024-05-01T11:36:00-CME-001,2024-05-01T11:36Z,
431,2024-05-01T12:36:00-CME-001,2024-05-01T12:36Z,
432,2024-05-01T17:36:00-CME-001,2024-05-01T17:36Z,


In [312]:
# Notice that the linkedEvents column allows us to identify the corresponding GST
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to GSTs
cme_df.dropna(subset=['linkedEvents'], inplace=True)
cme_df

Unnamed: 0,activityID,startTime,linkedEvents
3,2024-01-02T19:36:00-CME-001,2024-01-02T19:36Z,[{'activityID': '2024-01-02T18:02:00-FLR-001'}]
14,2024-01-06T08:12:00-CME-001,2024-01-06T08:12Z,[{'activityID': '2024-01-06T05:28:00-FLR-001'}...
33,2024-01-10T19:36:00-CME-001,2024-01-10T19:36Z,[{'activityID': '2024-01-10T18:56:00-FLR-001'}]
34,2024-01-10T21:09:00-CME-001,2024-01-10T21:09Z,[{'activityID': '2024-01-10T20:28:00-FLR-001'}]
49,2024-01-14T12:00:00-CME-001,2024-01-14T12:00Z,[{'activityID': '2024-01-14T11:30:00-FLR-001'}]
...,...,...,...
406,2024-04-24T01:48:00-CME-001,2024-04-24T01:48Z,[{'activityID': '2024-04-26T00:17:00-IPS-001'}]
409,2024-04-24T15:05:00-CME-001,2024-04-24T15:05Z,[{'activityID': '2024-04-24T14:02:00-FLR-001'}]
414,2024-04-25T18:24:00-CME-001,2024-04-25T18:24Z,[{'activityID': '2024-04-25T17:03:00-FLR-001'}]
426,2024-04-30T00:36:00-CME-001,2024-04-30T00:36Z,[{'activityID': '2024-04-30T00:46:00-FLR-001'}]


In [313]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Write a nested for loop that iterates first over each row in the cme DataFrame (using the index)
# and then iterates over the values in 'linkedEvents' 
# and adds the elements individually to a list of dictionaries where each row is one element 

# Create a new DataFrame from the expanded rows
# Initialize an empty list to store the expanded rows
expanded_rows = []

# Iterate over each index in the DataFrame
for i in cme_df.index:
    activityID       = cme_df.loc[i, 'activityID']         # Get the corresponding value from 'activityID'
    startTime        = cme_df.loc[i, 'startTime']     # Get the corresponding value from 'startTime'    
    linkedEvents_col = cme_df.loc[i, 'linkedEvents']  # Get the list of dictionaries in 'linkedEvents'
    
    # Iterate over each dictionary in the list
    for item in linkedEvents_col:
        # Create a new row with the dictionary and corresponding 'activityID' and 'startTime' value
        expanded_rows.append({'activityID': activityID, 'startTime': startTime, 'linkedEvents': item})

# Create a new DataFrame from the expanded rows
#df_for_loop = pd.DataFrame(expanded_rows)

# Use the head function to show the dataframe
#df_for_loop.head()

In [347]:
# Create a new DataFrame from the expanded rows
df_for_loop = pd.DataFrame(expanded_rows)

# Use the head function to show the dataframe
df_for_loop.head()

Unnamed: 0,activityID,startTime,linkedEvents
0,2024-01-02T19:36:00-CME-001,2024-01-02T19:36Z,{'activityID': '2024-01-02T18:02:00-FLR-001'}
1,2024-01-06T08:12:00-CME-001,2024-01-06T08:12Z,{'activityID': '2024-01-06T05:28:00-FLR-001'}
2,2024-01-06T08:12:00-CME-001,2024-01-06T08:12Z,{'activityID': '2024-01-06T05:41:00-FLR-001'}
3,2024-01-06T08:12:00-CME-001,2024-01-06T08:12Z,{'activityID': '2024-01-06T05:53:00-FLR-001'}
4,2024-01-06T08:12:00-CME-001,2024-01-06T08:12Z,{'activityID': '2024-01-06T06:06:00-FLR-001'}


In [348]:
# Create a function called extract_activityID_from_dict that takes a dict as input such as in linkedEvents
# and verify below that it works as expected using one row from linkedEvents as an example
# Be sure to use a try and except block to handle errors
# Log the error or print it for debugging
def extract_activityID_from_dict(input_dict):
    try:
        activityID = input_dict.get("activityID", None)
        return activityID
    except (ValueError, TypeError) as e:
        # Log the error or print it for debugging
        print(f"Error processing input dictionary: {input_dict}. Error: {e}")
        return None

In [351]:
# Apply this function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'GST_ActivityID' using loc indexer:
#extract_activityID_from_dict(df_for_loop.loc[0,'linkedEvents'])

df_for_loop['GST_ActivityID'] = df_for_loop.apply(lambda row: extract_activityID_from_dict(row), axis=1)
df_for_loop


Unnamed: 0,activityID,startTime,linkedEvents,GST_ActivityID
0,2024-01-02T19:36:00-CME-001,2024-01-02T19:36Z,{'activityID': '2024-01-02T18:02:00-FLR-001'},2024-01-02T19:36:00-CME-001
1,2024-01-06T08:12:00-CME-001,2024-01-06T08:12Z,{'activityID': '2024-01-06T05:28:00-FLR-001'},2024-01-06T08:12:00-CME-001
2,2024-01-06T08:12:00-CME-001,2024-01-06T08:12Z,{'activityID': '2024-01-06T05:41:00-FLR-001'},2024-01-06T08:12:00-CME-001
3,2024-01-06T08:12:00-CME-001,2024-01-06T08:12Z,{'activityID': '2024-01-06T05:53:00-FLR-001'},2024-01-06T08:12:00-CME-001
4,2024-01-06T08:12:00-CME-001,2024-01-06T08:12Z,{'activityID': '2024-01-06T06:06:00-FLR-001'},2024-01-06T08:12:00-CME-001
...,...,...,...,...
162,2024-04-24T01:48:00-CME-001,2024-04-24T01:48Z,{'activityID': '2024-04-26T00:17:00-IPS-001'},2024-04-24T01:48:00-CME-001
163,2024-04-24T15:05:00-CME-001,2024-04-24T15:05Z,{'activityID': '2024-04-24T14:02:00-FLR-001'},2024-04-24T15:05:00-CME-001
164,2024-04-25T18:24:00-CME-001,2024-04-25T18:24Z,{'activityID': '2024-04-25T17:03:00-FLR-001'},2024-04-25T18:24:00-CME-001
165,2024-04-30T00:36:00-CME-001,2024-04-30T00:36Z,{'activityID': '2024-04-30T00:46:00-FLR-001'},2024-04-30T00:36:00-CME-001


In [352]:
# Remove rows with missing GST_ActivityID, since we can't assign them to GSTs:
df_for_loop.dropna(subset=['GST_ActivityID'], inplace=True)
df_for_loop

Unnamed: 0,activityID,startTime,linkedEvents,GST_ActivityID
0,2024-01-02T19:36:00-CME-001,2024-01-02T19:36Z,{'activityID': '2024-01-02T18:02:00-FLR-001'},2024-01-02T19:36:00-CME-001
1,2024-01-06T08:12:00-CME-001,2024-01-06T08:12Z,{'activityID': '2024-01-06T05:28:00-FLR-001'},2024-01-06T08:12:00-CME-001
2,2024-01-06T08:12:00-CME-001,2024-01-06T08:12Z,{'activityID': '2024-01-06T05:41:00-FLR-001'},2024-01-06T08:12:00-CME-001
3,2024-01-06T08:12:00-CME-001,2024-01-06T08:12Z,{'activityID': '2024-01-06T05:53:00-FLR-001'},2024-01-06T08:12:00-CME-001
4,2024-01-06T08:12:00-CME-001,2024-01-06T08:12Z,{'activityID': '2024-01-06T06:06:00-FLR-001'},2024-01-06T08:12:00-CME-001
...,...,...,...,...
162,2024-04-24T01:48:00-CME-001,2024-04-24T01:48Z,{'activityID': '2024-04-26T00:17:00-IPS-001'},2024-04-24T01:48:00-CME-001
163,2024-04-24T15:05:00-CME-001,2024-04-24T15:05Z,{'activityID': '2024-04-24T14:02:00-FLR-001'},2024-04-24T15:05:00-CME-001
164,2024-04-25T18:24:00-CME-001,2024-04-25T18:24Z,{'activityID': '2024-04-25T17:03:00-FLR-001'},2024-04-25T18:24:00-CME-001
165,2024-04-30T00:36:00-CME-001,2024-04-30T00:36Z,{'activityID': '2024-04-30T00:46:00-FLR-001'},2024-04-30T00:36:00-CME-001


In [318]:
# print out the datatype of each column in this DataFrame:
print(df_for_loop.dtypes)

activityID        object
startTime         object
linkedEvents      object
GST_ActivityID    object
dtype: object


In [319]:
# Convert the 'GST_ActivityID' column to string format 
df_for_loop['GST_ActivityID'] = df_for_loop['GST_ActivityID'].astype(str) 
# Convert startTime to datetime format  
df_for_loop['startTime'] = pd.to_datetime(df_for_loop['startTime'], format="%Y-%m-%dT%H:%MZ", errors='coerce') 
# Rename startTime to startTime_CME and activityID to cmeID
df_for_loop.rename(columns={"activityID": "cmeID", "startTime": "startTime_CME"}, inplace=True)    
# Drop linkedEvents
df_for_loop.drop(columns=["linkedEvents"])

Unnamed: 0,cmeID,startTime_CME,GST_ActivityID
0,2024-01-02T19:36:00-CME-001,2024-01-02 19:36:00,2024-01-02T19:36:00-CME-001
1,2024-01-06T08:12:00-CME-001,2024-01-06 08:12:00,2024-01-06T08:12:00-CME-001
2,2024-01-06T08:12:00-CME-001,2024-01-06 08:12:00,2024-01-06T08:12:00-CME-001
3,2024-01-06T08:12:00-CME-001,2024-01-06 08:12:00,2024-01-06T08:12:00-CME-001
4,2024-01-06T08:12:00-CME-001,2024-01-06 08:12:00,2024-01-06T08:12:00-CME-001
...,...,...,...
162,2024-04-24T01:48:00-CME-001,2024-04-24 01:48:00,2024-04-24T01:48:00-CME-001
163,2024-04-24T15:05:00-CME-001,2024-04-24 15:05:00,2024-04-24T15:05:00-CME-001
164,2024-04-25T18:24:00-CME-001,2024-04-25 18:24:00,2024-04-25T18:24:00-CME-001
165,2024-04-30T00:36:00-CME-001,2024-04-30 00:36:00,2024-04-30T00:36:00-CME-001


In [320]:
display(df_for_loop)

Unnamed: 0,cmeID,startTime_CME,linkedEvents,GST_ActivityID
0,2024-01-02T19:36:00-CME-001,2024-01-02 19:36:00,{'activityID': '2024-01-02T18:02:00-FLR-001'},2024-01-02T19:36:00-CME-001
1,2024-01-06T08:12:00-CME-001,2024-01-06 08:12:00,{'activityID': '2024-01-06T05:28:00-FLR-001'},2024-01-06T08:12:00-CME-001
2,2024-01-06T08:12:00-CME-001,2024-01-06 08:12:00,{'activityID': '2024-01-06T05:41:00-FLR-001'},2024-01-06T08:12:00-CME-001
3,2024-01-06T08:12:00-CME-001,2024-01-06 08:12:00,{'activityID': '2024-01-06T05:53:00-FLR-001'},2024-01-06T08:12:00-CME-001
4,2024-01-06T08:12:00-CME-001,2024-01-06 08:12:00,{'activityID': '2024-01-06T06:06:00-FLR-001'},2024-01-06T08:12:00-CME-001
...,...,...,...,...
162,2024-04-24T01:48:00-CME-001,2024-04-24 01:48:00,{'activityID': '2024-04-26T00:17:00-IPS-001'},2024-04-24T01:48:00-CME-001
163,2024-04-24T15:05:00-CME-001,2024-04-24 15:05:00,{'activityID': '2024-04-24T14:02:00-FLR-001'},2024-04-24T15:05:00-CME-001
164,2024-04-25T18:24:00-CME-001,2024-04-25 18:24:00,{'activityID': '2024-04-25T17:03:00-FLR-001'},2024-04-25T18:24:00-CME-001
165,2024-04-30T00:36:00-CME-001,2024-04-30 00:36:00,{'activityID': '2024-04-30T00:46:00-FLR-001'},2024-04-30T00:36:00-CME-001


In [336]:
# print out the datatype of each column in this DataFrame:
print(df_for_loop.dtypes)

cmeID                     object
startTime_CME     datetime64[ns]
linkedEvents              object
GST_ActivityID            object
dtype: object


In [321]:
# We are only interested in CMEs related to GSTs so keep only rows where the GST_ActivityID column contains 'GST'
# use the method 'contains()' from the str library.  


### GST Data

In [322]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for Geomagnetic Storms (GST):
GST = "GST"

# Search for GSTs between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Build URL for GST


In [323]:
# Make a "GET" request for the GST URL and store it in a variable named gst_response


In [324]:
# Convert the response variable to json and store it as a variable named gst_json

# Preview the first result in JSON format
# Use json.dumps with argument indent=4 to format data


In [325]:
# Convert gst_json to a Pandas DataFrame  

# Keep only the columns: activityID, startTime, linkedEvents


In [326]:
# Notice that the linkedEvents column allows us to identify the corresponding CME
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to CME


In [327]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Use the explode method to ensure that each row is one element. Ensure to reset the index and drop missing values.


In [328]:
# Apply the extract_activityID_from_dict function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'CME_ActivityID' using loc indexer:

# Remove rows with missing CME_ActivityID, since we can't assign them to CMEs:


In [329]:
# Convert the 'CME_ActivityID' column to string format 

# Convert the 'gstID' column to string format 

# Convert startTime to datetime format  

# Rename startTime to startTime_GST 

# Drop linkedEvents

# Verify that all steps were executed correctly


In [330]:
# We are only interested in GSTs related to CMEs so keep only rows where the CME_ActivityID column contains 'CME'
# use the method 'contains()' from the str library.  


### Merge both datatsets

In [331]:
# Now merge both datasets using 'gstID' and 'CME_ActivityID' for gst and 'GST_ActivityID' and 'cmeID' for cme. Use the 'left_on' and 'right_on' specifiers.


In [332]:
# Verify that the new DataFrame has the same number of rows as cme and gst


### Computing the time it takes for a CME to cause a GST

In [333]:
# Compute the time diff between startTime_GST and startTime_CME by creating a new column called `timeDiff`.


In [334]:
# Use describe() to compute the mean and median time 
# that it takes for a CME to cause a GST. 


### Exporting data in csv format

In [335]:
# Export data to CSV without the index
