### Import Required Libraries and Set Up Environment Variables

In [1]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json
import os
from datetime import datetime
## Load the NASA_API_KEY from the env file
load_dotenv()
NASA_API_KEY = os.getenv('NASA_API_KEY')

### CME Data

In [2]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for CMEs:
CME = "CME"

# Search for CMEs published between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-01-01"
specifier = "CME"

# Build URL for CME
# Construct the query URL
query_url_CME = f"{base_url}{specifier}?startDate={startDate}&endDate={endDate}&api_key={NASA_API_KEY}"
print(query_url_CME)

https://api.nasa.gov/DONKI/CME?startDate=2013-05-01&endDate=2024-01-01&api_key=Jb7D1tPRv1WDalCNZoDQ5AR7EOeGGTNpsIOrWC24


In [3]:
# Make a "GET" request for the CME URL and store it in a variable named cme_response
cme_response = requests.get(query_url_CME)
print(cme_response)

<Response [200]>


In [4]:
# Convert the response variable to json and store it as a variable named cme_json

if cme_response.status_code == 200:
    print("Successfully retrieved CME data.")
    # Convert the response to JSON
    try:
        cme_json = cme_response.json()
        print("Successfully converted response to JSON.")
    except json.JSONDecodeError:
        print("Error decoding JSON from the response.")
        cme_json = None
else:
    print(f"Error: {cme_response.status_code}")
    cme_json = None


Successfully retrieved CME data.
Successfully converted response to JSON.


In [5]:
# Preview the first result in JSON format
print(json.dumps(cme_json[0], indent=4))

# Use json.dumps with argument indent=4 to format data


{
    "activityID": "2013-05-01T03:12:00-CME-001",
    "catalog": "M2M_CATALOG",
    "startTime": "2013-05-01T03:12Z",
    "instruments": [
        {
            "displayName": "SOHO: LASCO/C2"
        },
        {
            "displayName": "SOHO: LASCO/C3"
        },
        {
            "displayName": "STEREO A: SECCHI/COR2"
        },
        {
            "displayName": "STEREO B: SECCHI/COR2"
        }
    ],
    "sourceLocation": "",
    "activeRegionNum": null,
    "note": "",
    "submissionTime": "2013-08-07T16:54Z",
    "versionId": 1,
    "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/CME/2349/-1",
    "cmeAnalyses": [
        {
            "isMostAccurate": true,
            "time21_5": "2013-05-01T07:07Z",
            "latitude": 12.0,
            "longitude": -120.0,
            "halfAngle": 36.0,
            "speed": 860.0,
            "type": "C",
            "featureCode": "null",
            "imageType": null,
            "measurementTechnique": "null",
   

In [5]:
# Convert cme_json to a Pandas DataFrame 
cme = pd.DataFrame(cme_json)

# Keep only the columns: activityID, startTime, linkedEvents
cme = cme[["activityID","startTime","linkedEvents"]]
cme.head(5)

Unnamed: 0,activityID,startTime,linkedEvents
0,2013-05-01T03:12:00-CME-001,2013-05-01T03:12Z,[{'activityID': '2013-05-04T04:52:00-IPS-001'}]
1,2013-05-02T05:24:00-CME-001,2013-05-02T05:24Z,
2,2013-05-02T14:36:00-CME-001,2013-05-02T14:36Z,
3,2013-05-03T18:00:00-CME-001,2013-05-03T18:00Z,
4,2013-05-03T22:36:00-CME-001,2013-05-03T22:36Z,[{'activityID': '2013-05-07T04:37:00-IPS-001'}]


In [6]:
# Notice that the linkedEvents column allows us to identify the corresponding GST
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to GSTs
cme.dropna(subset="linkedEvents", inplace = True)
cme.head(10)

Unnamed: 0,activityID,startTime,linkedEvents
0,2013-05-01T03:12:00-CME-001,2013-05-01T03:12Z,[{'activityID': '2013-05-04T04:52:00-IPS-001'}]
4,2013-05-03T22:36:00-CME-001,2013-05-03T22:36Z,[{'activityID': '2013-05-07T04:37:00-IPS-001'}]
7,2013-05-09T19:29:00-CME-001,2013-05-09T19:29Z,[{'activityID': '2013-05-12T23:30:00-IPS-001'}]
10,2013-05-13T02:54:00-CME-001,2013-05-13T02:54Z,[{'activityID': '2013-05-13T01:53:00-FLR-001'}...
13,2013-05-13T16:18:00-CME-001,2013-05-13T16:18Z,[{'activityID': '2013-05-13T15:40:00-FLR-001'}...
14,2013-05-14T01:30:00-CME-001,2013-05-14T01:30Z,[{'activityID': '2013-05-14T01:00:00-FLR-001'}]
15,2013-05-15T02:18:00-CME-001,2013-05-15T02:18Z,[{'activityID': '2013-05-15T01:25:00-FLR-001'}...
17,2013-05-17T09:24:00-CME-001,2013-05-17T09:24Z,[{'activityID': '2013-05-19T22:20:00-IPS-001'}]
18,2013-05-18T03:24:00-CME-001,2013-05-18T03:24Z,[{'activityID': '2013-05-19T18:53:00-IPS-001'}]
20,2013-05-22T09:12:00-CME-001,2013-05-22T09:12Z,[{'activityID': '2013-05-25T20:00:00-IPS-001'}]


In [7]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Write a nested for loop that iterates first over each row in the cme DataFrame (using the index)
# and then iterates over the values in 'linkedEvents' 
# and adds the elements individually to a list of dictionaries where each row is one element 

# Initialize an empty list to store the expanded rows
expanded_rows = []

# Iterate over each index in the DataFrame
for x in cme.index:
    activityID = cme.loc[x,"activityID"]
    startTime = cme.loc[x,"startTime"]
    linkedEvents = cme.loc[x,"linkedEvents"]
    # Iterate over each dictionary in the list
    for item in linkedEvents:
        # Append a new dictionary to the expanded_rows list for each dictionary item and corresponding 'activityID' and 'startTime' value
        expanded_rows.append({"activityID":activityID,"startTime":startTime, "linkedEvents": item})
# Create a new DataFrame from the expanded rows 
cme_clean= pd.DataFrame(expanded_rows)
cme_clean.head(5)


Unnamed: 0,activityID,startTime,linkedEvents
0,2013-05-01T03:12:00-CME-001,2013-05-01T03:12Z,{'activityID': '2013-05-04T04:52:00-IPS-001'}
1,2013-05-03T22:36:00-CME-001,2013-05-03T22:36Z,{'activityID': '2013-05-07T04:37:00-IPS-001'}
2,2013-05-09T19:29:00-CME-001,2013-05-09T19:29Z,{'activityID': '2013-05-12T23:30:00-IPS-001'}
3,2013-05-13T02:54:00-CME-001,2013-05-13T02:54Z,{'activityID': '2013-05-13T01:53:00-FLR-001'}
4,2013-05-13T02:54:00-CME-001,2013-05-13T02:54Z,{'activityID': '2013-05-13T04:12:00-SEP-001'}


In [8]:
# Create a function called extract_activityID_from_dict that takes a dict as input such as in linkedEvents
# and verify below that it works as expected using one row from linkedEvents as an example
# Be sure to use a try and except block to handle errors

        # Log the error or print it for debugging

def extract_activityID_from_dict(input_dict):
    try:
        activityID = input_dict.get('activityID', None)
        return activityID
    except (ValueError, TypeError) as e:
        # Log the error or print it for debugging
        print(f"Error processing input dictionary: {input_dict}. Error: {e}")
        return None

extract_activityID_from_dict(cme_clean.loc[0,'linkedEvents'])



'2013-05-04T04:52:00-IPS-001'

In [9]:
# Apply this function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'GST_ActivityID' using loc indexer:

cme_clean.loc[:, 'GST_ActivityID'] = cme_clean['linkedEvents'].apply(lambda x: extract_activityID_from_dict(x))
cme_clean.head()


Unnamed: 0,activityID,startTime,linkedEvents,GST_ActivityID
0,2013-05-01T03:12:00-CME-001,2013-05-01T03:12Z,{'activityID': '2013-05-04T04:52:00-IPS-001'},2013-05-04T04:52:00-IPS-001
1,2013-05-03T22:36:00-CME-001,2013-05-03T22:36Z,{'activityID': '2013-05-07T04:37:00-IPS-001'},2013-05-07T04:37:00-IPS-001
2,2013-05-09T19:29:00-CME-001,2013-05-09T19:29Z,{'activityID': '2013-05-12T23:30:00-IPS-001'},2013-05-12T23:30:00-IPS-001
3,2013-05-13T02:54:00-CME-001,2013-05-13T02:54Z,{'activityID': '2013-05-13T01:53:00-FLR-001'},2013-05-13T01:53:00-FLR-001
4,2013-05-13T02:54:00-CME-001,2013-05-13T02:54Z,{'activityID': '2013-05-13T04:12:00-SEP-001'},2013-05-13T04:12:00-SEP-001


In [10]:
# Remove rows with missing GST_ActivityID, since we can't assign them to GSTs:
cme_clean.dropna(subset="GST_ActivityID", inplace = True)


In [11]:
# print out the datatype of each column in this DataFrame:
cme_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1547 entries, 0 to 1546
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   activityID      1547 non-null   object
 1   startTime       1547 non-null   object
 2   linkedEvents    1547 non-null   object
 3   GST_ActivityID  1547 non-null   object
dtypes: object(4)
memory usage: 48.5+ KB


In [12]:
# Convert the 'GST_ActivityID' column to string format 
cme_clean['GST_ActivityID'] = pd.Series(cme_clean['GST_ActivityID'], dtype="string")
# Convert startTime to datetime format  
cme_clean['startTime'] = pd.to_datetime(cme_clean['startTime'])
# Rename startTime to startTime_CME and activityID to cmeID
cme_clean.rename(columns={'startTime':'startTime_CME'}, inplace=True)
cme_clean.rename(columns={'activityID':'cmeID'}, inplace=True)
# Drop linkedEvents
cme_clean.drop(columns=['linkedEvents'], inplace=True)
# Verify that all steps were executed correctly


In [13]:
# We are only interested in CMEs related to GSTs so keep only rows where the GST_ActivityID column contains 'GST'
cme_clean = cme_clean[cme_clean['GST_ActivityID'].str.contains('GST')]
cme_clean.head()# use the method 'contains()' from the str library.  


Unnamed: 0,cmeID,startTime_CME,GST_ActivityID
21,2013-06-02T20:24:00-CME-001,2013-06-02 20:24:00+00:00,2013-06-07T03:00:00-GST-001
48,2013-09-29T22:40:00-CME-001,2013-09-29 22:40:00+00:00,2013-10-02T03:00:00-GST-001
90,2013-12-04T23:12:00-CME-001,2013-12-04 23:12:00+00:00,2013-12-08T00:00:00-GST-001
148,2014-02-16T14:15:00-CME-001,2014-02-16 14:15:00+00:00,2014-02-19T03:00:00-GST-001
151,2014-02-18T01:25:00-CME-001,2014-02-18 01:25:00+00:00,2014-02-20T03:00:00-GST-001


### GST Data

In [14]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for Geomagnetic Storms (GST):
GST = "GST?"

# Search for GSTs between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Build URL for GST
#https://api.nasa.gov/DONKI/GST?startDate=2016-01-01&endDate=2016-01-30&api_key=DEMO_KEY
GST_URL= f" {base_url}{GST}startDate={startDate}&endDate={endDate}&api_key={NASA_API_KEY}"

In [16]:
# Make a "GET" request for the GST URL and store it in a variable named gst_response
gst_response=requests.get(GST_URL)
print(gst_response)

<Response [200]>


In [18]:
# Convert the response variable to json and store it as a variable named gst_json

if gst_response.status_code == 200:
    print("Successfully retrieved GST data.")
    # Convert the response to JSON
    try:
        gst_json = gst_response.json()
        print("Successfully converted response to JSON.")
    except json.JSONDecodeError:
        print("Error decoding JSON from the response.")
        gst_json = None
else:
    print(f"Error: {gst_response.status_code}")
    gst_json = None


# Preview the first result in JSON format
print(json.dumps(gst_json[0], indent=4))

# Use json.dumps with argument indent=4 to format data


Successfully retrieved GST data.
Successfully converted response to JSON.
{
    "gstID": "2013-06-01T01:00:00-GST-001",
    "startTime": "2013-06-01T01:00Z",
    "allKpIndex": [
        {
            "observedTime": "2013-06-01T01:00Z",
            "kpIndex": 6.0,
            "source": "NOAA"
        }
    ],
    "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/GST/326/-1",
    "linkedEvents": [
        {
            "activityID": "2013-05-31T15:45:00-HSS-001"
        }
    ],
    "submissionTime": "2013-07-15T19:26Z",
    "versionId": 1
}


In [19]:
# Convert gst_json to a Pandas DataFrame  
gst = pd.DataFrame(gst_json)
gst.head(5)
# Keep only the columns: activityID, startTime, linkedEvents
gst = gst[["gstID","startTime","linkedEvents"]]
gst.head(5)
# Keep only the columns: activityID, startTime, linkedEvents


Unnamed: 0,gstID,startTime,linkedEvents
0,2013-06-01T01:00:00-GST-001,2013-06-01T01:00Z,[{'activityID': '2013-05-31T15:45:00-HSS-001'}]
1,2013-06-07T03:00:00-GST-001,2013-06-07T03:00Z,[{'activityID': '2013-06-02T20:24:00-CME-001'}]
2,2013-06-29T03:00:00-GST-001,2013-06-29T03:00Z,
3,2013-10-02T03:00:00-GST-001,2013-10-02T03:00Z,[{'activityID': '2013-09-29T22:40:00-CME-001'}...
4,2013-12-08T00:00:00-GST-001,2013-12-08T00:00Z,[{'activityID': '2013-12-04T23:12:00-CME-001'}...


In [20]:
# Notice that the linkedEvents column allows us to identify the corresponding CME
gst.dropna(subset="linkedEvents", inplace = True)

# Remove rows with missing 'linkedEvents' since we won't be able to assign these to CME
gst.head(4)

Unnamed: 0,gstID,startTime,linkedEvents
0,2013-06-01T01:00:00-GST-001,2013-06-01T01:00Z,[{'activityID': '2013-05-31T15:45:00-HSS-001'}]
1,2013-06-07T03:00:00-GST-001,2013-06-07T03:00Z,[{'activityID': '2013-06-02T20:24:00-CME-001'}]
3,2013-10-02T03:00:00-GST-001,2013-10-02T03:00Z,[{'activityID': '2013-09-29T22:40:00-CME-001'}...
4,2013-12-08T00:00:00-GST-001,2013-12-08T00:00Z,[{'activityID': '2013-12-04T23:12:00-CME-001'}...


In [21]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Initialize an empty list to store the expanded rows

gst_clean= gst.explode("linkedEvents", ignore_index= True ).dropna()
gst_clean.head()

Unnamed: 0,gstID,startTime,linkedEvents
0,2013-06-01T01:00:00-GST-001,2013-06-01T01:00Z,{'activityID': '2013-05-31T15:45:00-HSS-001'}
1,2013-06-07T03:00:00-GST-001,2013-06-07T03:00Z,{'activityID': '2013-06-02T20:24:00-CME-001'}
2,2013-10-02T03:00:00-GST-001,2013-10-02T03:00Z,{'activityID': '2013-09-29T22:40:00-CME-001'}
3,2013-10-02T03:00:00-GST-001,2013-10-02T03:00Z,{'activityID': '2013-10-02T01:54:00-IPS-001'}
4,2013-10-02T03:00:00-GST-001,2013-10-02T03:00Z,{'activityID': '2013-10-02T02:47:00-MPC-001'}


In [22]:
gst_clean.head(5)
gst_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   gstID         205 non-null    object
 1   startTime     205 non-null    object
 2   linkedEvents  205 non-null    object
dtypes: object(3)
memory usage: 4.9+ KB


In [23]:
# Apply the extract_activityID_from_dict function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'CME_ActivityID' using loc indexer:
# Verify 'linkedEvents' column exists in the DataFrame

gst_clean.loc[:, 'CME_ActivityID'] = gst_clean['linkedEvents'].apply(lambda x: extract_activityID_from_dict(x))
gst_clean.dropna(subset='CME_ActivityID', inplace = True)
gst_clean.head()

Unnamed: 0,gstID,startTime,linkedEvents,CME_ActivityID
0,2013-06-01T01:00:00-GST-001,2013-06-01T01:00Z,{'activityID': '2013-05-31T15:45:00-HSS-001'},2013-05-31T15:45:00-HSS-001
1,2013-06-07T03:00:00-GST-001,2013-06-07T03:00Z,{'activityID': '2013-06-02T20:24:00-CME-001'},2013-06-02T20:24:00-CME-001
2,2013-10-02T03:00:00-GST-001,2013-10-02T03:00Z,{'activityID': '2013-09-29T22:40:00-CME-001'},2013-09-29T22:40:00-CME-001
3,2013-10-02T03:00:00-GST-001,2013-10-02T03:00Z,{'activityID': '2013-10-02T01:54:00-IPS-001'},2013-10-02T01:54:00-IPS-001
4,2013-10-02T03:00:00-GST-001,2013-10-02T03:00Z,{'activityID': '2013-10-02T02:47:00-MPC-001'},2013-10-02T02:47:00-MPC-001


In [23]:

# Convert the 'gstID' column to string format 
gst_clean['CME_ActivityID'] = pd.Series(gst_clean['CME_ActivityID'], dtype="string")
gst_clean['gstID'] = pd.Series(gst_clean['gstID'], dtype="string")

# Convert startTime to datetime format  
gst_clean['startTime'] = pd.to_datetime(gst_clean['startTime'])

# Rename startTime to startTime_GST 
gst_clean.rename(columns={'startTime':'startTime_GST'}, inplace=True)

# Drop linkedEvents
gst_clean.drop(columns=['linkedEvents'], inplace=True)

# Verify that all steps were executed correctly
gst_clean.head()




Unnamed: 0,gstID,startTime_GST,CME_ActivityID
0,2013-06-01T01:00:00-GST-001,2013-06-01 01:00:00+00:00,2013-05-31T15:45:00-HSS-001
1,2013-06-07T03:00:00-GST-001,2013-06-07 03:00:00+00:00,2013-06-02T20:24:00-CME-001
2,2013-10-02T03:00:00-GST-001,2013-10-02 03:00:00+00:00,2013-09-29T22:40:00-CME-001
3,2013-10-02T03:00:00-GST-001,2013-10-02 03:00:00+00:00,2013-10-02T01:54:00-IPS-001
4,2013-10-02T03:00:00-GST-001,2013-10-02 03:00:00+00:00,2013-10-02T02:47:00-MPC-001


In [24]:
# We are only interested in GSTs related to CMEs so keep only rows where the CME_ActivityID column contains 'CME'

# Filter gst_clean to keep only rows where 'GST_ActivityID' contains 'CME'
gst_clean = gst_clean[gst_clean['CME_ActivityID'].str.contains('CME')]

# Display the first few rows to verify the results
gst_clean.head()


Unnamed: 0,gstID,startTime,linkedEvents,CME_ActivityID
1,2013-06-07T03:00:00-GST-001,2013-06-07T03:00Z,{'activityID': '2013-06-02T20:24:00-CME-001'},2013-06-02T20:24:00-CME-001
2,2013-10-02T03:00:00-GST-001,2013-10-02T03:00Z,{'activityID': '2013-09-29T22:40:00-CME-001'},2013-09-29T22:40:00-CME-001
5,2013-12-08T00:00:00-GST-001,2013-12-08T00:00Z,{'activityID': '2013-12-04T23:12:00-CME-001'},2013-12-04T23:12:00-CME-001
7,2014-02-19T03:00:00-GST-001,2014-02-19T03:00Z,{'activityID': '2014-02-16T14:15:00-CME-001'},2014-02-16T14:15:00-CME-001
9,2014-02-20T03:00:00-GST-001,2014-02-20T03:00Z,{'activityID': '2014-02-18T01:25:00-CME-001'},2014-02-18T01:25:00-CME-001


### Merge both datatsets

In [25]:
# Check columns in both DataFrames
print("Columns in gst DataFrame:", gst_clean.columns)
print("Columns in cme DataFrame:", cme_clean.columns)
cme_clean.head()

Columns in gst DataFrame: Index(['gstID', 'startTime', 'linkedEvents', 'CME_ActivityID'], dtype='object')
Columns in cme DataFrame: Index(['cmeID', 'startTime_CME', 'GST_ActivityID'], dtype='object')


Unnamed: 0,cmeID,startTime_CME,GST_ActivityID
21,2013-06-02T20:24:00-CME-001,2013-06-02 20:24:00+00:00,2013-06-07T03:00:00-GST-001
48,2013-09-29T22:40:00-CME-001,2013-09-29 22:40:00+00:00,2013-10-02T03:00:00-GST-001
90,2013-12-04T23:12:00-CME-001,2013-12-04 23:12:00+00:00,2013-12-08T00:00:00-GST-001
148,2014-02-16T14:15:00-CME-001,2014-02-16 14:15:00+00:00,2014-02-19T03:00:00-GST-001
151,2014-02-18T01:25:00-CME-001,2014-02-18 01:25:00+00:00,2014-02-20T03:00:00-GST-001


In [28]:
# Now merge both datasets using 'gstID' and 'CME_ActivityID' for gst and 'GST_ActivityID' and 'cmeID' for cme. Use the 'left_on' and 'right_on' specifiers.
combined_data_df =  pd.merge(gst_clean, cme_clean,left_on = ["gstID", "CME_ActivityID"],right_on=["GST_ActivityID","cmeID"] )


combined_data_df.head()

Unnamed: 0,gstID,startTime,linkedEvents,CME_ActivityID,cmeID,startTime_CME,GST_ActivityID
0,2013-06-07T03:00:00-GST-001,2013-06-07T03:00Z,{'activityID': '2013-06-02T20:24:00-CME-001'},2013-06-02T20:24:00-CME-001,2013-06-02T20:24:00-CME-001,2013-06-02 20:24:00+00:00,2013-06-07T03:00:00-GST-001
1,2013-10-02T03:00:00-GST-001,2013-10-02T03:00Z,{'activityID': '2013-09-29T22:40:00-CME-001'},2013-09-29T22:40:00-CME-001,2013-09-29T22:40:00-CME-001,2013-09-29 22:40:00+00:00,2013-10-02T03:00:00-GST-001
2,2013-12-08T00:00:00-GST-001,2013-12-08T00:00Z,{'activityID': '2013-12-04T23:12:00-CME-001'},2013-12-04T23:12:00-CME-001,2013-12-04T23:12:00-CME-001,2013-12-04 23:12:00+00:00,2013-12-08T00:00:00-GST-001
3,2014-02-19T03:00:00-GST-001,2014-02-19T03:00Z,{'activityID': '2014-02-16T14:15:00-CME-001'},2014-02-16T14:15:00-CME-001,2014-02-16T14:15:00-CME-001,2014-02-16 14:15:00+00:00,2014-02-19T03:00:00-GST-001
4,2014-02-20T03:00:00-GST-001,2014-02-20T03:00Z,{'activityID': '2014-02-18T01:25:00-CME-001'},2014-02-18T01:25:00-CME-001,2014-02-18T01:25:00-CME-001,2014-02-18 01:25:00+00:00,2014-02-20T03:00:00-GST-001


### Computing the time it takes for a CME to cause a GST

In [30]:
# Compute the time diff between startTime_GST and startTime_CME by creating a new column called `timeDiff`.
combined_data_df["timeDiff"]= combined_data_df["startTime_GST"]- combined_data_df["startTime_CME"]


TypeError: cannot subtract DatetimeArray from ndarray

In [32]:
# Use describe() to compute the mean and median time 
# that it takes for a CME to cause a GST. 
combined_data_df.info()
combined_data_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57 entries, 0 to 56
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   gstID           57 non-null     object             
 1   startTime       57 non-null     object             
 2   linkedEvents    57 non-null     object             
 3   CME_ActivityID  57 non-null     object             
 4   cmeID           57 non-null     object             
 5   startTime_CME   57 non-null     datetime64[ns, UTC]
 6   GST_ActivityID  57 non-null     string             
dtypes: datetime64[ns, UTC](1), object(5), string(1)
memory usage: 3.2+ KB


Unnamed: 0,gstID,startTime,linkedEvents,CME_ActivityID,cmeID,startTime_CME,GST_ActivityID
count,57,57,57,57,57,57,57
unique,45,45,55,55,55,,45
top,2021-11-03T21:00:00-GST-001,2021-11-03T21:00Z,{'activityID': '2017-09-06T12:24:00-CME-001'},2017-09-06T12:24:00-CME-001,2017-09-06T12:24:00-CME-001,,2021-11-03T21:00:00-GST-001
freq,3,3,2,2,2,,3
mean,,,,,,2018-11-23 21:57:06.315789824+00:00,
min,,,,,,2013-06-02 20:24:00+00:00,
25%,,,,,,2015-09-04 14:12:00+00:00,
50%,,,,,,2017-09-06 12:24:00+00:00,
75%,,,,,,2022-08-13 18:48:00+00:00,
max,,,,,,2023-12-14 17:38:00+00:00,


### Exporting data in csv format

In [33]:
# Export data to CSV without the index
combined_data_df.to_csv("NASA.csv",index = False)