In [0]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import *
from pyspark.sql.functions import trim
import pandas as pd
import numpy as np
import json
import requests
from datetime import datetime, timedelta
from dateutil import parser

In [0]:
# Obtain the Spark session for this notebook: getOrCreate() returns the session
# that is already active if there is one, otherwise it builds a new one with
# the application name given here.
spark = SparkSession.builder.appName('CNEO_data_extractor').getOrCreate()
spark  # bare last expression so the session object is rendered as the cell output

In [0]:
# Sanity check: show which secret scopes this workspace exposes. The storage
# secret read later in the notebook is fetched from the 'AzureSecretVault' scope.
dbutils.secrets.listScopes()

Out[6]: [SecretScope(name='AzureSecretVault')]

In [0]:
# Azure storage connection settings.
#
# SECURITY: a SAS token and a SAS URL (including their signatures) used to be
# pasted in this cell as comments. They have been removed: secrets must never
# live in notebook source, commented out or not. Any token that was committed
# should be treated as compromised and rotated. Credentials are read from the
# Databricks secret scope instead (see `auth_key`).
storage_account = 'myazurefreetier'
container = 'cneosdata'
# azure_file_path = 'https://myazurefreetier.blob.core.windows.net/cneosdata/rawdata.csv'
mount_point = '/mnt/files'

# Service-principal (OAuth client-credentials) identity used for the mount.
application_id = 'cd56d143-36ca-46e9-a6ed-a34adf4b8582'
auth_key = dbutils.secrets.get(scope='AzureSecretVault', key='storageaccountsecret')
tenet_id = '45007933-edf0-4605-8c19-83ff82416cf8'  # Azure AD tenant id (name kept as-is)

# Token endpoint for the tenant above.
endpoint = "https://login.microsoftonline.com/" + tenet_id + "/oauth2/token"

# abfss:// URI of the container root.
source = "abfss://" + container + "@" + storage_account + ".dfs.core.windows.net/"

# Hadoop ABFS settings handed to the mount call as extra_configs.
configs = {"fs.azure.account.auth.type": "OAuth",
          "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
          "fs.azure.account.oauth2.client.id": application_id,
          "fs.azure.account.oauth2.client.secret": auth_key,
          "fs.azure.account.oauth2.client.endpoint": endpoint}



In [0]:
# Mount the container at `mount_point` unless something is already mounted
# there, so re-running this cell does not attempt a second mount.
# (A <directory-name> may optionally be appended to the source URI.)
existing_mount_points = {m.mountPoint for m in dbutils.fs.mounts()}
if mount_point not in existing_mount_points:
    dbutils.fs.mount(source=source, mount_point=mount_point, extra_configs=configs)

In [0]:
# dbutils.fs.mounts()[4].mountPoint

Out[11]: '/mnt/files'

In [0]:
%fs
ls "mnt/files"

path,name,size,modificationTime
dbfs:/mnt/files/output_file.csv/,output_file.csv/,0,1672921712000
dbfs:/mnt/files/outputdata.csv,outputdata.csv,6593966,1672942102000
dbfs:/mnt/files/rawdata.csv,rawdata.csv,6271663,1672921131000
dbfs:/mnt/files/updatedData.csv,updatedData.csv,6593966,1672945699000


In [0]:
# Alternative (unused): SAS-based access instead of the OAuth mount.
# The SAS token that was pasted here has been redacted - never keep secrets in notebook source; load them from a secret scope.
# spark.conf.set("fs.azure.account.auth.type.myazurefreetier.dfs.core.windows.net", "SAS")
# spark.conf.set("fs.azure.sas.token.provider.type.myazurefreetier.dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider")
# spark.conf.set("fs.azure.sas.fixed.token.myazurefreetier.dfs.core.windows.net", "<SAS token redacted>")


In [0]:
# Load the previously accumulated close-approach records from the mounted
# container. The first row is the header and column types are inferred.
legacy_csv_path = '/mnt/files/updatedData.csv'
legacy_data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(legacy_csv_path)
)
display(legacy_data)


Designation,Orbit Id,Time of Close approach,Close-Approach Date,Nominal Approch distance (au),Min Close-Approach Distance (au),Max Close-Approach Distance (au),V Reletive (Km/s),V Infinite (Km/s),Close-Approach Uncertain Time,Absolute Magnitude (mag),Diameter (Km),Diameter-Sigma (Km),Object
170903,127,2415020.525,01-01-1900 00:37,0.092594006,0.092334264,0.092853786,16.73983079,16.73811169,00:16,18.19,-1.0,-1.0,170903 (2004 WS2)
2018 BM3,11,2415021.449,01-01-1900 22:47,0.076198767,0.068197916,0.084224814,16.9286503,16.92658459,05:59,21.92,-1.0,-1.0,(2018 BM3)
2020 BN7,6,2415023.595,04-01-1900 02:16,0.089660747,0.088258237,0.091430678,5.258115848,5.252461097,1_13:09,23.8,-1.0,-1.0,(2020 BN7)
2017 MW4,18,2415023.596,04-01-1900 02:18,0.0613005,0.061290749,0.061310252,17.59164184,17.58917085,00:01,20.05,-1.0,-1.0,(2017 MW4)
509352,57,2415024.434,04-01-1900 22:25,0.009631839,0.009624948,0.009638731,8.686711042,8.654806975,00:02,20.16,-1.0,-1.0,509352 (2007 AG)
214869,173,2415026.944,07-01-1900 10:39,0.050168844,0.050160285,0.050177404,11.59433494,11.58975333,00:02,16.51,-1.0,-1.0,214869 (2007 PA8)
2017 QD3,7,2415029.501,10-01-1900 00:01,0.086559929,0.086547534,0.086572323,10.93172731,10.92891112,00:03,22.23,-1.0,-1.0,(2017 QD3)
2014 SC324,56,2415030.547,11-01-1900 01:07,0.039965387,0.039905774,0.040025086,10.65242846,10.64616798,00:18,24.3,-1.0,-1.0,(2014 SC324)
2012 UK171,50,2415032.463,12-01-1900 23:07,0.049822461,0.049496883,0.050148138,7.157379322,7.149903481,00:13,24.44,-1.0,-1.0,(2012 UK171)
2006 YP,15,2415032.955,13-01-1900 10:55,0.096353303,0.096346235,0.096360371,3.832502428,3.825280175,00:09,23.9,-1.0,-1.0,(2006 YP)


In [0]:
# Fetch the final row of the stored data; tail(1) returns a one-element list
# of Row objects, which is why the row is read as last_record[0] afterwards.
last_record = legacy_data.tail(1)
last_record

Out[44]: [Row(Designation='2018 PN22', Orbit Id='6', Time of Close approach=2459949.488279088, Close-Approach Date='2023-01-04 23:43:00', Nominal Approch distance (au)=0.0766612183849406, Min Close-Approach Distance (au)=0.0764144828866867, Max Close-Approach Distance (au)=0.0769079613009998, V Reletive (Km/s)=3.78252394414714, V Infinite (Km/s)=3.7733240374348, Close-Approach Uncertain Time='02:23', Absolute Magnitude (mag)=27.5, Diameter (Km)=-1.0, Diameter-Sigma (Km)=-1.0, Object='       (2018 PN22)')]

In [0]:
# Work out the first day to request from the API: the calendar day after the
# close-approach date of the last stored record, as a 'YYYY-MM-DD' string.
# NOTE(review): dateutil treats ambiguous numeric dates as month-first by
# default, while older rows in the file are day-first (e.g. '13-01-1900 10:55').
# This relies on the last row being ISO formatted ('2023-01-04 23:43:00') - confirm.
newest_approach = parser.parse(last_record[0]['Close-Approach Date'])
last_record_date = str((newest_approach + timedelta(days=1)).date())
last_record_date

Out[45]: '2023-01-05'

In [0]:
# Query the JPL close-approach API for every approach from the day after the
# newest stored record up to five days from today.
url = "https://ssd-api.jpl.nasa.gov/cad.api"
parameters = {
    "date-min": last_record_date,
    "date-max": str(datetime.today().date() + timedelta(days=5)),
    # This literal used to list "dist-max" twice ("0.05", then "0.1"). Python
    # keeps only the last value for a duplicated key, so "0.1" is what was
    # actually sent; the dead "0.05" entry is dropped to make that explicit.
    "dist-max": "0.1",
    "fullname": "true",
    "diameter": "true",
}
# A timeout keeps a stalled connection from hanging the notebook indefinitely.
response = requests.get(url, params=parameters, timeout=30)
# Fail here with a clear HTTP error instead of a confusing KeyError further
# down when the service answers with a 4xx/5xx status.
response.raise_for_status()
data = response.json()


In [0]:
# Number of records in the API response; in the run recorded below the API
# reported it as a string ('11'), not an integer.
data['count']

Out[47]: '11'

In [0]:
# Column labels applied to the API result, in API field order. The spellings
# ('Approch', 'Reletive') are intentional: they must match the header of the
# stored CSV exactly, so do not "fix" them here.
columns = [
    # identity / timing      (des, orbit_id, jd, cd)
    'Designation', 'Orbit Id', 'Time of Close approach', 'Close-Approach Date',
    # approach distances     (dist, dist_min, dist_max)
    'Nominal Approch distance (au)',
    'Min Close-Approach Distance (au)',
    'Max Close-Approach Distance (au)',
    # velocities             (v_rel, v_inf)
    'V Reletive (Km/s)', 'V Infinite (Km/s)',
    # uncertainty / physical (t_sigma_f, h, diameter, diameter_sigma)
    'Close-Approach Uncertain Time', 'Absolute Magnitude (mag)',
    'Diameter (Km)', 'Diameter-Sigma (Km)',
    # full object name       (fullname)
    'Object',
]


In [0]:
# Build a pandas frame from the API payload: 'data' holds the rows and
# 'fields' the matching column names.
# NOTE(review): if the API omits 'data'/'fields' when it returns no records,
# this raises KeyError before the "no new records" branch is reached - confirm.
api_rows, api_fields = data['data'], data['fields']
df = pd.DataFrame(api_rows, columns=api_fields)
df.head()

Unnamed: 0,des,orbit_id,jd,cd,dist,dist_min,dist_max,v_rel,v_inf,t_sigma_f,h,diameter,diameter_sigma,fullname
0,2021 EB3,4,2459950.143874088,2023-Jan-05 15:27,0.0592297330147136,0.0476813453879898,0.0759672428323347,10.8581056833772,10.8539618584731,1_13:39,25.18,,,(2021 EB3)
1,2019 WC5,25,2459950.15143357,2023-Jan-05 15:38,0.0652379562731394,0.0652377366926796,0.0652381758535991,14.4434895143507,14.4406614948599,< 00:01,22.5,,,(2019 WC5)
2,2018 XE4,7,2459950.402238375,2023-Jan-05 21:39,0.0590740776249576,0.0590698675290183,0.0590782877359105,8.20807462059822,8.20257769879846,00:02,26.4,,,(2018 XE4)
3,2022 YL4,6,2459950.405487136,2023-Jan-05 21:44,0.0132541519879111,0.0132297750294862,0.0132785190303174,1.95498689674418,1.84930100156361,00:03,27.62,,,(2022 YL4)
4,2022 YQ6,3,2459950.517113871,2023-Jan-06 00:25,0.0852941106686142,0.0830857897600956,0.087502144134913,4.99066076120032,4.98439739404112,00:27,24.765,,,(2022 YQ6)


In [0]:
# Convert the API values to proper dtypes. Fields not listed here
# (des, orbit_id, fullname) are left exactly as the API returned them.
numeric_fields = [
    'jd', 'dist', 'dist_min', 'dist_max',
    'v_rel', 'v_inf', 'h', 'diameter', 'diameter_sigma',
]
for field in numeric_fields:
    df[field] = pd.to_numeric(df[field])

# Close-approach date becomes a timestamp; the time uncertainty stays text.
df['cd'] = pd.to_datetime(df['cd'])
df['t_sigma_f'] = df['t_sigma_f'].astype(str)
df.tail()

Unnamed: 0,des,orbit_id,jd,cd,dist,dist_min,dist_max,v_rel,v_inf,t_sigma_f,h,diameter,diameter_sigma,fullname
6,2022 YV5,2,2459951.0,2023-01-06 05:44:00,0.053566,0.05301,0.054122,10.645016,10.640342,< 00:01,25.232,,,(2022 YV5)
7,2016 AE166,13,2459952.0,2023-01-07 09:12:00,0.092849,0.092849,0.09285,19.553815,19.552347,< 00:01,21.3,,,(2016 AE166)
8,2021 TL,16,2459954.0,2023-01-09 05:43:00,0.036412,0.036412,0.036413,8.450631,8.441967,< 00:01,23.39,,,(2021 TL)
9,226554,101,2459954.0,2023-01-09 13:59:00,0.072538,0.072538,0.072538,8.996481,8.992397,< 00:01,19.64,0.482,0.007,226554 (2003 WR21)
10,2011 EP51,32,2459954.0,2023-01-09 23:16:00,0.085092,0.085091,0.085093,10.096146,10.093044,< 00:01,25.2,,,(2011 EP51)


In [0]:
# Relabel the API fields with the stored dataset's column names. The assignment
# is positional (i-th field gets the i-th label), so it relies on the API
# returning its fields in the same order as `columns`.
df.columns = columns
df.head(1)

Unnamed: 0,Designation,Orbit Id,Time of Close approach,Close-Approach Date,Nominal Approch distance (au),Min Close-Approach Distance (au),Max Close-Approach Distance (au),V Reletive (Km/s),V Infinite (Km/s),Close-Approach Uncertain Time,Absolute Magnitude (mag),Diameter (Km),Diameter-Sigma (Km),Object
0,2021 EB3,4,2459950.0,2023-01-05 15:27:00,0.05923,0.047681,0.075967,10.858106,10.853962,1_13:39,25.18,,,(2021 EB3)


In [0]:
# Convert the pandas frame of new records into a Spark DataFrame so it can be
# combined with legacy_data.
nsdf = spark.createDataFrame(df)
nsdf

Out[52]: DataFrame[Designation: string, Orbit Id: string, Time of Close approach: double, Close-Approach Date: timestamp, Nominal Approch distance (au): double, Min Close-Approach Distance (au): double, Max Close-Approach Distance (au): double, V Reletive (Km/s): double, V Infinite (Km/s): double, Close-Approach Uncertain Time: string, Absolute Magnitude (mag): double, Diameter (Km): double, Diameter-Sigma (Km): double, Object: string]

In [0]:
# Use -1 as the placeholder for missing values, matching the stored data where
# unknown diameters appear as -1.0.
nsdf = nsdf.fillna(value=-1)
# nsdf.show(1)

In [0]:
# Append the new records to the stored ones. union() pairs columns by position,
# not by name, so both frames must keep the same column order.
updated_data = legacy_data.union(nsdf)

In [0]:
# Row counts, one per line: merged result, stored data, newly fetched rows.
# The first number should equal the sum of the other two.
for frame in (updated_data, legacy_data, nsdf):
    print(frame.count())


46804
46793
11


In [0]:
# updated_data.show(2)

In [0]:
# Apply the -1 placeholder to the merged frame as well, so any missing values
# still present after the union are written out consistently.
updated_data = updated_data.fillna(value=-1)

In [0]:
# updated_data.show(2)

In [0]:
# Write the merged dataset back to the mounted container, but only when the
# API actually returned new records.
# The count is normalised with int() so the check is correct whether the API
# reports it as a string or as an integer. Comparing against the string "0"
# alone would treat an integer 0 as "new records" and rewrite the file.
new_record_count = int(data['count'])
if new_record_count != 0:
    updated_data.toPandas().to_csv('/dbfs/mnt/files/updatedData.csv', index = False)
else:
    print("There were no new records fetched today")