# Question 4 :
Write a program to download the data from the link given below and then read the data and convert the into
the proper structure and return it as a CSV file.

Link - https://data.nasa.gov/resource/y77d-th95.json

**Excepted Output Data Attributes:**
* name - Name of Earth Meteorite - string 
* id - ID of Earth Meteorite - int 
* nametype - string 
* recclass - string
* mass - Mass of Earth Meteorite - float 
* year - Year at which Earth Meteorite was hit - datetime format 
* reclat - float 
* recclong - float
* point coordinates - list of int

In [31]:
import json
import numpy as np
import pandas as pd
from urllib.request import urlopen
from pprint import pprint

def json_bytes_to_csv(url:str, output_file:str)->None:
    '''
    Takes a url, scrapes data from the url, cleans and parses the json and converts that json into a tabular format on a csv file
    params:
        url: (str) url of the remote resource
        output_file: (str) output file name with extension
    '''
    raw_data = urlopen(url).read()                     # using the urlopen to get the raw json data through secure SSL gateway

    string_data = raw_data.decode('utf-8')             # we received data in a bytestream. Encoding it using utf-8

    parsed_json = json.loads(string_data)              # parsing the data to json

    df = pd.DataFrame(parsed_json)                     # converting parsed json to dataframe
    
    # Data Cleaning
    df = df.iloc[:, :-2]                               # Dropping the last two unrequired columns
    df['point_coordinates'] = df.geolocation.apply(lambda x: x if x is np.nan else x['coordinates']) # extracting point coordinates from the column geolocation
    df.drop(['geolocation', 'fall'], axis = 1, inplace = True) # Dropping unrequired columns according to mapping

    pprint (df.head())                                 # printing sample data on the console for validation
    
    df.to_csv(output_file, index = False)              # converting json to csv

if __name__ == "__main__":
    url = 'https://data.nasa.gov/resource/y77d-th95.json'
    output_file = 'nasa.csv'
    
    json_bytes_to_csv(url, output_file)

       name   id nametype     recclass    mass                     year  \
0    Aachen    1    Valid           L5      21  1880-01-01T00:00:00.000   
1    Aarhus    2    Valid           H6     720  1951-01-01T00:00:00.000   
2      Abee    6    Valid          EH4  107000  1952-01-01T00:00:00.000   
3  Acapulco   10    Valid  Acapulcoite    1914  1976-01-01T00:00:00.000   
4   Achiras  370    Valid           L6     780  1902-01-01T00:00:00.000   

       reclat      reclong     point_coordinates  
0   50.775000     6.083330     [6.08333, 50.775]  
1   56.183330    10.233330  [10.23333, 56.18333]  
2   54.216670  -113.000000      [-113, 54.21667]  
3   16.883330   -99.900000     [-99.9, 16.88333]  
4  -33.166670   -64.950000   [-64.95, -33.16667]  


### Reading generated csv

In [32]:
pd.read_csv(output_file)

Unnamed: 0,name,id,nametype,recclass,mass,year,reclat,reclong,point_coordinates
0,Aachen,1,Valid,L5,21.0,1880-01-01T00:00:00.000,50.77500,6.08333,"[6.08333, 50.775]"
1,Aarhus,2,Valid,H6,720.0,1951-01-01T00:00:00.000,56.18333,10.23333,"[10.23333, 56.18333]"
2,Abee,6,Valid,EH4,107000.0,1952-01-01T00:00:00.000,54.21667,-113.00000,"[-113, 54.21667]"
3,Acapulco,10,Valid,Acapulcoite,1914.0,1976-01-01T00:00:00.000,16.88333,-99.90000,"[-99.9, 16.88333]"
4,Achiras,370,Valid,L6,780.0,1902-01-01T00:00:00.000,-33.16667,-64.95000,"[-64.95, -33.16667]"
...,...,...,...,...,...,...,...,...,...
995,Tirupati,24009,Valid,H6,230.0,1934-01-01T00:00:00.000,13.63333,79.41667,"[79.41667, 13.63333]"
996,Tissint,54823,Valid,Martian (shergottite),7000.0,2011-01-01T00:00:00.000,29.48195,-7.61123,"[-7.61123, 29.48195]"
997,Tjabe,24011,Valid,H6,20000.0,1869-01-01T00:00:00.000,-7.08333,111.53333,"[111.53333, -7.08333]"
998,Tjerebon,24012,Valid,L5,16500.0,1922-01-01T00:00:00.000,-6.66667,106.58333,"[106.58333, -6.66667]"
