In [1]:
# Imports used by the FITS download and JPG conversion cells below.
from astropy.io import fits
from astropy.utils.data import get_pkg_data_filename
from datetime import datetime, timedelta
from PIL import Image
import matplotlib.pyplot as plt 
import pandas as pd 
from bs4 import BeautifulSoup
import numpy as np 
import logging
import requests
import os
import cv2 


In [2]:
#Download one image from URL, you can plot it before save it
def save_image_as_jpg (image_url):
    
    try:
        original_image_data = fits.open(image_url)[1].data
        im_resized = cv2.resize(original_image_data, (512, 512), interpolation=cv2.INTER_LINEAR)  
        fits_file_name = image_url.rsplit('/', 1)[-1]
        jpg_file_name = fits_file_name[0:12] + '.jpg'
        plt.imsave(jpg_file_name, im_resized, cmap='gray')
    except:
        print('Error During Saving Image File: ', image_url)
        #save to a log file??? 
        pass

In [3]:
#Download only one image per date time. 
def download_image (date_time):
    
    year_string = str(date_time.year)
    month_string = str(date_time.month).zfill(2)
    day_string = str(date_time.day).zfill(2)
    hour_string = str(date_time.hour).zfill(2)
    
    #minute is 00  
    combined_string = year_string + month_string + day_string + hour_string + '00'
    
    day_url = 'https://gong2.nso.edu/HA/haf/' + year_string + month_string + '/' + year_string + month_string + day_string +'/'   
    #check all URLs under base_url directory
    response = requests.get(day_url)
    soup = BeautifulSoup(response.content, "html.parser")

    # find all the links on the webpage
    links = soup.find_all("a")
    
    #If there are links in the page, there are a lot of days the page is blank
    for link in links:
        
        #link text on the page
        hrefText = link.text
         
        #There are too many files per minute, save the first one on the page
        #Should we care about the location? 
        if (hrefText.endswith('.fits.fz') and combined_string in hrefText):
            image_url = os.path.join(day_url, hrefText) 
            
            #download the image from the image_url
            save_image_as_jpg (image_url) 
            
            #print(image_url)
            break; 


In [4]:
#Test download_image function
test_dt = datetime(2018, 1, 1, 0, 0, 0)
download_image (test_dt)

In [5]:
#Download one year at a time, download image took at 0, 6, 12 and 18 hours 
def download_fits_files (year, frenqucy_hours = 6):
    
    start_dt = datetime(year, 1, 1, 0, 0, 0)
    date_time = datetime(year, 1, 1, 0, 0, 0)
    
    while (date_time.year == year): 
        
        download_image(date_time)  
        date_time = date_time + timedelta(hours=frenqucy_hours)
        
        #Unblock this line to test downloading one's images
        if date_time.year > year or date_time.day == 2:
            
        #Unblock this line for production
        #if date_time.year > year:
            break;  
     

In [6]:
#Download one year at a time
#Try download one year first, if it's working, then use for loop
#downloadFiles(2012) 

start_year = 2010
total_years = 10 
for y in (start_year + n for n in range(total_years)):
    #print(y)
    download_fits_files(y) 