In [55]:
# import packages 
import cv2
import numpy as np
import os
import datetime as dt
import time
import pandas as pd

# define your own folder paths and names
project_path = os.getcwd()
pv_data_folder = os.path.join(project_path,'pv_data','pv_output_valid.pkl')

image_folder = os.path.join(project_path,'snapshot_highfreq')
image_name_format = '%Y%m%d%H%M%S'

output_folder =  os.path.join(project_path,'data_expanded')
pred_folder = os.path.join(output_folder,'data_nowcast')

# Operating parameter
output_img_shape = [64, 64, 3] # down-size the high-res image to this resolution
start_date = dt.datetime(2017,3,9)
end_date = dt.datetime(2017,12,31) # until datetime.datetime(2018,10,4,23,59)

# Setting up test set
sunny_day = [(2017,3,14),(2017,5,20),(2017,6,4),(2017,7,6),(2017,8,19),(2017,10,7),(2017,11,1),(2017,12,26)]
cloudy_day = [(2017,3,15),(2017,5,24),(2017,7,5),(2017,9,6),(2017,9,22),(2017,11,4),(2017,12,29)]

sunny_datetime = [dt.datetime(day[0],day[1],day[2]) for day in sunny_day]
cloudy_datetime = [dt.datetime(day[0],day[1],day[2]) for day in cloudy_day]
test_dates = sunny_datetime + cloudy_datetime

In [56]:
def find_idx_with_dates(all_times,test_dates):
    idx=[]
    for test_day in test_dates:
        test_day_end = test_day + dt.timedelta(days = 1)
        idx+=np.nonzero((all_times>test_day)*(all_times<test_day_end))[0].tolist()
    return idx

# This two function does the same thing. Just that one is for np, the other for pd.
def find_time_within_nparray(time_array,time_point):
    probable_idx = np.searchsorted(time_array,time_point)
    
    # If the time point is after all the time in pv_data
    if probable_idx == len(time_array):
        return None   
    
    # See if the time point is actually a match 
    if time_array[probable_idx]== time_point: 
        return probable_idx
        
    else:
        return None

def find_time_within_pdseries(time_array,time_point):
    probable_idx = np.searchsorted(time_array,time_point)
    
    # If the time point is after all the time in pv_data
    if probable_idx == len(time_array):
        return None   
    
    # See if the time point is actually a match 
    if time_array[probable_idx]== time_point: 
        return probable_idx
        
    else:
        return None

def listdir_noini(path):
    f_list = []
    for f in os.listdir(path):
        _, file_extension = os.path.splitext(f)
        if not file_extension == '.ini':
            f_list.append(f)    
    
    return sorted(f_list)
    
# return a list list all the .jpg files in 
def listdir_jpg(path):
    f_list = []
    for f in os.listdir(path):
        _, file_extension = os.path.splitext(f)
        if file_extension == '.jpg':
            f_list.append(f)
            
    return sorted(f_list)

Read in PV output 

In [57]:
# Read in the pv output file
pv_output_all = pd.read_pickle(os.path.join(pv_data_folder))

# Obtain relevant data
pv_output_all = pv_output_all.loc[start_date:end_date]

Read in image paths

In [59]:
# Initialization: find all image file path
image_paths = []
all_times = []

years_list = listdir_noini(os.path.join(image_folder))
for year in years_list: # cycle through the years
    months_list = listdir_noini(os.path.join(image_folder,year))
    for month in months_list: # cycle through the months
        dates_list  = listdir_noini(os.path.join(image_folder,year,month))
        for date in dates_list: # cycle through each day
            image_filenames = listdir_jpg(os.path.join(image_folder,year,month,date))
            for filename in image_filenames:
                image_paths.append(os.path.join(image_folder,year,month,date,filename))
                all_times.append(dt.datetime.strptime(filename,image_name_format+'.jpg'))

# Only keep the one in the given time frame
image_paths = np.array(image_paths)
all_times = np.array(all_times)
print(all_times.shape)
in_range = (all_times>=start_date) & (all_times<end_date)
image_paths = image_paths[in_range]
all_times = all_times[in_range]

(131382,)


### Load in each image, and save ones with valid PV_output

In [60]:
# Save images from snapshot folder to an npy file
# Designed for the high frequency (10s) dataset
number_images = len(image_paths)

all_images = np.ndarray([number_images] + output_img_shape,dtype='uint8')
pv_outputs = np.zeros(number_images)
validity_mask = np.zeros(number_images,dtype = bool)

# time counter
tic = time.process_time()

# Step through every snapshot, find if they have valid corresponding PV data

for i,image_path in enumerate(image_paths): # cycle through each file of the day
    # parse timestamp from image name
    curr_time = all_times[i]
    
    # test if there is corresponding data in PV data
    pv_idx = find_time_within_pdseries(pv_output_all.index,curr_time)
    if pv_idx is None:# if prediction ground truth not found
        print(curr_time,'has no PV record')
    else: 
        validity_mask[i] = 1
        frame = cv2.imread(os.path.join(image_path))
        # resizing the image to output_img_shape
        resizing_ratio = output_img_shape[0] / frame.shape[0]
        all_images[i] = cv2.resize(frame, None, fx=resizing_ratio, fy=resizing_ratio)
        pv_outputs[i] = pv_output_all.iloc[pv_idx]
    
    # prompt progress
    if i%100 == 0:
        print('processed {0} images/{1} images'.format(i,image_paths.size))
        if i%1000 == 0 and i>0:
            print('Expected finishing time:', dt.datetime.now()+
                  dt.timedelta(seconds = (time.process_time() - tic)*(image_paths.size/i-1)))
                                      
        
print('time_elapsed',time.process_time()-tic)

2017-03-09 06:00:00 has no PV record
processed 0 images/130543 images
2017-03-09 06:01:00 has no PV record
2017-03-09 06:02:00 has no PV record
2017-03-09 06:03:00 has no PV record
2017-03-09 06:04:00 has no PV record
2017-03-09 06:05:00 has no PV record
2017-03-09 06:06:00 has no PV record
2017-03-09 06:07:00 has no PV record
2017-03-09 06:08:00 has no PV record
2017-03-09 06:09:00 has no PV record
2017-03-09 06:10:00 has no PV record
2017-03-09 06:11:00 has no PV record
2017-03-09 06:12:00 has no PV record
2017-03-09 06:13:00 has no PV record
2017-03-09 06:14:00 has no PV record
2017-03-09 06:15:00 has no PV record
2017-03-09 06:16:00 has no PV record
2017-03-09 06:17:00 has no PV record
2017-03-09 06:18:00 has no PV record
2017-03-09 06:19:00 has no PV record
2017-03-09 06:20:00 has no PV record
2017-03-09 06:21:00 has no PV record
2017-03-09 06:22:00 has no PV record
2017-03-09 06:23:00 has no PV record
2017-03-09 06:24:00 has no PV record
2017-03-09 06:25:00 has no PV record
2017-

In [61]:
# Cropping relevant information
all_times = all_times[validity_mask]
all_images = all_images[validity_mask]
pv_outputs = pv_outputs[validity_mask]
        
#storing information
np.save(os.path.join(output_folder,'all_times_highfreq.npy'),all_times)
np.save(os.path.join(output_folder,'all_images_highfreq.npy'), all_images)
np.save(os.path.join(output_folder,'pv_outputs_highfreq.npy'), pv_outputs)

In [62]:
all_times.shape

(7381,)

In [63]:
all_images.shape

(7381, 64, 64, 3)

### Filter out repeating images

In [64]:
# Load back in the previously processed times, images, and PV_outputs
all_times = np.load(os.path.join(output_folder,'all_times_highfreq.npy'), allow_pickle = True)
all_images = np.load(os.path.join(output_folder,'all_images_highfreq.npy'), allow_pickle = True)
pv_outputs = np.load(os.path.join(output_folder,'pv_outputs_highfreq.npy'), allow_pickle = True)

In [65]:
# Filter for repeating images (the OpenCV video capture function would sometimes fail to seek the correct frame 
# and instead erroneously repeat the current frame)
all_images_16 = all_images.astype('int16')

# Calculated the change within two continuous image
all_images_diff = all_images_16[1:] - all_images_16[:-1]
all_images_diff_sum = np.sum(np.abs(all_images_diff), axis = (1,2,3))

# create mask that record repeating status
is_repeating_mask = np.zeros(all_images.shape[0],dtype = bool)
is_repeating_mask[1:] = (all_images_diff_sum == 0)

# Cropping relevant information
all_times = all_times[~is_repeating_mask]
all_images = all_images[~is_repeating_mask]
pv_outputs = pv_outputs[~is_repeating_mask]

In [66]:
all_times.shape

(7163,)

In [67]:
#storing information
np.save(os.path.join(output_folder,'all_times_highfreq.npy'),all_times)
np.save(os.path.join(output_folder,'all_images_highfreq.npy'), all_images)
np.save(os.path.join(output_folder,'pv_outputs_highfreq.npy'), pv_outputs)

In [68]:
def store_trainval_test(all_times,image_log,pv_log,pred_folder):
    
    ## Splitting into Trainval and Test set 
    idx_test = find_idx_with_dates(all_times,test_dates)
    image_log_test = image_log[idx_test]
    pv_log_test = pv_log[idx_test]
    times_test = all_times[idx_test]

    # the rest become the trainval set
    mask_trainval = np.ones_like(pv_log,dtype = bool)
    mask_trainval[idx_test] = 0
    image_log_trainval = image_log[mask_trainval]
    pv_log_trainval = pv_log[mask_trainval]
    times_trainval = all_times[mask_trainval]
    
    print("times_trainval.shape",times_trainval.shape)
    print("image_log_trainval.shape",image_log_trainval.shape)
    print("pv_log_trainval.shape",pv_log_trainval.shape)
    
    print("times_test.shape",times_test.shape)
    print("image_log_test.shape",image_log_test.shape)
    print("pv_log_test.shape",pv_log_test.shape)
    
    #storing information
    # storing the training set
    np.save(os.path.join(pred_folder,'image_log_trainval.npy'), image_log_trainval)
    np.save(os.path.join(pred_folder,'pv_log_trainval.npy'), pv_log_trainval)
    np.save(os.path.join(pred_folder,'times_trainval.npy'),times_trainval)

    # storing the testing set
    np.save(os.path.join(pred_folder,'image_log_test.npy'), image_log_test)
    np.save(os.path.join(pred_folder,'pv_log_test.npy'), pv_log_test)
    np.save(os.path.join(pred_folder,'times_test.npy'),times_test)

In [69]:
store_trainval_test(all_times,all_images,pv_outputs,pred_folder)

times_trainval.shape (6492,)
image_log_trainval.shape (6492, 64, 64, 3)
pv_log_trainval.shape (6492,)
times_test.shape (671,)
image_log_test.shape (671, 64, 64, 3)
pv_log_test.shape (671,)
