In [1]:
import os
import os.path
import pickle
import shutil
import pandas as pd
import random
import math
import matplotlib.pyplot as plt
from os.path import join, exists
import numpy as np
from tqdm import tqdm
import random
import copy
%matplotlib inline

This notebook takes the prediction scores of the HR model, the LR model, and the blur-detection (OOD) model as inputs, and outputs the predicted installation year for each image sequence.

In [2]:
# Names of the sequence folder(s) for this run -- presumably sub-directories of the data root
# (TODO confirm). NOTE(review): not referenced by any of the cells visible in this part of the notebook.
dir_list = ['demo_sequences']

In [3]:
# Relative path of the directory holding the image sequences (path is relative to the working directory).
# NOTE(review): not referenced by any of the cells visible in this part of the notebook.
root_data_dir = 'data/sequences'

# 1. Load prob dicts

In [4]:
# HR-model scores. Only the keys (sequence ids) are used below, to drive the prediction loop.
# pickle executes arbitrary code on load: only open result files produced by this project.
with open('results/HR_prob_dict.pickle', 'rb') as hr_file:
    HR_prob_dict = pickle.load(hr_file)

In [5]:
# LR-model scores: sequence id -> {anchor filename -> {target filename -> probability}}.
# pickle executes arbitrary code on load: only open result files produced by this project.
with open('results/LR_prob_dict.pickle', 'rb') as lr_file:
    LR_prob_dict = pickle.load(lr_file)

In [6]:
# OOD/blur-model scores: sequence id -> {filename -> [OOD score, blur score]}.
# pickle executes arbitrary code on load: only open result files produced by this project.
with open('results/ood_prob_dict.pickle', 'rb') as ood_file:
    ood_prob_dict = pickle.load(ood_file)

# 2. Installation year detection

In [8]:
# Decision thresholds handed to the installation-year functions.
ood_threshold = 0.09   # OOD score below this  -> image is out-of-distribution (treated as missing)
blur_threshold = 0.29  # blur score >= this     -> image is treated as HR, otherwise as LR
LR_threshold = 0.97    # LR probability above this -> LR image counts as positive

In [9]:
# Given the LR prediction scores and the OOD/blur scores of one sequence, return the year of installation.
# Uses the OOD prediction (multiclass: [whether it is OOD, whether it is HR]).
# In this function, the reference list (HR_prob > thres) is assumed to be the keys of LR_prob_dict.
def hybrid_model_5(LR_prob_dict, blur_info, LR_threshold=0.5, ood_threshold=0.5, blur_threshold=0.5,
                   min_year=2005, max_year=2017):
    """
    Predict the installation year of one image sequence.

    LR_prob_dict: key1: anchor_filename, key2: target_filename, value: prob produced by LR model.
        Its first-level keys form the reference list (images already judged HR-positive).
    blur_info: key: filename, value: an array of two scores (index 0: OOD score, index 1: blur score).
    LR_threshold: a LR image is positive when any reference gives it a prob > LR_threshold.
    ood_threshold: an image whose OOD score is < ood_threshold is out-of-distribution
        ("impossible to detect") and is ignored as a target.
    blur_threshold: an image whose blur score is >= blur_threshold is HR, otherwise LR.
    min_year, max_year: the returned year is clamped into [min_year, max_year]
        (defaults reproduce the original hard-coded 2005..2017 range).

    Filenames must start with the year followed by '_' (e.g. '2012_0.png').

    Returns: (installation_year, positive_anchor) where positive_anchor is the earliest reference
        image that is HR and not OOD, or the latest reference image when there is none.
    Raises: IndexError when LR_prob_dict is empty (there is no reference image to anchor on).
    """
    def year_of(filename):
        """ Year encoded as the filename prefix before the first underscore. """
        return int(filename.split('_')[0])

    def clamp_year(year):
        """ Restrict a year to the supported [min_year, max_year] range. """
        return max(min(max_year, year), min_year)

    def is_anchor_candidate(f):
        """ Determine whether an image can be a candidate of the "positive anchor" based on its blur
        score and OOD score: it must be HR and must not be OOD. """
        scores = blur_info[f]
        return scores[1] >= blur_threshold and scores[0] >= ood_threshold

    # reference list: image filenames with HR prediction score >= HR_threshold.
    # Lexicographic order equals time order because filenames start with the year.
    reference_list = sorted(LR_prob_dict)
    if not reference_list:
        raise IndexError('LR_prob_dict is empty: there is no reference image to use as positive anchor')

    # the earliest usable HR reference is the "positive anchor"; fall back to the latest reference
    positive_anchor = next((f for f in reference_list if is_anchor_candidate(f)), reference_list[-1])
    anchor_year = year_of(positive_anchor)  # loop-invariant, so parsed once

    # find the earliest LR target that any reference image scores above the threshold
    for target in sorted(LR_prob_dict[positive_anchor]):
        if is_anchor_candidate(target):  # HR images are not LR targets
            continue
        target_year = year_of(target)
        if target_year > anchor_year:  # images from a later year than the anchor cannot move the date earlier
            continue
        if blur_info[target][0] < ood_threshold:  # OOD images carry no usable evidence
            continue
        if any(LR_prob_dict[ref][target] > LR_threshold for ref in reference_list):
            return clamp_year(target_year), positive_anchor

    # no earlier LR positive: the anchor's own year is the prediction
    return clamp_year(anchor_year), positive_anchor

In [10]:
# To gather all "critical" years that are missing but may change the year prediction.
# "critical" means that it is RIGHT before the predicted installation year
def backtrack_missing_critical_years(LR_prob_dict,
                                     blur_info,
                                     positive_anchor,
                                     installation_year,
                                     LR_threshold,
                                     ood_threshold,
                                     blur_threshold,
                                     min_year=2005,
                                     max_year=2017):
    """
    List the years directly before the predicted installation year for which no usable image exists.

    LR_prob_dict: key1: anchor_filename, key2: target_filename, value: prob produced by LR model.
        Its first-level keys form the reference list (images already judged HR-positive).
    blur_info: key: filename, value: an array of two scores (index 0: OOD score, index 1: blur score)
    positive_anchor: the anchor image filename which is the earliest HR positive sample
    installation_year: the predicted year of installation
    LR_threshold: a LR image is negative when every reference gives it a prob <= LR_threshold
        (the complement of the "> LR_threshold" positive test used for the prediction itself)
    ood_threshold: an image whose OOD score is < ood_threshold is out-of-distribution and is
        regarded as missing, unless it is a reference image
    blur_threshold: an image whose blur score is >= blur_threshold is HR, otherwise LR
    min_year, max_year: year range of the prediction (defaults reproduce the original 2005..2017)

    Filenames must start with the year followed by '_' (e.g. '2012_0.png').

    Returns: a list of missing critical years in descending order; when the prediction was clamped
        to max_year, max_year itself is appended; [] when nothing critical is missing.
    """
    all_images = sorted(LR_prob_dict[positive_anchor])  # all image filenames of the sequence, in time order

    # reference images (HR prediction score >= HR_threshold); only membership matters here
    reference_set = set(LR_prob_dict)

    # year -> usable images of that year (all years are collected; OOD images are regarded as missing)
    downloaded_years = {}
    for f in all_images:
        if blur_info[f][0] >= ood_threshold or f in reference_set:
            downloaded_years.setdefault(int(f.split('_')[0]), []).append(f)

    # backtracking: walk back from the year before the prediction until a covered year is found
    missing_critical_years = []
    curr_year = installation_year - 1
    while curr_year >= min_year and curr_year not in downloaded_years:
        missing_critical_years.append(curr_year)
        curr_year -= 1

    if not missing_critical_years:  # no missing
        return missing_critical_years

    if installation_year not in downloaded_years:
        # the actual evidence lies after max_year and the prediction was restricted to max_year
        assert installation_year == max_year, \
            'installation_year has no usable image although it was not clamped to max_year'
        return missing_critical_years + [max_year]

    for f in downloaded_years[installation_year]:
        # if any one of the images in the installation year is negative (HR negative or LR negative),
        # then one sample is positive and another is negative in that year,
        # thus the solar panel must be installed in that year
        # and there is no missing critical year
        if f in reference_set:
            continue  # positive image, tells nothing about a negative in this year
        if blur_info[f][1] >= blur_threshold:
            return []  # HR image that is not a reference -> HR negative
        if all(LR_prob_dict[ref][f] <= LR_threshold for ref in reference_set):
            return []  # LR image that no reference scores above the threshold -> LR negative

    return missing_critical_years  # a list of missing critical years

In [None]:
# For every sequence: predict the installation year, then collect the missing years right
# before it that could still change that prediction.
installation_year_dict = {}  # sequence idx -> predicted installation year
missing_years_dict = {}  # sequence idx -> a list of missing critical years (only stored when non-empty)
for idx in tqdm(HR_prob_dict):
    LR_prob_dict_sub = LR_prob_dict[idx]
    blur_info = ood_prob_dict[idx]
    installation_year, positive_anchor = hybrid_model_5(
        LR_prob_dict=LR_prob_dict_sub,
        blur_info=blur_info,
        LR_threshold=LR_threshold,
        ood_threshold=ood_threshold,
        blur_threshold=blur_threshold,
    )
    missing_years = backtrack_missing_critical_years(
        LR_prob_dict=LR_prob_dict_sub,
        blur_info=blur_info,
        positive_anchor=positive_anchor,
        installation_year=installation_year,
        LR_threshold=LR_threshold,
        ood_threshold=ood_threshold,
        blur_threshold=blur_threshold,
    )
    installation_year_dict[int(idx)] = installation_year
    if missing_years:
        missing_years_dict[int(idx)] = missing_years
print(len(installation_year_dict))
print(len(missing_years_dict))

In [53]:
# Persist both result dictionaries, one pickle file each, written one after the other.
for out_path, result in (
    ('results/installation_year_prediction_dict.pickle', installation_year_dict),
    ('results/missing_years_dict.pickle', missing_years_dict),
):
    with open(out_path, 'wb') as out_file:
        pickle.dump(result, out_file)