### The set up

In [None]:
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
import matplotlib.pyplot as plt
import os
import time

# Timestamp (local time, minute resolution) embedded in the names of the
# Excel checkpoint files this notebook writes.
save_timestamp = time.strftime('%Y-%m-%d_%H%M', time.localtime())

# Make scrolling output areas taller so a map image and its prompt fit on screen.
# `display`/`HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>div.output_scroll { height: 44em; }</style>"))




### Initiating our dataset

Here we'll load our base dataset and do some necessary transformations to get the data we want, in the shape we want it.

In [None]:
# Load the 'Europe' sheet of the tracker workbook and keep only operating,
# non-thermal plants whose coordinates are marked as exact, reduced to the
# five columns the labelling step needs. Capacity is coerced to a number.
df = (
    pd.read_excel('Global Solar Power Tracker - main V1.xlsx', sheet_name='Europe')
    .loc[lambda sheet: (sheet['Location accuracy'] == 'exact')
         & (sheet['Technology Type'] != 'Solar Thermal')
         & (sheet['Status'] == 'operating')]
    [['Country', 'Project Name', 'Capacity (MW)', 'Latitude', 'Longitude']]
    .reset_index(drop=True)
    .assign(**{'Capacity (MW)': lambda plants: pd.to_numeric(plants['Capacity (MW)'])})
)

# Smaller plants are harder to identify -- let's start with a 5MW threshold.
# Both thresholds are strict (> not >=), and each subset is re-indexed from 0.
df_10MW = df[df['Capacity (MW)'] > 10].reset_index(drop=True)
df_5MW = df[df['Capacity (MW)'] > 5].reset_index(drop=True)
df_5MW


Let's take a look at what our dataset looks like now

In [None]:
# Show the size and only the first rows instead of rendering the whole frame.
print(f'{len(df_5MW)} plants above the 5 MW threshold')
df_5MW.head(10)

Here we're just creating some subsets of the data to make the labelling task a little easier

In [None]:
# Each row of `subsetdf` is [index, Country, Project Name, Capacity (MW), Latitude, Longitude];
# the leading index is what the labelling loop uses as the key for each plant.
subsetdf = df_5MW.reset_index().values

# Consecutive batches of 300 rows. Slice ends are exclusive, so each batch must
# start where the previous one stopped (300, 600, ...). Starting at 301, 601, ...
# silently dropped rows 300, 600, 900, 1200 and 1500 from every batch.
batch1 = subsetdf[0:300]
batch2 = subsetdf[300:600]
batch3 = subsetdf[600:900]
batch4 = subsetdf[900:1200]
batch5 = subsetdf[1200:1500]
batch6 = subsetdf[1500:]


Since we won't be doing all the labelling in one go, we want a way to keep track of which plants we've already labelled. We do that later on by creating a document that records what's been labelled. We'll load that document here and reshape it so that it is easier to work with.

In [None]:
# Checkpoint workbook from the most recent labelling session.
# Earlier checkpoints, kept for reference:
#   'ImageFetching_2023-05-24.xlsx'
#   'ImageFetching_2023-05-26_1029.xlsx'
#   'ImageFetching_2023-05-29_1857.xlsx'
imagess = 'ImageFetching_2023-05-31_1417.xlsx'

# The first column of the workbook becomes the index (the plant's row key).
previmages_df = pd.read_excel(imagess, index_col=0)

# (row key, result) pairs, and the same information as a lookup dict.
previmages = list(previmages_df['Result'].items())
previmages_dict = dict(previmages)


## Initiating the labeling process

We're using BingMaps to pull aerial images of all the solar fields in our dataset.

This for loop will:
1. Skip any rows of the dataframe that have already been labelled
2. Take the lat/lon coordinates of each solar plant in our dataframe subset and pass them into the BingMaps API to retrieve an aerial image of these coordinates
3. Show the resulting image and prompt the user to label it as having a solar field or not
4. Save the image to our local directory as well as to that file keeping track of how we're labelling each image

In [None]:


# --- Labelling configuration (set once rather than on every iteration) ---
save_directory = "BingMaps Test Images"
os.makedirs(save_directory, exist_ok=True)  # image.save() fails if the folder is missing

# The Bing Maps key is read from the environment so it is never stored in the notebook.
# (A key that has been committed in a notebook should be treated as leaked and rotated.)
api_key = os.environ.get('BING_MAPS_API_KEY')
if not api_key:
    raise RuntimeError('Set the BING_MAPS_API_KEY environment variable before running this cell')

zoom_level = 18
map_width = 600
map_height = 600

# Answer typed by the user -> prefix of the saved image file.
# "EH" marks an unclear image to be double-checked manually later on.
label_prefixes = {'Y': 'positive', 'N': 'negative', 'EH': 'DOUBLECHECK'}
valid_answers = set(label_prefixes) | {'SKIP', 'DONE'}

plt.ion()

# Maps the row key of each plant handled in this session to its result
# (saved file name, 'Skipped' or 'Failed to retrieve image').
images = {}
for i, country, projname, cap, lat, lon in batch4:
    # Plants labelled in an earlier session are not shown again.
    if i in previmages_df.index:
        continue

    api_url = f'https://dev.virtualearth.net/REST/v1/Imagery/Map/Aerial/{lat},{lon}/{zoom_level}?mapsize={map_width},{map_height}&key={api_key}'

    try:
        # A timeout keeps one stalled request from hanging the whole session.
        response = requests.get(api_url, timeout=30)
    except requests.RequestException as err:
        images[i] = 'Failed to retrieve image'
        # Only the exception type is printed: the full message contains the URL and therefore the key.
        print(f'Failed to retrieve image ({type(err).__name__})')
        continue

    if response.status_code != 200:
        images[i] = 'Failed to retrieve image'
        print('Failed to retrieve image')
        continue

    image = Image.open(BytesIO(response.content))
    plt.imshow(image)
    plt.show()

    # Ask until a recognised answer is given, so a typo does not end the session.
    user_input = input('Is there a solar field in this image? (Y/N):').strip().upper()
    while user_input not in valid_answers:
        user_input = input('Please answer Y, N, EH, SKIP or DONE:').strip().upper()

    # DONE: stop before the batch is finished and write a checkpoint that
    # combines the earlier labels with the ones from this session.
    if user_input == 'DONE':
        images = {**previmages_dict, **images}
        imagesdf = pd.DataFrame.from_dict(images, orient='index', columns=['Result'])
        imagesdf_merged = pd.merge(imagesdf, df_5MW, left_index=True, right_index=True)
        imagesdf_merged.to_excel(f'ImageFetching_{save_timestamp}.xlsx')
        print(f'Ended on {projname} in {country}')
        break

    # SKIP: useful for duplicates in the dataframe, or for any image we do not want to use.
    if user_input == 'SKIP':
        images[i] = 'Skipped'
        continue

    # Y / N / EH: save the image under a prefix that records the label.
    # Path separators in the names would otherwise point into a non-existent sub-folder.
    safe_name = f'{country}_{projname}'.replace('/', '_').replace('\\', '_')
    filename = f"{label_prefixes[user_input]}_{safe_name}.jpg"
    image.save(os.path.join(save_directory, filename))
    images[i] = filename
            

        


In [None]:
# Combine the labels from earlier sessions with the ones from this session;
# where a row key appears in both, the label from this session wins.
images = {
    row_key: result
    for labels in (previmages_dict, images)
    for row_key, result in labels.items()
}

# One 'Result' column indexed by row key, joined to the plant details on that index.
imagesdf = pd.DataFrame.from_dict(images, orient='index', columns=['Result'])
imagesdf_merged = imagesdf.merge(df_5MW, left_index=True, right_index=True)

# Write the checkpoint workbook for the next session to load.
imagesdf_merged.to_excel(f'ImageFetching_{save_timestamp}.xlsx')

### Extra data, if needed

If we need more negative samples, we can repeat the process, except with a subset of the original dataframe where the lat/lon have already been marked as <i>not</i> containing solar fields. This way we can get some random landscape images to serve as negative samples in our training dataset.

In [None]:

# Re-read the 'Europe' sheet, this time keeping the operating, non-thermal plants
# whose location accuracy is 'approximate' (i.e. entries without exact coordinates).
df_neg = (
    pd.read_excel('Global Solar Power Tracker - main V1.xlsx', sheet_name='Europe')
    .loc[lambda sheet: (sheet['Location accuracy'] == 'approximate')
         & (sheet['Technology Type'] != 'Solar Thermal')
         & (sheet['Status'] == 'operating')]
    [['Country', 'Project Name', 'Capacity (MW)', 'Latitude', 'Longitude']]
    .reset_index(drop=True)
    .assign(**{'Capacity (MW)': lambda plants: pd.to_numeric(plants['Capacity (MW)'])})
)

# Plants strictly above 5 MW, as an array whose rows are
# [index, Country, Project Name, Capacity (MW), Latitude, Longitude].
df_neg_5MW = (
    df_neg[df_neg['Capacity (MW)'] > 5]
    .reset_index(drop=True)
    .reset_index()
    .values
)


In [None]:
#Repeat the labelling process with the negative dataframe
save_directory = "BingMaps Test Images"
plt.ion()

neg_images = {}
for i,country, projname, cap, lat, lon in df_neg_5MW:
    save_directory = "BingMaps Test Images"
    api_key = 'AmsQbXtxrmBgb0YK_x1LDIGMdCtq9YgTLfiKn76-I-QU2yGZneThmpzh4MTHT32x'
    zoom_level = 18
    map_width = 600
    map_height = 600

    #lat = df.loc[0]['Latitude']
    #lon = df.loc[0]['Longitude']

    api_url = f'https://dev.virtualearth.net/REST/v1/Imagery/Map/Aerial/{lat},{lon}/{zoom_level}?mapsize={map_width},{map_height}&key={api_key}'

    response = requests.get(api_url)

    if response.status_code == 200:
        image = Image.open(BytesIO(response.content))
        #images[i] = image

        plt.imshow(image)
        plt.show()
        #plt.pause(0.01)

        user_input = input('Is there a solar field in this image? (Y/N):')
        if user_input.upper() == 'Y':
            filename = f"positive_{country}_{projname}.jpg"
            image.save(os.path.join(save_directory, filename))
            images[i] = filename

        elif user_input.upper()=='N':
            images[i] = 'No solar field found'
            filename = f"negative_{country}_{projname}.jpg"
            image.save(os.path.join(save_directory, filename))
            images[i] = filename

        elif user_input.upper()=='EH':
            filename = f"DOUBLECHECK_{country}_{projname}.jpg"
            image.save(os.path.join(save_directory, filename))
            images[i] = filename
        elif user_input.upper()=="SKIP":
            images[i] = 'Skipped'
        elif user_input.upper()=='DONE':
            #images = {**previmages_dict, **images}
            imagesdf = pd.DataFrame.from_dict(images, orient='index', columns=['Result'])
            imagesdf_merged=pd.merge(imagesdf, df_5MW, left_index=True, right_index=True)
            imagesdf_merged.to_excel(f'NEGATIVES_ImageFetching_{save_timestamp}.xlsx')
            print(f'Ended on {projname} in {country}')
            break
        else:
            break

        #plt.close()
    else:
        images[i] = 'Failed to retrieve image'
        print('Failed to retrieve image')
