# Process Images

## Import Packages

In [1]:
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split


from PIL import Image, ImageDraw
from sklearn.model_selection import train_test_split

import pickle
import geopandas as gp

import matplotlib.pyplot as plt

# Pixel size of the canvas on which boundary polygons are rasterised
# (presumably the resolution of the street-view photos - TODO confirm).
img_width = 5184
img_height = 3888

# Root folder that is scanned for per-folder image files.
IMAGE_DIRECTORY = "StreetviewImages/"

## Process files

Load the pickled label data

In [44]:
# Load the labelled records.
# NOTE: unpickling can execute arbitrary code - only load a trusted file here.
with open("prev/confidence_data.pickle", "rb") as f:
    data = np.array(pickle.load(f))

# Column 0 of each record -> [column 3, column 4]. Judging by how the entries
# are used later in this file, that is image path -> [crop label, tillage label].
full_dataset = {row[0]: [row[3], row[4]] for row in data}

# Column 1 is compared against boundary-file paths when the images are scanned.
mask_map = data[:, 1]

Geotagging Function

In [45]:
def geotag(path, mask_path, min_height=5400, center_only=False):
    """Black out the rows of an image that no boundary polygon touches.

    The polygons stored at ``mask_path`` are rasterised onto a canvas of
    ``img_width`` x ``img_height`` pixels (module-level constants). Every image
    row on which that raster is empty is set to 0. Masking is done per row,
    not per pixel: pixels that lie outside the polygons but on a covered row
    are left untouched.

    Args:
        path: Image file, opened with PIL.
        mask_path: File readable by ``geopandas.read_file`` whose ``geometry``
            column holds polygons. Only each polygon's exterior ring is drawn.
            Coordinates are presumably image pixel (x, y) - TODO confirm.
        min_height: Accepted for backward compatibility only; it has never
            influenced the returned image.
        center_only: When true, additionally crop the result to the first
            contiguous band of covered rows. If no row is covered at all,
            the (fully blacked-out) image is returned uncropped.

    Returns:
        PIL.Image.Image: The masked, optionally cropped, image.
    """
    # np.array() copies the pixel data, so the file can be closed right away
    # and the array is safe to modify in place.
    with Image.open(path) as source:
        image = np.array(source)

    # Draw every polygon onto one shared canvas; filling with 1 each time
    # yields the union of all polygons.
    canvas = Image.new('L', (img_width, img_height), 0)
    painter = ImageDraw.Draw(canvas)
    for poly in gp.read_file(mask_path)['geometry']:
        xs, ys = poly.exterior.coords.xy
        painter.polygon(list(zip(xs, ys)), outline=1, fill=1)

    # True for each canvas row that contains at least one polygon pixel.
    row_has_mask = np.array(canvas, dtype=bool).any(axis=1)

    # Zero the rows without any polygon coverage. Index-based assignment works
    # for both greyscale (2-D) and colour (3-D) arrays.
    image[np.flatnonzero(~row_has_mask)] = 0

    if center_only:
        covered_rows = np.flatnonzero(row_has_mask)
        if covered_rows.size:
            start_row = int(covered_rows[0])
            gaps = np.flatnonzero(~row_has_mask[start_row:])
            if gaps.size:
                # NOTE(review): the "+ 1" keeps the first blank row that ends
                # the band, as the previous implementation did - confirm
                # whether that extra row is wanted.
                end_row = start_row + int(gaps[0]) + 1
            else:
                end_row = row_has_mask.size
            image = image[start_row:end_row]

    return Image.fromarray(image)

## Process Images

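Scan the image directory, keep the images that have both a label entry and a boundary file, and count them per crop and tillage class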

In [46]:
# Running totals: all usable images, and per crop group.
data_size = 0
corn_data_size = 0
soybean_data_size = 0
other_data_size = 0

# Image counts per tillage label (list index = label 0-3, in the order of the
# summary-table columns below). The group total is appended after printing.
data_distribution = [0, 0, 0, 0]
corn_data_distribution = [0, 0, 0, 0]
soybean_data_distribution = [0, 0, 0, 0]
other_data_distribution = [0, 0, 0, 0]

# image path -> [crop label, tillage label, mask path, file name]
corn_data = {}
soybean_data = {}
other_data = {}

# Build the mask lookup once: `in` on a NumPy array rescans the whole array
# for every file, a set answers in constant time.
known_masks = set(mask_map)

# Sorted so the dictionaries are filled in the same order on every run.
for folders in sorted(os.listdir(IMAGE_DIRECTORY)):
    folder_path = IMAGE_DIRECTORY + folders
    if not os.path.isdir(folder_path):
        # A stray file next to the image folders would make os.listdir raise.
        continue

    for files in sorted(os.listdir(folder_path)):
        path = folder_path + "/" + files
        mask_path = "StreetviewBoundaries/" + folders + "/" + files + ".geojson"

        # Keep only images that are labelled, whose boundary file is listed in
        # the pickle, and whose image and boundary files both exist on disk.
        if path not in full_dataset or mask_path not in known_masks:
            continue
        if not (os.path.exists(path) and os.path.exists(mask_path)):
            continue

        data_size += 1

        # Build a fresh record instead of appending to the list stored in
        # full_dataset; otherwise re-running this cell keeps growing the
        # stored entries and the later 4-way unpacking fails.
        crop_type, tillage_type = full_dataset[path][:2]
        info = [crop_type, tillage_type, mask_path, files]
        data_distribution[tillage_type] += 1

        if crop_type == 5:
            soybean_data_size += 1
            soybean_data_distribution[tillage_type] += 1
            soybean_data[path] = info
        elif crop_type == 1:
            corn_data_size += 1
            corn_data_distribution[tillage_type] += 1
            corn_data[path] = info
        else:
            other_data_size += 1
            other_data_distribution[tillage_type] += 1
            other_data[path] = info


print("Dataset size: ", data_size)
print("Dataset distribution: ", data_distribution)
data_distribution.append(data_size)

print("\nCorn dataset size: ", corn_data_size)
print("Corn dataset distribution: ", corn_data_distribution)
corn_data_distribution.append(corn_data_size)

print("\nSoybean dataset size: ", soybean_data_size)
print("Soybean dataset distribution: ", soybean_data_distribution)
soybean_data_distribution.append(soybean_data_size)

print("\nOther dataset size: ", other_data_size)
print("Other dataset distribution: ", other_data_distribution)
other_data_distribution.append(other_data_size)

print()
# One row per group: four per-label counts followed by the group total.
data_table = np.array([data_distribution, corn_data_distribution, soybean_data_distribution, other_data_distribution])

dtf = pd.DataFrame(data_table)
dtf.index = ["Full dataset", "Corn Dataset", "Soybean Dataset", "Other Dataset"]
dtf.columns = ["High tillage", "Low tillage", "No tillage", "Grass", "Datasize"]
print(dtf)

Dataset size:  908
Dataset distribution:  [291, 329, 111, 177]

Corn dataset size:  395
Corn dataset distribution:  [78, 199, 73, 45]

Soybean dataset size:  346
Soybean dataset distribution:  [171, 84, 17, 74]

Other dataset size:  167
Other dataset distribution:  [42, 46, 21, 58]

                 High tillage  Low tillage  No tillage  Grass  Datasize
Full dataset              291          329         111    177       908
Corn Dataset               78          199          73     45       395
Soybean Dataset           171           84          17     74       346
Other Dataset              42           46          21     58       167


### Separate Crop Types

#### Corn

In [47]:
# Roughly 70 / 15 / 15 split of the corn image paths: 30 % is held out first,
# then the hold-out is halved into validation and test.
# NOTE(review): no random_state is passed, so the split differs on every run.
corn_key = [*corn_data]
corn_train, corn_other = train_test_split(corn_key, test_size=0.3)
corn_val, corn_test = train_test_split(corn_other, test_size=0.5)
corn_dataset = dict(train=corn_train, val=corn_val, test=corn_test)

#### Soybean

In [48]:
# Roughly 70 / 15 / 15 split of the soybean image paths: 30 % is held out
# first, then the hold-out is halved into validation and test.
# NOTE(review): no random_state is passed, so the split differs on every run.
soybean_key = [*soybean_data]
soybean_train, soybean_other = train_test_split(soybean_key, test_size=0.3)
soybean_val, soybean_test = train_test_split(soybean_other, test_size=0.5)
soybean_dataset = dict(train=soybean_train, val=soybean_val, test=soybean_test)

In [49]:
def store_image(image_dir, geotagging_dir, phase, path, info):
    """Save an image and its geotagged version into per-phase class folders.

    Both files are written as ``<dir>/<phase>/<tillage folder>/<file_name>``,
    once under ``image_dir`` (the image re-saved through PIL) and once under
    ``geotagging_dir`` (the output of ``geotag``). Missing directories,
    including the per-tillage image folder, are created first.

    Args:
        image_dir: Root directory for the plain images.
        geotagging_dir: Root directory for the geotagged images.
        phase: Name of the sub-folder below each root (e.g. a split name).
        path: Source image file.
        info: Four-item sequence ``(crop_type, tillage_type, mask_path,
            file_name)``; ``crop_type`` is not used here.

    Raises:
        ValueError: If ``info`` does not contain exactly four items.
        KeyError: If ``tillage_type`` is not one of 0, 1, 2, 3.
    """
    _crop_type, tillage_type, mask_path, file_name = info

    tillage_dirs = {
        0: "high_tillage",
        1: "low_tillage",
        2: "no_tillage",
        3: "grass",
    }
    class_dir = tillage_dirs[tillage_type]

    image_target_dir = os.path.join(image_dir, phase, class_dir)
    geotagging_target_dir = os.path.join(geotagging_dir, phase, class_dir)

    # makedirs creates the phase level and the tillage level in one call, for
    # BOTH trees (the image tree's tillage folder used to be left uncreated).
    os.makedirs(image_target_dir, exist_ok=True)
    os.makedirs(geotagging_target_dir, exist_ok=True)

    # Close the source file as soon as the copy has been written.
    with Image.open(path) as image:
        image.save(os.path.join(image_target_dir, file_name))

    geotagging_image = geotag(path, mask_path)
    geotagging_image.save(os.path.join(geotagging_target_dir, file_name))


In [27]:
def processImages(main_dir, type, crop_dataset, crop_data):
    """Store images by train, val, test split and tillage class.

    Images are written to
    ``<main_dir>_image/<type>_dataset/<phase>/<tillage folder>/<file_name>``.
    If ``<main_dir>_image/<type>_dataset`` already exists, a message is
    printed and nothing is written, so a finished (or partially finished)
    export is never overwritten.

    Args:
        main_dir: Path prefix; ``"_image/"`` is appended to form the root.
        type: Dataset name used for the ``<type>_dataset`` folder. (The name
            shadows the builtin but is kept for backward compatibility.)
        crop_dataset: Mapping with the keys ``"train"``, ``"val"`` and
            ``"test"``, each holding an iterable of image paths.
        crop_data: Mapping from image path to a four-item sequence
            ``(crop_type, tillage_type, mask_path, file_name)``.

    Raises:
        ValueError: If a record's ``tillage_type`` is not one of 0, 1, 2, 3.
            Such records used to be written to a malformed path silently.
    """
    tillage_dirs = {
        0: "high_tillage",
        1: "low_tillage",
        2: "no_tillage",
        3: "grass",
    }

    # makedirs also creates missing parents of main_dir.
    root_dir = main_dir + "_image/"
    os.makedirs(root_dir, exist_ok=True)

    image_dir = root_dir + type + "_dataset"
    if os.path.exists(image_dir):
        print("Directory already exists")
        return
    os.mkdir(image_dir)

    for phase in ("train", "val", "test"):
        for path in crop_dataset[phase]:
            _crop_type, tillage_type, _mask_path, file_name = crop_data[path]

            if tillage_type not in tillage_dirs:
                raise ValueError(
                    f"Unknown tillage type {tillage_type!r} for image {path!r}"
                )

            target_dir = os.path.join(image_dir, phase, tillage_dirs[tillage_type])
            os.makedirs(target_dir, exist_ok=True)

            # Only the plain image is stored here; no geotagged copy is made.
            with Image.open(path) as image:
                image.save(os.path.join(target_dir, file_name))

In [51]:
# Export each crop with its own split and records (the soybean call used to
# pass the corn data, which wrote corn images into the soybean folder).
processImages("TARGET_DIR", "corn", corn_dataset, corn_data)
processImages("TARGET_DIR", "soybean", soybean_dataset, soybean_data)