In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the CSV and
# image paths used by the following cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import sys
import os
import subprocess

from six import string_types

import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import scipy
from skimage import io
from scipy import ndimage
from IPython.display import display
%matplotlib inline

import tensorflow as tf
from tensorflow import keras


In [None]:
# Read the training label table from the mounted Drive folder.
file1 = (
    '/content/drive/MyDrive/Colab Notebooks/'
    'Understanding the Amazon from space - Kaggle/'
    'train_v2.csv/train_v2.csv'
)
train = pd.read_csv(filepath_or_buffer=file1)

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# (rows, columns) of the training table.
train.shape

In [None]:
# Read the test file-mapping table from the mounted Drive folder.
file2 = (
    '/content/drive/MyDrive/Colab Notebooks/'
    'Understanding the Amazon from space - Kaggle/'
    'test_v2_file_mapping.csv/test_v2_file_mapping.csv'
)
test = pd.read_csv(filepath_or_buffer=file2)

In [None]:
# Preview the first rows of the test file-mapping table.
test.head()

In [None]:
# (rows, columns) of the test file-mapping table.
test.shape

In [None]:
# Create the weather labels: for every row keep only the tokens of the
# space-separated `tags` string that name a weather category.
weather_categories = ['partly_cloudy', 'haze', 'cloudy', 'clear']
weather_tag_list = [
    [token for token in tag.split() if token in weather_categories]
    for tag in train['tags']
]
# Join with a space: a row that carries more than one weather token stays
# readable and splittable. The previous ''.join fused such rows into a single
# meaningless token (e.g. 'hazecloudy'). Rows with exactly one weather token
# are unaffected.
train['weather_tags'] = [' '.join(tokens) for tokens in weather_tag_list]
train.head()

In [None]:
# Build list with unique labels, in order of first appearance
label_list = []
for tag_str in train.tags.values:
    labels = tag_str.split(' ')
    for label in labels:
        if label not in label_list:
            label_list.append(label)


# Display label list and length
print(f'There are {len(train)} data samples, with {len(label_list)} possible classes.', '\n')
print(f'The label list includes: ')
# enumerate() numbers every label that was actually found. The previous
# zip(range(0, 17), ...) hard-coded the label count and would silently drop
# any label beyond the 17th.
labels_dict = dict(enumerate(label_list))
labels_dict

In [None]:
# One-hot encode the labels: add one 0/1 indicator column per entry of
# label_list to a copy of the training table.
# NOTE: label_list must be the unique list built above; a later cell rebinds
# the same name to the flattened list of all tag occurrences.
train_tag = train.copy()
for label in label_list:
    indicator = [int(label in tag_str.split()) for tag_str in train_tag['tags']]
    train_tag[label] = indicator

train_tag.head()

In [None]:
#print all unique tags
from itertools import chain
# NOTE: this rebinds label_list to the flattened list of every tag occurrence
# (duplicates kept); the histogram cell below counts these occurrences.
# split() without an argument ignores repeated/leading/trailing whitespace, so
# no empty-string "label" can appear.
label_list = list(chain.from_iterable(tags.split() for tags in train_tag['tags'].values))
label_set = set(label_list)
print(f"There are {len(label_set)} unique labels", '\n')
print(f'These unique label sets are: ')
# sorted() makes the displayed numbering reproducible (set iteration order of
# strings changes between kernel sessions); enumerate() replaces the
# hard-coded range(0, 17) that would truncate the mapping if the number of
# labels were different.
labels_set = dict(enumerate(sorted(label_set)))
labels_set



In [None]:
# Horizontal bar chart of how often each label occurs (label_list holds one
# entry per tag occurrence at this point).
tag_labels = pd.Series(label_list).value_counts()
fig, ax = plt.subplots(figsize=(16, 8))
# Draw on the axes created above explicitly rather than via the implicit
# "current axes".
sns.barplot(x=tag_labels, y=tag_labels.index, orient='h', ax=ax)

In [None]:
#function for co-occurrence matrix plotting
def make_cooccurence_matrix(labels):
  """Plot and return the co-occurrence matrix of the given label columns.

  Args:
    labels: iterable of column names to select from the global ``train_tag``
      frame. A list/tuple keeps its order; a set is sorted so that the axis
      order of the matrix is reproducible.

  Returns:
    The DataFrame ``num_data.T.dot(num_data)`` where ``num_data`` is the
    selected sub-frame. For 0/1 indicator columns, entry (a, b) is the number
    of rows in which both a and b are 1.

  Side effects:
    Draws a seaborn heatmap of the matrix on the current matplotlib axes.
  """
  # pandas >= 2.0 raises TypeError when a set is used as a column indexer, so
  # always index with a list.
  if isinstance(labels, (set, frozenset)):
    columns = sorted(labels)
  else:
    columns = list(labels)
  num_data = train_tag[columns]
  c_matrix = num_data.T.dot(num_data)
  sns.heatmap(c_matrix)
  return c_matrix

#compute the co-occurrence over all labels
# label_set is a set; pandas >= 2.0 refuses sets as column indexers and set
# order is not reproducible, so pass a sorted list instead.
make_cooccurence_matrix(sorted(label_set))

In [None]:
# Co-occurrence heatmap restricted to the weather labels
weather_labels = [
    'clear',
    'partly_cloudy',
    'haze',
    'cloudy',
]
make_cooccurence_matrix(labels=weather_labels)

In [None]:
# Co-occurrence heatmap restricted to the land-use labels
land_labels = [
    'primary',
    'agriculture',
    'water',
    'cultivation',
    'habitation',
]
make_cooccurence_matrix(labels=land_labels)

In [None]:
# Keep every column from the third one onward, i.e. (per the original note)
# the columns that follow image_name and tags.
# NOTE(review): 'weather_tags' was added to `train` before it was copied into
# train_tag, so this slice presumably also contains that string-valued column
# in addition to the 0/1 label columns - confirm this is intended.
train_tag_columns = train_tag.columns[2:].tolist()
print(train_tag_columns, end='')

In [None]:
# Append the '.jpg' extension to every image name (this is a rename, not a
# one-hot encoding).
# The cell is idempotent: names that already end in '.jpg' are left as they
# are, so running it twice no longer produces 'name.jpg.jpg'.
train_tag['image_name'] = train_tag['image_name'].apply(
    lambda x: x if str(x).endswith('.jpg') else f'{x}.jpg'
)
train_tag.head()

In [None]:
# Folder on the mounted Drive that holds the training JPEGs.
image_train_path = (
    '/content/drive/MyDrive/Colab Notebooks/'
    'Understanding the Amazon from space - Kaggle/'
    'planet/train-jpg'
)

In [None]:
# Echo the training-image folder path as a quick sanity check.
image_train_path

In [None]:
# tf.data dataset of the file paths matching every entry directly under the
# training-image folder.
# NOTE(review): the name `dir` shadows the Python builtin; it is kept because
# cells outside this view may refer to it.
dir = tf.data.Dataset.list_files(file_pattern=f'{image_train_path}/*')

In [None]:
# Folder on the mounted Drive that holds the training JPEGs.
# NOTE(review): identical to the assignment made a few cells earlier; one of
# the two is redundant.
image_train_path = (
    '/content/drive/MyDrive/Colab Notebooks/'
    'Understanding the Amazon from space - Kaggle/'
    'planet/train-jpg'
)

In [None]:
# Preview a handful of training images.
# The loop used to render one figure for every .jpg/.png in the folder, which
# floods the notebook output and reads every file from Drive. Only the first
# n_preview image files (in sorted, i.e. reproducible, order) are shown now.
n_preview = 5
preview_files = [
  name for name in sorted(os.listdir(image_train_path))
  if name.endswith((".jpg", ".png"))
][:n_preview]

for filename in preview_files:
  img = plt.imread(os.path.join(image_train_path, filename))
  plt.imshow(img)
  plt.show()

Data preprocessing

In [None]:
# Collect the sorted list of JPEG files found on disk for each image folder.
# These lists are what the CSV row counts would be compared against; the
# comparison itself is not performed in this cell.
import pathlib

#train path
train_image_dir = pathlib.Path(image_train_path)
train_img_path = sorted(train_image_dir.glob('*.jpg'))

#test path
test_img_dir = pathlib.Path(
    '/content/drive/MyDrive/Colab Notebooks/'
    'Understanding the Amazon from space - Kaggle/'
    'planet/test-jpg'
)
test_img_path = sorted(test_img_dir.glob('*.jpg'))

#additional test path
# NOTE(review): unlike the two folders above this is a relative path, so it is
# resolved against the current working directory, and the pattern looks one
# sub-folder deep - confirm this is the intended location.
test_add_img_dir = pathlib.Path('test-jpg-additional')
test_add_img_path = sorted(test_add_img_dir.glob('*/*.jpg'))

Image preprocessing

In [None]:
#define input size
input_size = 64

In [None]:
#create x_train and y_train
x_train = []
y_train = []

for f, tags in tqdm(train_tag.values, miniters=1000):
  img = cv2.imread('/content/drive/MyDrive/Colab Notebooks/Understanding the Amazon from space - Kaggle/planet/train-jpg/{}.jpg' .format(f))