<a href="https://colab.research.google.com/github/utukJ/Graded_quiz/blob/master/stage_D_kaggle_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Upload the Kaggle API token (kaggle.json) from the local machine.
from google.colab import files

# Bind the result to a name: as a bare last expression the return value of
# files.upload() would be echoed as the cell output and saved with the
# notebook, which would leak the uploaded API token.
uploaded_files = files.upload()

In [None]:
!pip uninstall -y kaggle
!pip install --upgrade pip
!pip install kaggle==1.5.6
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
!kaggle datasets download -d nikitarom/planets-dataset
!mkdir planets_dataset
!unzip planets-dataset -d planets_dataset

In [None]:
# Mount Google Drive; the training cell writes model checkpoints below this path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [None]:
# importing relevant libraries

import os
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from tensorflow import keras
from keras.layers import Conv2D, Dense, Flatten, MaxPooling2D, BatchNormalization, Dropout
from keras.optimizers import Adam, RMSprop
from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import cv2
from PIL import Image

In [None]:
## Load the competition CSV files and define the label set.

train_classes = pd.read_csv("/content/planets_dataset/planet/planet/train_classes.csv")
sample_sub = pd.read_csv("/content/planets_dataset/planet/planet/sample_submission.csv")

# Tag name -> integer id.
# The insertion order of the keys is significant: the cells below iterate over
# label_map.keys() to build the one-hot target columns and to turn predictions
# back into tag strings, so the entries must stay in this order.
label_map = dict(
    agriculture=14,
    artisinal_mine=5,
    bare_ground=1,
    blooming=3,
    blow_down=0,
    clear=10,
    cloudy=16,
    conventional_mine=2,
    cultivation=4,
    habitation=9,
    haze=6,
    partly_cloudy=13,
    primary=7,
    road=11,
    selective_logging=12,
    slash_burn=8,
    water=15,
)

train_classes.head()

In [None]:
# Training dataframe for "flow_from_dataframe": the original columns, the full
# path of every image and one 0/1 indicator column per label.

train_dir = "/content/planets_dataset/planet/planet/train-jpg/"

modified_df = train_classes.copy()
modified_df["image_path"] = train_dir + modified_df["image_name"] + ".jpg"

# Split every space-separated tag string once, then test membership per label.
tags_per_image = modified_df["tags"].str.split(" ")
for label in label_map:
    modified_df[label] = tags_per_image.apply(lambda tags: int(label in tags))

# Quick look at the result
modified_df.head()


# Data pipelines that feed the model directly from the dataframe.

img_size = 128

# Training pipeline: rescaling plus random rotations, shifts and flips.
data_gen = ImageDataGenerator(rescale = 1./255,
                              validation_split = 0.2,
                              rotation_range = 180,
                              width_shift_range = 20,
                              height_shift_range = 20,
                              horizontal_flip = True,
                              vertical_flip = True,
                              fill_mode = "reflect")

# Validation pipeline: rescaling only, no augmentation.
valid_data_gen = ImageDataGenerator(rescale = 1./255,
                                    validation_split = 0.2)

# Arguments shared by both iterators. y_col is passed as a real list (not a
# dict_keys view) so the column selection on the dataframe is unambiguous.
flow_kwargs = dict(dataframe = modified_df,
                   x_col = "image_path",
                   y_col = list(label_map.keys()),
                   class_mode = "raw",
                   seed = 231,
                   target_size = (img_size, img_size))

train_data_eff = data_gen.flow_from_dataframe(shuffle = True,
                                              subset = "training",
                                              **flow_kwargs)

# shuffle = False: validation metrics do not depend on the sample order, and a
# fixed order keeps labels and predictions aligned when the iterator is
# traversed more than once (as the threshold-tuning cell does).
valid_data_eff = valid_data_gen.flow_from_dataframe(shuffle = False,
                                                    subset = "validation",
                                                    **flow_kwargs)


In [None]:
# visualizing some images and their transformations after augmentation


def to_tag(label_arr):
  """Return the space-separated tag names encoded by the first row of a label batch.

  label_arr: 2-D batch of 0/1 label indicators; only row 0 is used. Its columns
             are matched positionally against the keys of the global label_map.
  Returns:   the names whose indicator equals 1, joined by single spaces, in
             label_map key order (empty string when no label is set).
  """
  first_row = label_arr[0]
  # zip pairs names with indicators, so the label count follows label_map
  # instead of being hard-coded.
  return " ".join(name for name, flag in zip(label_map, first_row) if flag == 1)


# Side-by-side preview of augmented and non-augmented training images.
# Dedicated single-image iterators are created here so the preview neither
# depends on names defined nowhere in the notebook nor advances the iterators
# used for training.
n_previews = 5

preview_kwargs = dict(dataframe = modified_df,
                      x_col = "image_path",
                      y_col = list(label_map.keys()),
                      class_mode = "raw",
                      batch_size = 1,
                      shuffle = True,
                      seed = 231,
                      target_size = (img_size, img_size),
                      subset = "training")

# Same seed and subset for both iterators, so they are expected to visit the
# images in the same order -- TODO confirm visually on the first run.
aug_preview = data_gen.flow_from_dataframe(**preview_kwargs)
plain_preview = valid_data_gen.flow_from_dataframe(**preview_kwargs)

# One figure with one row per sample (instead of a new figure per iteration).
fig, axes = plt.subplots(n_previews, 2, figsize = (10, 20))

for i in range(n_previews):

  image, label = next(aug_preview)
  p_image, p_label = next(plain_preview)
  print("label: ", to_tag(label))

  # Batches have a leading batch axis of size 1; drop it for imshow.
  axes[i, 0].imshow(image[0])
  axes[i, 0].set_title("Augmented ")
  axes[i, 1].imshow(p_image[0])
  axes[i, 1].set_title("Not augmented ")

plt.show()

In [None]:
# function for fbeta evaluation metric

import numpy as np
from keras import backend as K


def fbeta(y_true, y_pred, threshold_shift=0):
    """F-beta score (beta = 2) computed with Keras backend ops, usable as a metric.

    True positives, false positives and false negatives are summed over every
    element of the batch (all samples and all labels together) before
    precision and recall are formed.

    y_true:          tensor of 0/1 ground-truth indicators.
    y_pred:          tensor of predicted scores; clipped to [0, 1].
    threshold_shift: value added to y_pred before rounding, i.e. the decision
                     threshold becomes 0.5 - threshold_shift.
    Returns:         scalar tensor with the F2 score.
    """
    beta = 2

    # guard against a final activation that is not bounded to [0, 1]
    y_pred = K.clip(y_pred, 0, 1)

    # binarise; a non-zero shift moves the decision threshold away from .5
    y_pred_bin = K.round(y_pred + threshold_shift)

    # epsilon keeps precision/recall defined when there are no true positives
    tp = K.sum(K.round(y_true * y_pred_bin)) + K.epsilon()
    fp = K.sum(K.round(K.clip(y_pred_bin - y_true, 0, 1)))
    # False negatives must be counted on the binarised predictions as well;
    # using the raw scores here ignored threshold_shift for recall.
    fn = K.sum(K.round(K.clip(y_true - y_pred_bin, 0, 1)))

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    beta_squared = beta ** 2
    return (beta_squared + 1) * (precision * recall) / (beta_squared * precision + recall + K.epsilon())

In [None]:

def optimise_f2_thresholds(y, p, verbose=True, resolution=100):
  """Greedy per-label search for decision thresholds that maximise the F2 score.

  Labels are processed one at a time: for the current label every candidate
  k / resolution (k = 0 .. resolution - 1) is tried while the other thresholds
  keep their current value, and the best candidate is then fixed.

  credits https://www.kaggle.com/c/planet-understanding-the-amazon-from-space/discussion/32475

  y:          (n_samples, n_labels) array of 0/1 ground-truth indicators.
  p:          (n_samples, n_labels) array of predicted scores.
  verbose:    print label index, chosen threshold and score after each label.
  resolution: number of candidate thresholds per label.
  Returns:    list with one threshold per label.
  """
  y = np.asarray(y)
  p = np.asarray(p)
  # Follow the input instead of assuming exactly 17 labels.
  n_labels = p.shape[1]

  def mf(x):
    # Binarise all columns at once. The builtin dtype handling replaces the
    # np.int alias, which no longer exists in NumPy >= 1.24.
    p2 = (p > np.asarray(x)).astype(p.dtype)
    return fbeta_score(y, p2, beta=2, average='samples')

  x = [0.2] * n_labels
  for i in range(n_labels):
    best_threshold = 0
    best_score = 0
    for step in range(resolution):
      candidate = step / resolution
      x[i] = candidate
      score = mf(x)
      # strict '>' keeps the smallest threshold among ties
      if score > best_score:
        best_threshold = candidate
        best_score = score
    x[i] = best_threshold
    if verbose:
      print(i, best_threshold, best_score)

  return x

In [None]:
## vgg based model

from keras.applications import VGG19


# ImageNet-pretrained VGG19 convolutional base, without its classifier head.
vgg_net = VGG19(weights = 'imagenet',
                include_top = False,
                input_shape = (img_size, img_size, 3))

# Base network followed by a small dense head with one sigmoid output per label.
model = keras.Sequential([
    vgg_net,
    Flatten(),
    Dense(1024, activation = "relu"),
    Dropout(0.25),
    Dense(17, activation = "sigmoid"),
])

# Used by the training cell to name the checkpoint file.
model_name = "nnet_vggnet"

In [None]:
## xception based model

from keras.applications import Xception


# ImageNet-pretrained Xception convolutional base, without its classifier head.
xception_model = Xception(weights="imagenet",
                          include_top=False,
                          input_shape=(img_size, img_size, 3))

# Base network followed by a small dense head with one sigmoid output per label.
model = tf.keras.Sequential([
    xception_model,
    Flatten(),
    Dense(1024, "relu"),
    Dropout(0.25),
    Dense(17, activation = "sigmoid"),
])

# Used by the training cell to name the checkpoint file.
model_name = "nnet_xception"


In [None]:
# Resnet50 based model 

from keras.applications.resnet50 import ResNet50

# ImageNet-pretrained ResNet50 convolutional base, without its classifier head.
res_net_model = ResNet50(weights = "imagenet",
                         include_top = False,
                         input_shape = (img_size, img_size, 3))

# Base network followed by a small dense head with one sigmoid output per label.
model = keras.Sequential([
    res_net_model,
    Flatten(),
    Dense(1024, activation = "relu"),
    Dropout(0.25),
    Dense(17, activation = "sigmoid"),
])

# Used by the training cell to name the checkpoint file.
model_name = "nnet_resnet50"

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
## compiling model and training

from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# All callbacks watch the validation F2 score reported by the `fbeta` metric.
monitored_metric = 'val_fbeta'

# ModelCheckpoint writes into this Drive folder; create it up front so the
# first save does not fail on a missing directory.
weights_dir = "/content/gdrive/My Drive/weights"
os.makedirs(weights_dir, exist_ok = True)

callbacks = [
    # stop when the score has not improved by at least 1e-4 for 5 epochs
    EarlyStopping(monitor = monitored_metric,
                  mode = "max",
                  patience = 5,
                  verbose = 1,
                  min_delta = 1e-4),
    # divide the learning rate by 10 after 2 epochs without improvement
    ReduceLROnPlateau(monitor = monitored_metric,
                      factor = 0.1,
                      patience = 2,
                      cooldown = 2,
                      mode = "max",
                      verbose = 1),
    # keep only the weights of the best epoch
    ModelCheckpoint(filepath = "/content/gdrive/My Drive/weights/{}_huge.hdf5".format(model_name),
                    monitor = monitored_metric,
                    mode = "max",
                    save_weights_only = True,
                    save_best_only = True,
                    verbose = 1),
]

model.compile(loss = "binary_crossentropy", optimizer = Adam(1e-4), metrics = [fbeta])

# The callbacks list has to be handed to fit(); without it none of the
# callbacks configured above is ever invoked. The History object is bound to a
# name so it stays available and is not echoed as the cell output.
history = model.fit(train_data_eff,
                    validation_data = valid_data_eff,
                    epochs = 50,
                    callbacks = callbacks)

In [None]:
 ## evaluating trained model

# Keep both results: as bare expressions only the last evaluate() call would
# be displayed, and its values would carry no labels.
valid_metrics = model.evaluate(valid_data_eff)
train_metrics = model.evaluate(train_data_eff)

# One row per subset, one column per reported value.
pd.DataFrame([valid_metrics, train_metrics],
             index = ["validation", "train"],
             columns = model.metrics_names)




[0.07261151820421219, 0.9013378024101257]

In [None]:

# Gather validation labels and predictions in a single pass.
# Each batch supplies both its labels and the predictions for exactly those
# images, so the rows of y_val and p_val are aligned regardless of the order in
# which the iterator serves the samples. len() of the iterator replaces the
# hard-coded batch count.
label_batches, pred_batches = [], []

for batch_idx in range(len(valid_data_eff)):
  batch_images, batch_labels = valid_data_eff[batch_idx]
  label_batches.append(batch_labels)
  pred_batches.append(np.asarray(model.predict_on_batch(batch_images)))

# Concatenate once at the end instead of growing an array inside the loop.
y_val = np.concatenate(label_batches)
p_val = np.concatenate(pred_batches)

print("p val shape: ", p_val.shape)
print("y val shape: ", y_val.shape)
  

p val shape:  (8095, 17)
y val shape:  (8095, 17)


In [None]:
# Tune one decision threshold per label on the validation predictions.
thresholds = optimise_f2_thresholds(y = y_val,
                                    p = p_val,
                                    verbose = True,
                                    resolution = 100)

0 0.21 0.9194155385903352
1 0.1 0.9194926079993877
2 0.14 0.919612357917968
3 0.23 0.9196126921523974
4 0.06 0.9196889455357464
5 0.3 0.9199276463850321
6 0.02 0.9225129460045101
7 0.24 0.9225169830321274
8 0.26 0.9226441984780803
9 0.18 0.9227042721893643
10 0.14 0.9234138060283977
11 0.15 0.9235607866411317
12 0.44 0.9242074009261954
13 0.33 0.9245146599404346
14 0.2 0.9245146599404346
15 0.16 0.9245454511284396
16 0.15 0.9247787857396048


In [None]:
# Test dataframe for "flow_from_dataframe": one row per sample-submission
# entry plus the full path of its image. The test images are spread over two
# folders.

test_dir = "/content/planets_dataset/planet/planet/test-jpg/"
additional_dir = "/content/planets_dataset/test-jpg-additional/test-jpg-additional/"

# Pick the folder of every image from the files that actually exist in
# test_dir rather than from the row position: a positional split breaks as soon
# as test_dir holds a stray file or sample_sub is ordered differently.
test_files = set(os.listdir(test_dir))
file_names = sample_sub["image_name"] + ".jpg"
in_test_dir = file_names.isin(test_files)

num_test = int(in_test_dir.sum())

image_folders = in_test_dir.map({True: test_dir, False: additional_dir})

# assign() returns a new frame, so sample_sub itself stays untouched.
modified_test_df = sample_sub.assign(image_dir = image_folders + file_names)

modified_test_df.head()


Unnamed: 0,image_name,tags,image_dir
0,test_0,primary clear agriculture road water,/content/planets_dataset/planet/planet/test-jp...
1,test_1,primary clear agriculture road water,/content/planets_dataset/planet/planet/test-jp...
2,test_2,primary clear agriculture road water,/content/planets_dataset/planet/planet/test-jp...
3,test_3,primary clear agriculture road water,/content/planets_dataset/planet/planet/test-jp...
4,test_4,primary clear agriculture road water,/content/planets_dataset/planet/planet/test-jp...
...,...,...,...
61186,file_9995,primary clear agriculture road water,/content/planets_dataset/test-jpg-additional/t...
61187,file_9996,primary clear agriculture road water,/content/planets_dataset/test-jpg-additional/t...
61188,file_9997,primary clear agriculture road water,/content/planets_dataset/test-jpg-additional/t...
61189,file_9998,primary clear agriculture road water,/content/planets_dataset/test-jpg-additional/t...


In [None]:
# creating a data pipeline for the test data

# Test images are only rescaled, never augmented.
test_datagen = ImageDataGenerator(rescale = 1./255)

# shuffle = False keeps the predictions in dataframe row order;
# class_mode = None makes the iterator yield images without targets.
test_data = test_datagen.flow_from_dataframe(
    modified_test_df,
    x_col = "image_dir",
    class_mode = None,
    target_size = (img_size, img_size),
    batch_size = 32,
    shuffle = False,
)


Found 61191 validated image filenames.
Found 61191 validated image filenames.


In [None]:
# Predict scores for the test images, then binarise them with the tuned
# per-label thresholds (one threshold per output column).

test_scores = model.predict(test_data, verbose = 1)
predictions = (test_scores > thresholds).astype(int)




In [None]:
 # converting output to label tags for submission.csv file

mySubmission = sample_sub.copy()
sorted_labels = list(label_map.keys())

# Every submission row needs exactly one prediction row.
if len(predictions) != len(mySubmission):
  raise ValueError("expected {} prediction rows, got {}".format(len(mySubmission), len(predictions)))

# Turn every 0/1 prediction row into its space-separated tag string and write
# the whole column in one assignment. Element-wise chained assignment
# (frame["tags"][i] = ...) is slow and is not guaranteed to modify the frame.
label_names = np.array(sorted_labels)
mySubmission["tags"] = [" ".join(label_names[row == 1]) for row in predictions]


# saving the submission to a csv file

mySubmission.to_csv("submission.csv", index = False)



  0%|          | 0/61191 [00:00<?, ?it/s][A
  1%|          | 357/61191 [00:00<00:17, 3561.99it/s][A
  1%|          | 685/61191 [00:00<00:17, 3470.07it/s][A
  2%|▏         | 989/61191 [00:00<00:18, 3328.17it/s][A
  2%|▏         | 1347/61191 [00:00<00:17, 3399.65it/s][A
  3%|▎         | 1670/61191 [00:00<00:17, 3346.72it/s][A
  3%|▎         | 1988/61191 [00:00<00:17, 3293.93it/s][A
  4%|▍         | 2342/61191 [00:00<00:17, 3362.98it/s][A
  4%|▍         | 2660/61191 [00:00<00:17, 3302.37it/s][A
  5%|▍         | 3005/61191 [00:00<00:17, 3343.60it/s][A
  5%|▌         | 3326/61191 [00:01<00:18, 3181.89it/s][A
  6%|▌         | 3637/61191 [00:01<00:18, 3129.89it/s][A
  6%|▋         | 3969/61191 [00:01<00:17, 3182.48it/s][A
  7%|▋         | 4284/61191 [00:01<00:18, 3080.97it/s][A
  8%|▊         | 4591/61191 [00:01<00:18, 3072.57it/s][A
  8%|▊         | 4934/61191 [00:01<00:17, 3170.48it/s][A
  9%|▊         | 5279/61191 [00:01<00:17, 3247.28it/s][A
  9%|▉         | 5605/61191 [

In [None]:
# Display the submission frame as the cell output (the rendering of a long
# frame is truncated, as the elided rows in the output show).
mySubmission

Unnamed: 0,image_name,tags
0,test_0,clear primary
1,test_1,clear primary
2,test_2,partly_cloudy primary
3,test_3,agriculture clear cultivation primary
4,test_4,cloudy partly_cloudy primary
...,...,...
61186,file_9995,cloudy
61187,file_9996,clear primary water
61188,file_9997,clear primary road water
61189,file_9998,cloudy


In [None]:
# Submit submission.csv to the Planet competition with the Kaggle CLI.
# -c: competition, -f: file to upload, -m: message attached to the submission
# ("Message" is a placeholder text).

!kaggle competitions submit -c planet-understanding-the-amazon-from-space -f submission.csv -m "Message"

100% 2.26M/2.26M [00:01<00:00, 1.75MB/s]
Successfully submitted to Planet: Understanding the Amazon from Space