## **Notebook Setup**

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import random # for random value
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting 
from PIL import Image # Image processing
import json  #json file I/O
#from mpl_toolkits.basemap import Basemap

In [5]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Activation
from keras.layers import Dropout
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.utils import np_utils
from keras.optimizers import SGD
import keras.callbacks

In [16]:
import tifffile as tiff
import time 

In [6]:
from pathlib import Path
from skimage.io import imread
from skimage.color import rgb2gray

In [19]:
# set up too see all contents
# reference: https://stackoverflow.com/questions/60013721/how-to-see-complete-rows-in-google-colab
pd.set_option('max_rows', 99999)
pd.set_option('max_colwidth', 400)
pd.describe_option('max_colwidth')

display.max_colwidth : int or None
    The maximum width in characters of a column in the repr of
    a pandas data structure. When the column overflows, a "..."
    placeholder is embedded in the output. A 'None' value means unlimited.
    [default: 50] [currently: 400]


## **Prepare Data**

Unzip EuroSAT Images (ONLY NEED TO RUN ONCE)

In [None]:
# Unzip the EuroSAT images (27000)
# !unzip '/content/drive/My Drive/MUSA650-Spring2021/MUSA650-Final-Project/EuroSAT.zip' -d '/content/drive/My Drive/MUSA650-Spring2021/MUSA650-Final-Project/EuroSATImg'

Import EuroSAT Images

In [None]:
BDIR = '/content/drive/My Drive/MUSA650-Spring2021/MUSA650-Final-Project/EuroSATImg'
ac = Path(BDIR+r"/2750/AnnualCrop")
df1 = pd.DataFrame({'path': list(ac.glob('**/*.jp*g'))})
df1

In [17]:
# Set Up Base Directory
BDIR = '/content/drive/My Drive/MUSA650-Spring2021/MUSA650-Final-Project/EuroSATImg'

print('[INFO] Loading EuroSAT Images ...')
start = time.time()

# Load Images From Directories of Each Category
ac = Path(BDIR + r"/2750/AnnualCrop")
df1 = pd.DataFrame({'path': list(ac.glob('**/*.jp*g'))})

fo = Path(BDIR + r"/2750/Forest")
df2 = pd.DataFrame({'path': list(fo.glob('**/*.jp*g'))})

hv = Path(BDIR + r"/2750/HerbaceousVegetation")
df3 = pd.DataFrame({'path': list(hv.glob('**/*.jp*g'))})

hw = Path(BDIR + r"/2750/Highway")
df4 = pd.DataFrame({'path': list(hw.glob('**/*.jp*g'))})

ind = Path(BDIR + r"/2750/Industrial")
df5 = pd.DataFrame({'path': list(ind.glob('**/*.jp*g'))})

pas = Path(BDIR + r"/2750/Pasture")
df6 = pd.DataFrame({'path': list(pas.glob('**/*.jp*g'))})

pc = Path(BDIR + r"/2750/PermanentCrop")
df7 = pd.DataFrame({'path': list(pc.glob('**/*.jp*g'))})

res = Path(BDIR + r"/2750/Residential")
df8 = pd.DataFrame({'path': list(res.glob('**/*.jp*g'))})

riv = Path(BDIR + r"/2750/River")
df9 = pd.DataFrame({'path': list(riv.glob('**/*.jp*g'))})

sl = Path(BDIR + r"/2750/SeaLake")
df10 = pd.DataFrame({'path': list(sl.glob('**/*.jp*g'))})

end = time.time()
print('[INFO] Loading EuroSAT Images took {} seconds'.format(end-start))

[INFO] Loading EuroSAT Images ...
[INFO] Loading EuroSAT Images took 0.6513419151306152 seconds


In [27]:
df1.shape

(3000, 1)

In [28]:
# Label Images

print('[INFO] Labeling EuroSAT Images ...')
start = time.time()

df1["category"] = "AnnualCrop"
df1["label"] = 0

df2["category"] = "Forest"
df2["label"] = 1

df3["category"] = "HerbaceousVegetation"
df3["label"] = 2

df4["category"] = "Highway"
df4["label"] = 3

df5["category"] = "Industrial"
df5["label"] = 4

df6["category"] = "Pasture"
df6["label"] = 5

df7["category"] = "PermanentCrop"
df7["label"] = 6

df8["category"] = "Residential"
df8["label"] = 7

df9["category"] = "River"
df9["label"] = 8

df10["category"] = "SeaLake"
df10["label"] = 9

end = time.time()
print('[INFO] Labeling EuroSAT Images took {} seconds'.format(end-start))

[INFO] Labeling EuroSAT Images ...
[INFO] Labeling EuroSAT Images took 0.009362220764160156 seconds


In [34]:
# Integrate the full dataset
EuroSAT = pd.DataFrame().append([df1, df2, df3, df4, df5, df6, df7, df8, df9, df10])
EuroSAT.head()

Unnamed: 0,path,category,label
0,/content/drive/My Drive/MUSA650-Spring2021/MUSA650-Final-Project/EuroSATImg/2750/AnnualCrop/AnnualCrop_2053.jpg,AnnualCrop,0
1,/content/drive/My Drive/MUSA650-Spring2021/MUSA650-Final-Project/EuroSATImg/2750/AnnualCrop/AnnualCrop_300.jpg,AnnualCrop,0
2,/content/drive/My Drive/MUSA650-Spring2021/MUSA650-Final-Project/EuroSATImg/2750/AnnualCrop/AnnualCrop_40.jpg,AnnualCrop,0
3,/content/drive/My Drive/MUSA650-Spring2021/MUSA650-Final-Project/EuroSATImg/2750/AnnualCrop/AnnualCrop_1146.jpg,AnnualCrop,0
4,/content/drive/My Drive/MUSA650-Spring2021/MUSA650-Final-Project/EuroSATImg/2750/AnnualCrop/AnnualCrop_2432.jpg,AnnualCrop,0


In [39]:
df_Is_Forest = EuroSAT
df_Is_Forest['CAT']=0
SEL_LABEL = 'Forest'
df_Is_Forest.loc[df_Is_Forest.category==SEL_LABEL, 'CAT'] = 1
print('[FACT] The EuroSAT data contains {} {} images and {} Non-{} images.'.format(np.sum(df_Is_Forest.CAT==1), SEL_LABEL, np.sum(df_Is_Forest.CAT==0), SEL_LABEL))

[FACT] The EuroSAT data contains 3000 Forest images and 24000 Non-Forest images.


In [43]:
df_Is_Forest.head()

Unnamed: 0,path,category,label,CAT
0,/content/drive/My Drive/MUSA650-Spring2021/MUSA650-Final-Project/EuroSATImg/2750/AnnualCrop/AnnualCrop_2053.jpg,AnnualCrop,0,0
1,/content/drive/My Drive/MUSA650-Spring2021/MUSA650-Final-Project/EuroSATImg/2750/AnnualCrop/AnnualCrop_300.jpg,AnnualCrop,0,0
2,/content/drive/My Drive/MUSA650-Spring2021/MUSA650-Final-Project/EuroSATImg/2750/AnnualCrop/AnnualCrop_40.jpg,AnnualCrop,0,0
3,/content/drive/My Drive/MUSA650-Spring2021/MUSA650-Final-Project/EuroSATImg/2750/AnnualCrop/AnnualCrop_1146.jpg,AnnualCrop,0,0
4,/content/drive/My Drive/MUSA650-Spring2021/MUSA650-Final-Project/EuroSATImg/2750/AnnualCrop/AnnualCrop_2432.jpg,AnnualCrop,0,0


In [58]:
y_train

array([2, 6, 5, ..., 0, 5, 8])

In [63]:
X_data = df_Is_Forest["path"]
y_data = df_Is_Forest["CAT"].values

# Split the data into training (60%) and testing sets (40%)
from sklearn.model_selection import train_test_split
X_train, X_test,y_train, y_test = train_test_split(X_data, y_data, train_size=0.6, random_state=42)
print('[FACT] The training set contains {} images and the testing set contains {} images.'.format(X_train.shape[0], y_test.shape[0]))
print('[FACT] Percentage of {} images in training set: {}.'.format(SEL_LABEL, np.sum(y_train==1)/y_train.shape[0]))
print('[FACT] Percentage of {} images in testing set: {}. '.format(SEL_LABEL, np.sum(y_test==1)/y_test.shape[0]))

[FACT] The training set contains 16200 images and the testing set contains 10800 images.
[FACT] Percentage of Forest images in training set: 0.11135802469135803.
[FACT] Percentage of Forest images in testing set: 0.11074074074074074. 


Convert Data to Matrix Form

In [66]:
# Read all images in the training dataset and convert the data into matrix
print('[INFO] Converting data to matrix form ...')
start = time.time()

# get the shape of image
img1 = imread(df1.path[0])
img1_gray = rgb2gray(img1)
img1_vec = img1_gray.flatten()

dvec = img1_vec.shape[0]

n = X_train.shape[0]

mat_1D_train = np.zeros([n, dvec])

# loop
for i, tmpPath in enumerate(X_train.values):
    img_rgb_All = imread(tmpPath)
    img_gray_All = rgb2gray(img_rgb_All)
    img_vec_All = img_gray_All.flatten()
    mat_1D_train[i, :] = img_vec_All

# Read all images in the testing dataset and convert the data into matrix

# get the shape of image
img1 = imread(df1.path[0])
img1_gray = rgb2gray(img1)
img1_vec = img1_gray.flatten()

dvec = img1_vec.shape[0]

n = X_test.shape[0]

mat_1D_test = np.zeros([n, dvec])

# loop
for i, tmpPath in enumerate(X_test.values):
    img_rgb_All = imread(tmpPath)
    img_gray_All = rgb2gray(img_rgb_All)
    img_vec_All = img_gray_All.flatten()
    mat_1D_test[i, :] = img_vec_All

end = time.time()
print('[INFO] Converting data to matrix form took {} seconds'.format(end-start))

[INFO] Converting data to matrix form ...
[INFO] Converting data to matrix form took 63.05270791053772 seconds


In [68]:
mat_1D_train.shape

(16200, 4096)

In [67]:
mat_1D_test.shape

(10800, 4096)

SVC

In [73]:
from sklearn.svm import SVC
svc=SVC()

print('[INFO] SVC Training ...')
start = time.time()

# train the model
svc.fit(mat_1D_train, y_train)

end = time.time()
print('[INFO] SVC Training took {} seconds'.format(end-start))

[INFO] SVC Training ...
[INFO] SVC Training took 308.3952388763428 seconds


In [75]:
# testing accuracy

print('[INFO] SVC Testing ...')
start = time.time()

y_pred_test = svc.predict(mat_1D_test)

from sklearn.metrics import accuracy_score
print ('Testing accuracy =', accuracy_score(y_test, y_pred_test))

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_test))

end = time.time()
print('[INFO] SVC Testing took {} seconds'.format(end-start))

[INFO] SVC Testing ...
Testing accuracy = 0.9407407407407408
              precision    recall  f1-score   support

           0       0.98      0.95      0.97      9604
           1       0.69      0.84      0.76      1196

    accuracy                           0.94     10800
   macro avg       0.84      0.90      0.86     10800
weighted avg       0.95      0.94      0.94     10800

[INFO] SVC Testing took 190.18878626823425 seconds


CNN

In [88]:
# read images without vectorize them
num = X_train.shape[0]
dim_train = np.zeros([num, 64, 64, 3])

# loop
for i, tmpPath in enumerate(X_train.values):
    img_rgb_train = imread(tmpPath)
    dim_train[i, :, :, :] = img_rgb_train

In [89]:
dim_train.shape

(16200, 64, 64, 3)

In [90]:
#read images without vectorize them
num = X_test.shape[0]
dim_test = np.zeros([num, 64, 64, 3])

#loop
for i, tmpPath in enumerate(X_test.values):
    img_rgb_train = imread(tmpPath)
    dim_test[i, :, :, :] = img_rgb_train

In [91]:
dim_test.shape

(10800, 64, 64, 3)

In [92]:
#change labels to categorical
from keras.utils import to_categorical

y_train_labels = to_categorical(y_train)
y_test_labels = to_categorical(y_test)

In [93]:
y_train_labels

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [97]:
#Simple CNN

model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3),
                 activation='relu',
                 input_shape=(64, 64, 3)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))

print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_2 (Conv2D)            (None, 62, 62, 32)        896       
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 60, 60, 64)        18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 30, 30, 64)        0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 30, 30, 64)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 57600)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               7372928   
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)              

In [98]:
print('[INFO] Model Compiling ...')
start = time.time()

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

end = time.time()
print('[INFO] Model Compiling took {} seconds'.format(end-start))

[INFO] Model Compiling ...
[INFO] Model Compiling took 0.009467840194702148 seconds


In [99]:
print('[INFO] Model Training ...')
start = time.time()

model.fit(dim_train, y_train_labels,
          batch_size=128,
          epochs=12,
          verbose=1,
          validation_data=(dim_test, y_test_labels))

end = time.time()
print('[INFO] Model Training took {} seconds'.format(end-start))

[INFO] Model Training ...
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
[INFO] Model Training took 2427.4599165916443 seconds


In [106]:
# Save the entire model as a SavedModel.
!pip install pyyaml h5py  # Required to save models in HDF5 format

model.save(r'/content/drive/My Drive/MUSA650-Spring2021/MUSA650-Final-Project/saved_model/EuroSAT_cnn_1') 

INFO:tensorflow:Assets written to: /content/drive/My Drive/MUSA650-Spring2021/MUSA650-Final-Project/saved_model/EuroSAT_cnn_1/assets
