## Multi-class classification: preprocessing

In [1]:
### Here I preprocess the images, i.e., convert them into 3D arrays (height x width x channels),
### which the other file uses later on for the further preprocessing.
### I have around 5,000 images spread across 4 different categories.

In [None]:
### update : I had to change the class labels from class_labels = {'cloudy' : 1, 'desert' : 2, 'green_area' : 3, 'water' : 4}
### to class_labels = {'cloudy' : 0, 'desert' : 1, 'green_area' : 2, 'water' : 3}
### because the final model expects the labels to start from 0 during training

### loading the libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os
import tensorflow
from tensorflow.keras.preprocessing.image import load_img, img_to_array, array_to_img

### loading the images

In [2]:
# Input location and image size used when loading the raw images.
basedir = '../data/raw'
target_size = (224, 224)

# Accumulators: image arrays are collected in X, their integer labels in y.
X = []
y = []

# Integer label for each class folder. Labels are zero-based
# ('cloudy' -> 0 ... 'water' -> 3); they previously started at 1.
class_labels = {
    class_folder : label_id
    for label_id, class_folder in enumerate(['cloudy', 'desert', 'green_area', 'water'])
}

In [3]:
# Walk every class folder under `basedir`, load each image resized to
# `target_size`, divide the pixel values by 255.0, and collect the image
# arrays in `X` together with the matching integer labels in `y`.
# Directory listings are sorted so the load order (and therefore the seeded
# shuffle/split later on) does not depend on the filesystem's listing order.
for class_path in sorted(os.listdir(basedir)) :
    class_name = os.path.join(basedir, class_path)

    # Only directories hold images; skip stray files sitting in `basedir`
    # (the previous check tested `basedir` itself and never skipped anything).
    if not os.path.isdir(class_name) :
        continue

    # Resolve the label before touching any image, so a folder without a
    # known label can never add images to X without a matching entry in y.
    if class_path not in class_labels :
        print(f"skipping the unknown class folder : {class_path}")
        continue
    label = class_labels[class_path]

    print(f"loading the images : {class_path}")

    for image_path in sorted(os.listdir(class_name)) :
        actual_image_path = os.path.join(class_name, image_path)

        try :
            load_images = load_img(actual_image_path, target_size = target_size)
            load_images = img_to_array(load_images) / 255.0
        except Exception as e :
            # Best effort: report the unreadable file and move on.
            print(f"error loading the image : {actual_image_path} : {e}")
            continue

        # Append image and label together so X and y always stay aligned.
        X.append(load_images)
        y.append(label)

loading the images : cloudy
loading the images : desert
loading the images : green_area
loading the images : water


In [4]:
# Turn the accumulated Python lists into NumPy arrays (images and labels).
X, y = np.array(X), np.array(y)

In [5]:
# Sanity check: print the shapes of the image array and the label array.
print(f"{X.shape} : {y.shape}")

(5631, 224, 224, 3) : (5631,)


### verifying the labels in y

In [6]:
# Wrap the label array in a single-column DataFrame for inspection.
df = pd.DataFrame(y)

In [7]:
# Count how many samples carry each label value.
df.value_counts()

0
0    1500
2    1500
3    1500
1    1131
Name: count, dtype: int64

### shuffling the data

In [8]:
from sklearn.utils import shuffle

# Shuffle images and labels in one call so each image keeps its label;
# the seed is fixed at 42.
X, y = shuffle(
    X,
    y,
    random_state=42,
)

In [9]:
# Print the shapes again after shuffling; they should be unchanged.
print(f"{X.shape} : {y.shape}")

(5631, 224, 224, 3) : (5631,)


In [10]:
# Rebuild the label DataFrame from the current (shuffled) y.
df = pd.DataFrame(y)

In [11]:
# Peek at the first few labels to confirm the order is now mixed.
df.head()

Unnamed: 0,0
0,2
1,1
2,2
3,1
4,3


In [12]:
# Per-label counts again; they should match the counts before shuffling.
df.value_counts()

0
0    1500
2    1500
3    1500
1    1131
Name: count, dtype: int64

### splitting and saving the data

In [13]:
from sklearn.model_selection import train_test_split

# Split images and labels into train and test sets with a fixed seed (42).
# No test_size is passed, so train_test_split's default proportion applies.
# NOTE(review): the split is not stratified; since the class counts are
# uneven, consider passing stratify=y — confirm against the training setup.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [14]:
# Print the shapes of the train and test splits (images : labels).
print(f"{X_train.shape} : {y_train.shape} :: {X_test.shape} : {y_test.shape}")

(4223, 224, 224, 3) : (4223,) :: (1408, 224, 224, 3) : (1408,)


In [15]:
# Persist the train/test splits as .npy files.
# np.save returns None, so its result must NOT be assigned back to the array
# names: doing so would replace the in-memory splits with None and break any
# later cell (or a re-run of this one) that still needs them.
output_arrays = {
    '../data/preprocessed/train/X_train.npy' : X_train,
    '../data/preprocessed/train/y_train.npy' : y_train,
    '../data/preprocessed/test/X_test.npy' : X_test,
    '../data/preprocessed/test/y_test.npy' : y_test,
}

for save_path, array in output_arrays.items() :
    # Create the target folder on a fresh checkout instead of failing.
    os.makedirs(os.path.dirname(save_path), exist_ok = True)
    np.save(save_path, array)