In [26]:
import rasterio
from rasterio.plot import show
import matplotlib.pyplot as plt
import numpy as np
import glob
import os
import pandas as pd
import seaborn as sns
import matplotlib
from sklearn.model_selection import cross_val_score

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [27]:
tqdm.pandas()

In [8]:
# Function for reading data from Google Drive folders and returning a df
def read_data(folders): 
    X = []
    for folder in folders:
        files_s = glob.glob(f"../raw_data/{folder}/*all*.tif")
        for file in files_s:
            filename = os.path.basename(f"../raw_data/{folder}/{file}")
            file_id, date = filename.split('_all_bands_')
            date = date.replace('.tif','')
            d = {
                'date': date,
                'id': file_id,
                'filename': filename,
                'label': folder
            }
            X.append(d)
    return pd.DataFrame(X)



In [14]:
df = read_data(['rice_1y', 'sugarcane'])
df

Unnamed: 0,date,id,filename,label
0,2020-07-29,id_6094bb4df85c5e001b870910,id_6094bb4df85c5e001b870910_all_bands_2020-07-...,rice_1y
1,2020-12-11,id_60a09545f85c5e001b8714ba,id_60a09545f85c5e001b8714ba_all_bands_2020-12-...,rice_1y
2,2020-07-29,id_6094f7b0f85c5e001b870a58,id_6094f7b0f85c5e001b870a58_all_bands_2020-07-...,rice_1y
3,2021-05-05,id_6077a741f85c5e001b86fd52,id_6077a741f85c5e001b86fd52_all_bands_2021-05-...,rice_1y
4,2020-10-27,id_6094ba1df85c5e001b8708f8,id_6094ba1df85c5e001b8708f8_all_bands_2020-10-...,rice_1y
...,...,...,...,...
7776,2021-06-03,id_6054a1b558a7cd00480009e9,id_6054a1b558a7cd00480009e9_all_bands_2021-06-...,sugarcane
7777,2021-03-12,Name_5f2a4ef2868954001c94d1d5,Name_5f2a4ef2868954001c94d1d5_all_bands_2021-0...,sugarcane
7778,2021-01-26,id_6054a1b558a7cd00480009d7,id_6054a1b558a7cd00480009d7_all_bands_2021-01-...,sugarcane
7779,2021-06-23,id_6054a1b558a7cd00480009e3,id_6054a1b558a7cd00480009e3_all_bands_2021-06-...,sugarcane


In [20]:
# Function for opening and loading tif image files
def load_tif(file):
    img = rasterio.open(file).read()
    return img

In [28]:
df['new_filename'] = '../raw_data/' + df['label'] + '/' + df['filename']

In [23]:
df.new_filename

0       ../raw_data/rice_1y/id_6094bb4df85c5e001b87091...
1       ../raw_data/rice_1y/id_60a09545f85c5e001b8714b...
2       ../raw_data/rice_1y/id_6094f7b0f85c5e001b870a5...
3       ../raw_data/rice_1y/id_6077a741f85c5e001b86fd5...
4       ../raw_data/rice_1y/id_6094ba1df85c5e001b8708f...
                              ...                        
7776    ../raw_data/sugarcane/id_6054a1b558a7cd0048000...
7777    ../raw_data/sugarcane/Name_5f2a4ef2868954001c9...
7778    ../raw_data/sugarcane/id_6054a1b558a7cd0048000...
7779    ../raw_data/sugarcane/id_6054a1b558a7cd0048000...
7780    ../raw_data/sugarcane/id_6054a1b558a7cd0048000...
Name: new_filename, Length: 7781, dtype: object

In [29]:
df['bands'] = df['new_filename'].progress_apply(load_tif)

  0%|          | 0/7781 [00:00<?, ?it/s]

In [30]:
df

Unnamed: 0,date,id,filename,label,new_filename,bands
0,2020-07-29,id_6094bb4df85c5e001b870910,id_6094bb4df85c5e001b870910_all_bands_2020-07-...,rice_1y,../raw_data/rice_1y/id_6094bb4df85c5e001b87091...,"[[[938, 865, 865, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
1,2020-12-11,id_60a09545f85c5e001b8714ba,id_60a09545f85c5e001b8714ba_all_bands_2020-12-...,rice_1y,../raw_data/rice_1y/id_60a09545f85c5e001b8714b...,"[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
2,2020-07-29,id_6094f7b0f85c5e001b870a58,id_6094f7b0f85c5e001b870a58_all_bands_2020-07-...,rice_1y,../raw_data/rice_1y/id_6094f7b0f85c5e001b870a5...,"[[[0, 0, 0, 0, 0, 577, 907, 907, 0, 0, 0, 0], ..."
3,2021-05-05,id_6077a741f85c5e001b86fd52,id_6077a741f85c5e001b86fd52_all_bands_2021-05-...,rice_1y,../raw_data/rice_1y/id_6077a741f85c5e001b86fd5...,"[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 568, 568, ..."
4,2020-10-27,id_6094ba1df85c5e001b8708f8,id_6094ba1df85c5e001b8708f8_all_bands_2020-10-...,rice_1y,../raw_data/rice_1y/id_6094ba1df85c5e001b8708f...,"[[[0, 0, 0, 5249, 5249, 0, 0, 0, 0, 0, 0, 0, 0..."
...,...,...,...,...,...,...
7776,2021-06-03,id_6054a1b558a7cd00480009e9,id_6054a1b558a7cd00480009e9_all_bands_2021-06-...,sugarcane,../raw_data/sugarcane/id_6054a1b558a7cd0048000...,"[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
7777,2021-03-12,Name_5f2a4ef2868954001c94d1d5,Name_5f2a4ef2868954001c94d1d5_all_bands_2021-0...,sugarcane,../raw_data/sugarcane/Name_5f2a4ef2868954001c9...,"[[[0, 0, 0, 0, 0, 0, 1099, 1099, 1099, 1099, 1..."
7778,2021-01-26,id_6054a1b558a7cd00480009d7,id_6054a1b558a7cd00480009d7_all_bands_2021-01-...,sugarcane,../raw_data/sugarcane/id_6054a1b558a7cd0048000...,"[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1315, 1315, 1282..."
7779,2021-06-23,id_6054a1b558a7cd00480009e3,id_6054a1b558a7cd00480009e3_all_bands_2021-06-...,sugarcane,../raw_data/sugarcane/id_6054a1b558a7cd0048000...,"[[[0, 0, 1434, 1434, 1434, 1434, 0, 0, 0, 0, 0..."


In [33]:
# Formulas and Computations for Band Combinations and Bands 

combinations = {
    'ndvi': lambda band: (band[8]-band[4])/(band[8] + band[4]), #Compute NDVI
    'mi': lambda band: (band[8]-band[11])/(band[8] + band[11]), #Compute Moisture Index (mi)
    'bc1': lambda band: band[7] + band[6] + band[4], #Compute BC1 (SWIR-1)
    'bc2': lambda band: band[11] + band[8] + band[2], #Compute BC2 (Agriculture)
    'bc3': lambda band: (band[3]-band[8])/(band[3] + band[8]), #Compute BC3 (NWDI)
    'bc4': lambda band: band[12] + band[8] + band[4], #Compute BC4 (SWIR-2)
    'bc5': lambda band: band[4] + band[3] + band[2], #Compute BC5 (Natural Color)
    'bc6': lambda band: (band[8]-band[3])/(band[8] + band[3]), #Compute BC6 (GNDVI)
    'bc8': lambda band: (band[8] * (1 - band[4])*(band[8] - band[4]))**(1/3), #Compute BC8 (AVI)
    'bc9': lambda band: (band[8] - band[4]) / (band[8] + band[4] + 0.428) * (1.428), #Compute BC9 (SAVI)
    'bc10': lambda band: band[11] / band[8], #Compute BC10 (MSI)
    'bc11': lambda band: (band[9] / band[3])-1, #Compute BC11 (GCI)
    'bc12': lambda band: (band[8]-band[12])/(band[8] + band[12]), # Compute BC12 (NBRI)
    'bc13': lambda band: ((band[11] + band[4]) - (band[8] + band[2])) / ((band[11] + band[4]) + (band[8] + band[2])), #Compute BC13 (BSI)
    'bc14': lambda band: (band[3]-band[11])/(band[3] + band[11]), # Compute BC14 (NDSI)
    'bc15': lambda band: (band[3]-band[4])/(band[3] + band[4]) # Compute BC15 (NDGI)
    }
for i in range(13):
  combinations[f'b{i}'] = lambda band: band[i]

In [34]:
#Compute means, medians, and standard deviations for band combinations and bands
for combo, func in combinations.items():
    print(f'Computing features for {combo}')
    bb = df['bands'].apply(func)
    df[f'{combo}_mean'] = bb.apply(np.nanmean)
    df[f'{combo}_std'] = bb.apply(np.nanstd)
    df[f'{combo}_median'] = bb.apply(np.nanmedian)

Computing features for ndvi
Computing features for mi
Computing features for bc1
Computing features for bc2
Computing features for bc3
Computing features for bc4
Computing features for bc5
Computing features for bc6
Computing features for bc8
Computing features for bc9
Computing features for bc10
Computing features for bc11
Computing features for bc12
Computing features for bc13
Computing features for bc14
Computing features for bc15
Computing features for b0
Computing features for b1
Computing features for b2
Computing features for b3
Computing features for b4
Computing features for b5
Computing features for b6
Computing features for b7
Computing features for b8
Computing features for b9
Computing features for b10
Computing features for b11
Computing features for b12


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7781 entries, 0 to 7780
Data columns (total 93 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          7781 non-null   object 
 1   id            7781 non-null   object 
 2   filename      7781 non-null   object 
 3   label         7781 non-null   object 
 4   new_filename  7781 non-null   object 
 5   bands         7781 non-null   object 
 6   ndvi_mean     7781 non-null   float64
 7   ndvi_std      7781 non-null   float64
 8   ndvi_median   7781 non-null   float64
 9   mi_mean       7781 non-null   float64
 10  mi_std        7781 non-null   float64
 11  mi_median     7781 non-null   float64
 12  bc1_mean      7781 non-null   float64
 13  bc1_std       7781 non-null   float64
 14  bc1_median    7781 non-null   float64
 15  bc2_mean      7781 non-null   float64
 16  bc2_std       7781 non-null   float64
 17  bc2_median    7781 non-null   float64
 18  bc3_mean      7781 non-null 

In [36]:
df = df.rename(columns={'label': 'target'})