In [1]:
import os
import rasterio as rio
import tifffile
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torchvision.transforms as transforms
from torch.utils.data import Dataset, random_split, DataLoader
from PIL import Image
import torchvision.models as models
import torchvision.transforms as T
from sklearn.metrics import f1_score
import torch.nn.functional as F
import torch.nn as nn
from pathlib import Path
from torchvision.utils import make_grid
import time
import copy
import re
import glob
import csv
from tqdm import tqdm
%matplotlib widget

## Get available data, more coming...

In [2]:
# Read filelist/cbna_filtered.csv into `df`. low_memory=False makes pandas
# parse the file in one pass instead of in chunks, so each column gets a
# single inferred dtype.
filtered_csv_path = 'filelist/cbna_filtered.csv'
df = pd.read_csv(
    filtered_csv_path,
    sep=',',
    low_memory=False,
)
# (rows, columns) of the loaded table
df.shape

(3477371, 10)

In [None]:
# Collect every .tif patch path. glob.glob returns matches in arbitrary,
# filesystem-dependent order; positions in `files` are used as image indices
# further down, so sort the list to make those indices reproducible.
files = sorted(glob.glob('Data/irc_patches/*.tif'))

In [4]:
# Number of paths collected in `files`.
len(files)

162133

In [3]:
# Scan the image list in full batches and record the positions (indices into
# `files`) of patches in which at least `blank_value_count` values are
# saturated. Only complete batches are visited here: the trailing
# len(files) % batch_size images are not covered by this loop.
batch_size = 256
start_batch_idx = 0
end_batch_idx = len(files) // batch_size

# Number of saturated values a patch must contain to be flagged
# (presumably 256 x 256 pixels x 3 bands, i.e. the whole patch -- TODO confirm
# against the actual patch size).
blank_value_count = 256 * 256 * 3

# The check is a plain comparison + sum, so run it on the CPU when no GPU is
# available instead of failing on `.cuda()`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

ToTensor = transforms.ToTensor()
invalid_imgs = []

for batch_idx in tqdm(range(start_batch_idx, end_batch_idx)):
    batch_imgs = []
    idx_of_batch = batch_idx * batch_size
    for idx in range(idx_of_batch, idx_of_batch + batch_size):
        img_path = files[idx]
        image = tifffile.imread(img_path)
        # Cast to uint8 and move axis 0 to the end before handing the array to PIL.
        image = Image.fromarray(image.astype(np.uint8).transpose(1, 2, 0))
        # ToTensor rescales uint8 data to [0, 1]; unsqueeze adds the batch axis.
        image = ToTensor(image).unsqueeze(0)
        batch_imgs.append(image)
    batch_imgs = torch.cat(batch_imgs).to(device)

    # A value of exactly 1.0 after ToTensor corresponds to uint8 255.
    temp = (batch_imgs == 1.0).sum([1, 2, 3]) >= blank_value_count
    # `temp` is already boolean; offset the in-batch positions to global ones.
    invalid_list = idx_of_batch + torch.where(temp)[0]
    invalid_imgs += invalid_list.detach().cpu().tolist()
    

100%|█████████████████████████████████████████| 633/633 [52:48<00:00,  5.01s/it]


In [11]:
# Scan the images that come after the last full batch, i.e. positions
# end_batch_idx * batch_size .. len(files) - 1 of `files`.
# The start offset is computed from the batching constants rather than from
# loop variables left behind by the batch loop, so this cell also works when
# that loop performed zero iterations.
idx_of_batch = end_batch_idx * batch_size
batch_imgs = []

for j in tqdm(range(idx_of_batch, len(files))):
    img_path = files[j]
    image = tifffile.imread(img_path)
    # Cast to uint8 and move axis 0 to the end before handing the array to PIL.
    image = Image.fromarray(image.astype(np.uint8).transpose(1, 2, 0))
    image = ToTensor(image).unsqueeze(0)
    batch_imgs.append(image)

# torch.cat raises on an empty list, so skip the check when nothing is left
# over (len(files) is an exact multiple of batch_size).
if batch_imgs:
    # Fall back to the CPU when no GPU is available.
    remainder_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    batch_imgs = torch.cat(batch_imgs).to(remainder_device)
    # Same criterion as for the full batches: at least 256 * 256 * 3 values
    # equal to 1.0 (uint8 255 before ToTensor rescaling).
    temp = (batch_imgs == 1.0).sum([1, 2, 3]) >= 256 * 256 * 3
    invalid_list = idx_of_batch + torch.where(temp)[0]
    invalid_imgs += invalid_list.detach().cpu().tolist()

100%|███████████████████████████████████████████| 85/85 [00:01<00:00, 49.80it/s]


In [4]:
# Translate each flagged position in `files` into the integer formed by the
# first run of digits in the file's base name. The directory part is stripped
# first so that a digit appearing in a folder name can never be picked up as
# the id. (`re` and `os` are imported in the first cell.)
invalid_id_imgs = [
    int(re.findall(r'\d+', os.path.basename(files[i]))[0])
    for i in invalid_imgs
]

NameError: name 'files' is not defined

In [3]:
# Cache for the list of invalid image ids. When the file already exists it is
# loaded (as before); otherwise the list computed in this session is written
# out, so the long-running scan does not have to be repeated and no line has
# to be commented in or out by hand.
invalid_list_path = 'filelist/invalid_img_list.pth'
if os.path.exists(invalid_list_path):
    # NOTE: torch.load unpickles the file -- only load files you created yourself.
    invalid_id_imgs = torch.load(invalid_list_path)
else:
    torch.save(invalid_id_imgs, invalid_list_path)

In [5]:
# Keep only the rows of `df` whose id_img is not in the invalid-image list.
is_invalid_img = df['id_img'].isin(invalid_id_imgs)
res_df = df.loc[~is_invalid_img]

In [6]:
# (rows, columns) remaining after the invalid images were removed.
res_df.shape

(3437469, 10)

In [7]:
# Codes listed in filelist/vascular_plants.txt (parsed as numbers by genfromtxt).
valid_species = list(np.genfromtxt('filelist/vascular_plants.txt'))
# Restrict the table to rows whose cd_ref appears in that list.
has_valid_species = res_df['cd_ref'].isin(valid_species)
df_valid_species = res_df.loc[has_valid_species]
#df_valid_species.to_csv('filelist/cbna_valid_species.csv', sep=',', encoding='utf-8', index=False)

In [8]:
# Number of distinct cd_ref values left after the species filter.
distinct_cd_ref = np.unique(df_valid_species['cd_ref'].values)
distinct_cd_ref.size

2522

In [9]:
# For every row, count how many rows share its cd_ref, and store that count in
# a new `label_cnt` column.
rows_by_species = df_valid_species.groupby('cd_ref')
cnt = rows_by_species['cd_ref'].transform('count')
df_valid_species = df_valid_species.assign(label_cnt=cnt)

In [10]:
# Smallest per-cd_ref row count present in the table.
label_counts = df_valid_species['label_cnt'].values
np.min(label_counts)

30

In [11]:
# Build, for every (id_img, id_releve) group, one comma-separated string of
# the group's cd_ref values; transform broadcasts that string back onto each
# row of the group so it can be attached as a `labelset` column.
label_columns = ['id_img', 'id_releve', 'cd_ref', 'x_l93', 'y_l93']
cd_ref_by_releve = (
    df_valid_species[label_columns]
    .groupby(['id_img', 'id_releve'])['cd_ref']
)
labelset = cd_ref_by_releve.transform(lambda x: ','.join(x.astype(str)))
tailed_df = df_valid_species.assign(labelset=labelset)

In [12]:
# Collapse the table to one row per distinct
# (id_img, id_releve, labelset, x_l93, y_l93) combination and renumber the
# index from 0.
releve_columns = ['id_img', 'id_releve', 'labelset', 'x_l93', 'y_l93']
res = (
    tailed_df[releve_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)
res.shape

(158292, 5)

In [14]:
# Read filelist/cbna_w_covariates.csv; low_memory=False parses the file in a
# single pass so each column gets one inferred dtype.
covariates_csv_path = 'filelist/cbna_w_covariates.csv'
cbna_w_covariates = pd.read_csv(
    covariates_csv_path,
    sep=',',
    low_memory=False,
)

In [15]:
# (rows, columns) of the covariates table.
cbna_w_covariates.shape

(160296, 35)

In [17]:
# Column used as the join key when the covariates are merged in.
pivot_feature = ['id_releve']

# Covariate columns to carry over from the covariates table.
meta_features = [
    'LANDOLT_MOIST',
    'N_prct',
    'pH',
    'CN',
    'TMeanY',
    'TSeason',
    'PTotY',
    'PSeason',
    'RTotY',
    'RSeason',
    'AMPL',
    'LENGTH',
    'eauvive',
    'clay',
    'silt',
    'sand',
    'cv_alti',
]

In [21]:
# Attach the selected covariate columns to every row of `res`, matching on
# id_releve.
selected_covariates = cbna_w_covariates[pivot_feature + meta_features]
cbna_final_w_covariates = res.merge(selected_covariates, how='inner', on='id_releve')

# An inner join silently drops rows of `res` that have no match and repeats
# rows whose id_releve occurs more than once in the covariates table. Fail
# loudly if the row count changed instead of carrying a corrupted table forward.
assert len(cbna_final_w_covariates) == len(res), (
    f"merge changed the row count: {len(res)} -> {len(cbna_final_w_covariates)}"
)

In [25]:
# (rows, columns) of the merged table.
cbna_final_w_covariates.shape

(158292, 22)

In [None]:
#cbna_final_w_covariates.to_csv('filelist/cbna_final_w_covariates.csv', sep=',', encoding='utf-8', index=False)