# Dataset creation from shared raw data

In [None]:
import datetime
import glob
import hashlib
import os
import subprocess
import urllib

import cv2
import matplotlib.pyplot as plt
import plotly.express as px

import numpy as np
import pandas as pd

import torch
import torchvision
from torchsummary import summary
from torchviz import make_dot
from torch.utils.data import Dataset, DataLoader

import albumentations as A

from PIL import Image

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchvision import datasets, transforms
import torchvision.models as models

from torch.utils.tensorboard import SummaryWriter
from torch.autograd import Variable
from torch.optim.lr_scheduler import StepLR, CyclicLR, ReduceLROnPlateau

from tqdm import tqdm

### Load downloaded data from shared drive

In [None]:
!mkdir train
!mkdir ds
!unzip -qq "/content/drive/My Drive/drone_ds/drive-download-20200723T120429Z-003.zip" -d "./ds"
!unzip -qq "/content/drive/My Drive/drone_ds/drive-download-20200723T120429Z-002.zip" -d "./ds"
!unzip -qq "/content/drive/My Drive/drone_ds/drive-download-20200723T120429Z-001.zip" -d "./ds"

### Cleaning the dataset


In [None]:
# Every file that sits exactly one folder below ds/ and has a dot in its name.
paths = glob.glob('ds/*/*.*')
# The folder directly under ds/ is used as the class label.
classes = list(map(lambda p: p.split('/')[1], paths))
# Size on disk in bytes, as reported by the filesystem.
sizes = list(map(os.path.getsize, paths))

# One row per file: location, class label, byte size.
df = pd.DataFrame(dict(paths=paths, classes=classes, sizes=sizes))
df.head()

Unnamed: 0,paths,classes,sizes
0,ds/Large QuadCopters/pexels-photo-343238.jpeg,Large QuadCopters,143230
1,ds/Large QuadCopters/istockphoto-1147862051-61...,Large QuadCopters,12421
2,ds/Large QuadCopters/Capture27.JPG,Large QuadCopters,87456
3,ds/Large QuadCopters/5655a787b0988e9ba58f7dbaf...,Large QuadCopters,31953
4,ds/Large QuadCopters/agriculture-drone-500x500...,Large QuadCopters,27890


In [None]:
def _content_digest(path, chunk_size=1 << 20):
  """Return the SHA-256 hex digest of the file at `path`, read in chunks."""
  digest = hashlib.sha256()
  with open(path, 'rb') as fh:
    for chunk in iter(lambda: fh.read(chunk_size), b''):
      digest.update(chunk)
  return digest.hexdigest()


# Drop duplicate files.
# Equal byte size alone is not proof of duplication: different images can share
# a size. Size is therefore only a cheap pre-filter, and a content hash is
# computed just for the files whose size collides with another file.
same_size = df.duplicated(subset=['sizes'], keep=False)
digests = [
  _content_digest(path) if collides else ''
  for path, collides in zip(df['paths'], same_size)
]
# A row is a duplicate only when both size and content match an earlier row;
# the first occurrence is kept.
is_duplicate = df.assign(_digest=digests).duplicated(subset=['sizes', '_digest'])
df = df.loc[~is_duplicate].reset_index(drop=True)
# Get extensions
df['extensions'] = df['paths'].apply(lambda path: os.path.splitext(path)[1])

In [None]:
# Bar chart counting how many files carry each extension.
fig = px.histogram(df, x='extensions', width=800, height=400).update_layout(
    title='Image type distribution'
)
fig.show()

### Create new directory and copy required files

In [None]:
# Re-encode every source file as an RGB JPEG under train/ and record, per row
# of df, the output path and the image dimensions. Files that cannot be read as
# an image (e.g. stray .txt files) get None in all three lists, so the lists
# always stay aligned with df.
os.makedirs('train', exist_ok=True)  # do not rely on an earlier shell cell

train_paths = []
width = []
height = []

# Wrapping the Series (not the enumerate object) lets tqdm show the total.
for idx, path in enumerate(tqdm(df['paths'])):
  train_path = f'train/img_{idx:06d}.jpg'
  try:
    # The context manager closes the source file handle; convert() returns an
    # independent in-memory copy that stays usable afterwards.
    with Image.open(path) as src:
      img = src.convert('RGB')
    img.save(train_path, 'JPEG', quality=80, optimize=True, progressive=True)
    w, h = img.size
  except Exception as ex:
    # Best effort: report the file and the cause, then continue with the rest.
    print(f'{path}: {ex!r}')
    train_path, w, h = None, None, None
  # Single append point so a failure can never leave the lists misaligned.
  train_paths.append(train_path)
  width.append(w)
  height.append(h)

307it [00:15, 30.96it/s]

ds/Large QuadCopters/pic_322.txt


576it [00:33, 17.54it/s]

ds/Large QuadCopters/pic_323.txt



Palette images with Transparency expressed in bytes should be converted to RGBA images


Corrupt EXIF data.  Expecting to read 4 bytes but only got 0. 

15852it [19:58, 13.22it/s]


In [None]:
# Attach the per-file conversion results (output path, width, height) to df.
df['train_paths'], df['w'], df['h'] = train_paths, width, height

In [None]:
# Scatter of the recorded image dimensions, one point per processed file.
fig = (
    px.scatter(x=width, y=height, width=800, height=800)
    .update_layout(
        title='Image size distribution',
        xaxis_title='width',
        yaxis_title='height',
    )
)
fig.show()

### Create dataset information file

In [None]:
# Minimum width/height (in pixels) an image must exceed to be flagged as valid.
min_w = 100
min_h = 100

# Files that failed conversion have no recorded size. Comparing a column with
# `== None` never matches those rows, and assigning through a row mask would
# overwrite every column of the row, so only the two size columns are filled.
for dim in ('w', 'h'):
  df[dim] = pd.to_numeric(df[dim], errors='coerce').fillna(0)

# Rows with size 0 (failed conversion) fall below the threshold and are invalid.
df['valid'] = (df['w'] > min_w) & (df['h'] > min_h)

# Persist the index of the dataset: one row per source file.
df.to_csv('dataset_info.csv', index=False)

### Push dataset back to drive

In [None]:
# Move dataset_info.csv from the working directory into the drone_ds folder
# under /content/drive/My Drive (the local copy is gone afterwards).
!mv dataset_info.csv '/content/drive/My Drive/drone_ds/'

In [None]:
# Archive the train/ directory recursively (-r) and quietly (-q) as dataset.zip,
# move the archive into the drone_ds folder under /content/drive/My Drive,
# then stat it there to show its size and timestamps.
!zip -q -r dataset.zip train
!mv dataset.zip '/content/drive/My Drive/drone_ds/'
!stat '/content/drive/My Drive/drone_ds/dataset.zip'

  File: /content/drive/My Drive/drone_ds/dataset.zip
  Size: 1042351689	Blocks: 2035844    IO Block: 65536  regular file
Device: 53h/83d	Inode: 1097        Links: 1
Access: (0600/-rw-------)  Uid: (    0/    root)   Gid: (    0/    root)
Access: 2020-07-24 06:13:32.000000000 +0000
Modify: 2020-07-24 06:14:20.000000000 +0000
Change: 2020-07-24 06:14:21.000000000 +0000
 Birth: -
