In [12]:
import glob
import os
import shutil

import cv2
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

In [39]:
# Load the per-range metadata table (one row per contiguous run of images:
# dataset split, class, camera, driver, first and last file name).
metadata_csv = '3244 data new.csv'
df = pd.read_csv(metadata_csv)
df

Unnamed: 0,Dataset,Class,Camera,Direction facing,Sex,Driver,Number of pics,Start file,End file
0,unseen,c0,1,Right,M,1,104,000.jpg,00117.jpg
1,unseen,c0,2,Left,F,5,40,10794.jpg,109694.jpg
2,unseen,c1,1,Right,M,1,95,016.jpg,0361.jpg
3,unseen,c1,2,Left,F,5,80,117967.jpg,1393310.jpg
4,unseen,c2,1,Right,M,1,89,024.jpg,0440.jpg
...,...,...,...,...,...,...,...,...,...
447,train,c7,2,Left,M,38,20,1988061.jpg,1988613.jpg
448,train,c7,2,Left,F,39,20,19016032.jpg,19017339.jpg
449,train,c7,2,Left,M,40,20,19107934.jpg,19108682.jpg
450,train,c7,2,Left,F,43,20,19118071.jpg,19118933.jpg


In [8]:
# Names of the eight class folders: 'c0' .. 'c7'
classes = ['c%d' % idx for idx in range(8)]

In [26]:
# NOTE: the file order in the CSV above might not be the order returned by glob.glob.
# Plain string sorting doesn't work either, because 09999.jpg and 091000.jpg might
# belong to the same driver, but 091000.jpg would be sorted much earlier.
#
# Sort with the key function below to get the files back in the same order.
# We could use this info to do very specific forms of sampling, e.g. sampling by driver.

def func(x):
    """Return a fixed-width sort key for the image path `x`.

    The file name (last path component) is cut to at most 15 characters and
    left-padded with '0' to exactly 15 characters, so names of different
    lengths compare in numeric order ('09999.jpg' sorts before '091000.jpg').
    """
    # os.path.basename rather than split("/"): the key must be the file name
    # even when the path uses the platform's own separator.
    return f'{os.path.basename(x):>015.15s}'

# Checking that it works
filepaths = sorted(glob.glob(os.path.join('Distracted Driver Dataset','Combined','train','c7','*')), key=func)
filepaths[:10]

['Distracted Driver Dataset/Combined/train/c7/09188.jpg',
 'Distracted Driver Dataset/Combined/train/c7/09189.jpg',
 'Distracted Driver Dataset/Combined/train/c7/09190.jpg',
 'Distracted Driver Dataset/Combined/train/c7/09191.jpg',
 'Distracted Driver Dataset/Combined/train/c7/09192.jpg',
 'Distracted Driver Dataset/Combined/train/c7/09193.jpg',
 'Distracted Driver Dataset/Combined/train/c7/09194.jpg',
 'Distracted Driver Dataset/Combined/train/c7/09195.jpg',
 'Distracted Driver Dataset/Combined/train/c7/09196.jpg',
 'Distracted Driver Dataset/Combined/train/c7/09197.jpg']

## Creating folders

In [27]:
# Build the empty output tree 'Combined New/<split>/<class>' for the three
# splits and every class in `classes`.
stem = os.path.join('Distracted Driver Dataset','Combined New')

# exist_ok=True makes this cell safe to re-run: os.mkdir raised
# FileExistsError as soon as any of these folders was already present.
os.makedirs(stem, exist_ok=True)

for folder in ['train','test','unseen']:
    os.makedirs(os.path.join(stem, folder), exist_ok=True)

    for cls in classes:
        os.makedirs(os.path.join(stem, folder, cls), exist_ok=True)

## Creating unseen dataset
Drivers 1 and 5

In [46]:
# Hold out every image of the unseen drivers.
#
# Each CSV row of such a driver describes one contiguous run of files
# ('Start file' .. 'End file', inclusive, in `func` order) of one class.
# Every file in the run is copied to 'Combined New/unseen/<class>' and the
# original is parked in 'Combined/test/<class>/unseen'.
# NOTE(review): parking the originals in a sub-folder presumably keeps them out
# of the later '<split>/<class>/*.jpg' glob used for the train/test split - confirm.
unknown_drivers = [1,5]
stem = os.path.join('Distracted Driver Dataset','Combined','test')
unseen_out = os.path.join('Distracted Driver Dataset', 'Combined New', 'unseen')

for driver in unknown_drivers:
    curr_driver = df[df['Driver'] == driver]

    for _, row in curr_driver.iterrows():
        cls = row['Class']

        # Holding folder for the originals, and the copy target for this class.
        dest = os.path.join(stem, cls, 'unseen')
        os.makedirs(dest, exist_ok=True)
        os.makedirs(os.path.join(unseen_out, cls), exist_ok=True)

        # Search the class folder AND the holding folder. The previous version
        # globbed only the holding folder (i.e. the move destination), so on a
        # fresh dataset it found nothing, and on a re-run shutil.move failed
        # because the destination already existed.
        filepaths = sorted(
            glob.glob(os.path.join(stem, cls, '*.jpg')) + glob.glob(os.path.join(dest, '*.jpg')),
            key=func,
        )

        # Collect the inclusive run Start file .. End file.
        arr = []
        is_interest = False

        for filepath in filepaths:
            file = os.path.basename(filepath)

            if file == row['Start file']:
                is_interest = True

            if is_interest:
                arr.append(filepath)

            if file == row['End file']:
                is_interest = False

        for filepath in arr:
            # Copy first, while the source path is still valid; copying after
            # the move would look for a file that is no longer there.
            shutil.copy(filepath, os.path.join(unseen_out, cls))

            # Files already parked by an earlier run stay where they are.
            if os.path.dirname(filepath) != dest:
                shutil.move(filepath, dest)

In [55]:
# Sanity check: how many images ended up under 'Combined New/unseen'?
unseen_pattern = os.path.join('Distracted Driver Dataset','Combined New','unseen','c*','*')
files = glob.glob(unseen_pattern)
files.sort(key=func)  # same ordering key as used elsewhere in the notebook
len(files)

839

In [48]:
# Flipping Camera 2 files
#
# Mirror every Camera 2 image in `files` horizontally, overwriting it in place.
# Camera 2 images are recognised by a file name starting with '1'.
# NOTE: this is not idempotent - running the cell a second time flips the
# images back to their original orientation.

for file in files:
    if not os.path.basename(file).startswith('1'):
        continue

    img = cv2.imread(file)
    if img is None:
        # cv2.imread reports an unreadable file by returning None instead of
        # raising; passing that on to cv2.flip would fail with an opaque error.
        print(f'Skipping unreadable image: {file}')
        continue

    cv2.imwrite(file, cv2.flip(img, 1))  # flipCode 1 = horizontal mirror

## Splitting the rest of the data

In [548]:
# Split the remaining images of every class 75/25 into train and test and copy
# them into 'Combined New/<split>/<class>'. The glob covers every sub-folder of
# 'Combined' but only '*.jpg' files directly inside the class folder.
for cls in classes:
    # Sorted because glob returns files in arbitrary order; without a fixed
    # input order the seeded split would not be reproducible.
    # (The original line was also missing its closing parenthesis.)
    files = sorted(glob.glob(os.path.join('Distracted Driver Dataset','Combined','*',cls,'*.jpg')))
    train, test = train_test_split(files, test_size=.25, random_state=3244)

    # Copy each subset directly instead of testing `file in train` for every
    # file, which was a linear list search per file.
    for subset, members in (('train', train), ('test', test)):
        dest = os.path.join('Distracted Driver Dataset','Combined New', subset, cls)
        for file in members:
            shutil.copy(file, dest)

In [51]:
# Sanity check: how many .jpg files are now in the 't*' folders
# (train and test) of 'Combined New'?
split_pattern = os.path.join('Distracted Driver Dataset','Combined New','t*','*','*.jpg')
files = glob.glob(split_pattern)
len(files)

13639

In [567]:
# Flipping Camera 2 images
#
# Mirror every Camera 2 image in `files` horizontally, overwriting it in place.
# Images belonging to camera 2 have a file name starting with '1'.
# NOTE: this is not idempotent - running the cell a second time flips the
# images back to their original orientation.

for file in files:
    if not os.path.basename(file).startswith('1'):
        continue

    img = cv2.imread(file)
    if img is None:
        # cv2.imread reports an unreadable file by returning None instead of
        # raising; passing that on to cv2.flip would fail with an opaque error.
        print(f'Skipping unreadable image: {file}')
        continue

    cv2.imwrite(file, cv2.flip(img, 1))  # flipCode 1 = horizontal mirror

## Checking that dataset has been correctly split

In [54]:
# Print the number of files in every '<split>/<class>' folder of the new
# dataset (folders matching 't*'), followed by the grand total.
split_dirs = glob.glob(os.path.join('Distracted Driver Dataset','Combined New','t*'))
split_dirs.sort()

file_counts = []

for folder in split_dirs:
    for cls in classes:
        n_files = len(glob.glob(os.path.join(folder, cls, '*')))
        file_counts.append(n_files)
        print(f'{folder}/{cls}: {n_files}')

print(sum(file_counts))

Distracted Driver Dataset/Combined New2/test/c0: 711
Distracted Driver Dataset/Combined New2/test/c1: 667
Distracted Driver Dataset/Combined New2/test/c2: 602
Distracted Driver Dataset/Combined New2/test/c3: 264
Distracted Driver Dataset/Combined New2/test/c4: 255
Distracted Driver Dataset/Combined New2/test/c5: 243
Distracted Driver Dataset/Combined New2/test/c6: 242
Distracted Driver Dataset/Combined New2/test/c7: 430
Distracted Driver Dataset/Combined New2/train/c0: 2131
Distracted Driver Dataset/Combined New2/train/c1: 2000
Distracted Driver Dataset/Combined New2/train/c2: 1805
Distracted Driver Dataset/Combined New2/train/c3: 791
Distracted Driver Dataset/Combined New2/train/c4: 762
Distracted Driver Dataset/Combined New2/train/c5: 726
Distracted Driver Dataset/Combined New2/train/c6: 723
Distracted Driver Dataset/Combined New2/train/c7: 1287
13639
