In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import os
from IPython.core.display import display

# first lets look at the data
dataFolder = 'data_cleaned'
# Keep only sub-directories: every entry is later opened with os.listdir(), so
# a stray file (e.g. .DS_Store / Thumbs.db) would make that call raise.
# Sorting makes the row order independent of the filesystem's listing order.
wordLists = sorted(
    entry for entry in os.listdir(dataFolder)
    if os.path.isdir(os.path.join(dataFolder, entry))
)

#populate the data frame with words
ASL_df = pd.DataFrame(wordLists, columns=['Word'])
#populate with full paths
ASL_df['Full Path'] = ASL_df['Word'].apply(lambda x: os.path.join(dataFolder, x))
#populate with # samples (number of entries inside each word's folder)
ASL_df['# samples'] = ASL_df['Full Path'].apply(lambda x: len(os.listdir(x)))

#sort them so the words with the most samples come first
ASL_df = ASL_df.sort_values(by=['# samples'], ascending=False)

print('Words with the most # of examples')
display(ASL_df.head(10))

Words with the most # of examples


Unnamed: 0,Word,Full Path,# samples
46,ART-DESIGN,data_cleaned\ART-DESIGN,32
185,CUTE,data_cleaned\CUTE,30
15,ADVISE-INFLUENCE,data_cleaned\ADVISE-INFLUENCE,24
261,PAST,data_cleaned\PAST,24
207,GUITAR,data_cleaned\GUITAR,24
205,GOVERNMENT,data_cleaned\GOVERNMENT,24
16,AFRAID,data_cleaned\AFRAID,24
95,BIG,data_cleaned\BIG,24
37,ANSWER,data_cleaned\ANSWER,22
18,AGAIN,data_cleaned\AGAIN,22


In [2]:
import cv2

def ImagesFromVideoFile(videoName,desDirectory,wordsampleIndex_int):
    """Save every frame of a video as a 224x224 PNG image.

    Frames are read one after another with ``cv2.VideoCapture``, resized to
    224x224 using ``cv2.INTER_AREA`` and written to ``desDirectory`` as
    ``pic_<wordsampleIndex_int>_<frame number>.png`` (frame numbers start at 0).

    Parameters
    ----------
    videoName : str
        Path of the video file to read.
    desDirectory : str
        Directory the PNG files are written to. It is not created here.
    wordsampleIndex_int : int
        Value embedded in each file name to tell the source videos apart.

    Returns
    -------
    None
        If the video cannot be opened a message is printed and nothing is
        written. A frame that cannot be written is reported and skipped.
    """
    #get video source object
    cap = cv2.VideoCapture(videoName)
    try:
        if not cap.isOpened():
            print('Something wrong with VideoCapture')
            return

        count = 0
        while True:
            #read the captured video
            ret, frame = cap.read()
            if not ret:
                #no more frames so lets exit the loop
                break

            im = cv2.resize(frame, (224, 224), interpolation = cv2.INTER_AREA)
            # No BGR->RGB conversion: the frame is written exactly as read,
            # and cv2.imwrite expects BGR channel order.
            filename = os.path.join(desDirectory,'pic_{0}_{1}.png'.format(wordsampleIndex_int,count))

            # imwrite signals failure through its return value (e.g. when the
            # destination directory is missing); surface it instead of
            # silently producing an incomplete data set.
            if not cv2.imwrite(filename, im):
                print('Could not write {}'.format(filename))
            # count tracks the frame position, so it advances even when a
            # write fails.
            count += 1
    finally:
        #release the object, even if resizing or writing raised
        cap.release()

# now for each one of these words lets process them and create folders with pictures
import time 
NumClasses = 10
baseDirectory = 'data_Images'
#lets create the destination directory
# exist_ok=True replaces the exists()/makedirs() pair: no check-then-create
# race, and the cell can be re-run safely.
os.makedirs(baseDirectory, exist_ok=True)

# Extract frames for the NumClasses words with the most samples.
for index, row in ASL_df.head(NumClasses).iterrows():
    #get the full paths of the videos
    # Sorted so that a given video always receives the same sample index
    # (and therefore the same pic_<index>_<frame>.png names) on every run.
    videoNameSources = [os.path.join(row['Full Path'],x) for x in sorted(os.listdir(row['Full Path']))]
    destDirectory = os.path.join(baseDirectory,row['Word'])
    os.makedirs(destDirectory, exist_ok=True)
    for i, videoName in enumerate(videoNameSources):
        ImagesFromVideoFile(videoName,destDirectory,i)
    print('Class : {} Images have been created'.format(row['Word']))
    

Class : ART-DESIGN Images have been created
Class : CUTE Images have been created
Class : ADVISE-INFLUENCE Images have been created
Class : PAST Images have been created
Class : GUITAR Images have been created
Class : GOVERNMENT Images have been created
Class : AFRAID Images have been created
Class : BIG Images have been created
Class : ANSWER Images have been created
Class : AGAIN Images have been created


In [3]:
# first we have to split the data into train and validation
import os
import random
baseDirectory = 'data_Images'
ValDirectory = 'data_Images_val'

# exist_ok=True replaces the exists()/makedirs() pair: no check-then-create
# race, and the cell can be re-run safely.
os.makedirs(ValDirectory, exist_ok=True)

def train_test_foldSplit(baseDirectory,ValDirectory,testSize = .2,seed = None):
    """Move a random fraction of every class folder into a validation tree.

    For each sub-directory ``<baseDirectory>/<class>``, ``int(n * testSize)``
    of its ``n`` files are picked at random and moved to
    ``<ValDirectory>/<class>`` (created if missing). The files are moved, not
    copied, so each call moves a further fraction of whatever is still left in
    ``baseDirectory``; run it once per freshly generated data set.

    Parameters
    ----------
    baseDirectory : str
        Root folder holding one sub-directory per class. Entries that are not
        directories are ignored.
    ValDirectory : str
        Root folder that receives the held-out files.
    testSize : float, default .2
        Fraction of each class to move.
    seed : int or None, default None
        When given, a private ``random.Random(seed)`` is used so the same
        files are selected on every run. When None, the module-level
        ``random`` state is used, as before.

    Raises
    ------
    ValueError
        From ``random.sample`` if ``testSize`` asks for more files than a
        class contains or for a negative number of files.

    NOTE(review): files are sampled individually, so if several files of a
    class come from the same source video they can land on both sides of the
    split -- confirm this is acceptable for the evaluation.
    """
    rng = random if seed is None else random.Random(seed)

    # Sorted listings make the selection depend only on the seed, not on the
    # order in which the filesystem happens to return entries.
    for className in sorted(os.listdir(baseDirectory)):
        baseClassFolder = os.path.join(baseDirectory,className)
        if not os.path.isdir(baseClassFolder):
            # a stray file next to the class folders is not a class
            continue

        valClassFolder = os.path.join(ValDirectory,className)
        os.makedirs(valClassFolder, exist_ok=True)

        picNames = sorted(os.listdir(baseClassFolder))
        numSampled = int(len(picNames)*testSize)
        for picName in rng.sample(picNames,numSampled):
            os.rename(os.path.join(baseClassFolder,picName),
                      os.path.join(valClassFolder,picName))

        print('Finished Splitting class {}'.format(className))
    print('Done')
        
# Hold out 20% of every class for validation. The files are moved, so running
# this cell again moves another 20% of what is still left in baseDirectory.
train_test_foldSplit(
    baseDirectory=baseDirectory,
    ValDirectory=ValDirectory,
    testSize=0.2,
)

Finished Splitting class ADVISE-INFLUENCE
Finished Splitting class AFRAID
Finished Splitting class AGAIN
Finished Splitting class ANSWER
Finished Splitting class ART-DESIGN
Finished Splitting class BIG
Finished Splitting class CUTE
Finished Splitting class GOVERNMENT
Finished Splitting class GUITAR
Finished Splitting class PAST
Done
