In [1]:
import matplotlib.pyplot as plt
import matplotlib 
import numpy as np

matplotlib.use("Agg")
import pandas as pd
import glob, cv2

import re

# Root directory of the GRID corpus data; the paths below are built from it.
home='/home/ecvol/data/lipread/gridcorpus/'
# Identifier of the clip: names both the frame directory and the .align file.
file1 = 'pwbkzn'


def natural_key(path):
    """Return a sort key that compares digit runs in ``path`` numerically.

    ``re.split`` with a capturing group yields alternating non-digit / digit
    tokens (odd positions are always the digit runs), so ``frame2`` sorts
    before ``frame10`` whether or not the numbers are zero padded.
    """
    tokens = re.split(r'(\d+)', path)
    return [int(tok) if pos % 2 else tok for pos, tok in enumerate(tokens)]


# glob.glob returns matches in arbitrary order, but the frame list is later
# indexed by frame number, so it must be put into frame order explicitly.
im1 = sorted(glob.glob(home+'video/s1/'+file1+'/*'), key=natural_key)

### Read the alignment file (ground truth)

In [9]:
# The align file is space separated with three columns per line
# (start, end, word); name the columns when loading.
gt = pd.read_csv(home+'align/'+file1+'.align', sep=' ', header=None,
                 names=['start', 'end', 'gt'])
# Dividing the raw start/end values by 1000 gives frame indices. Floor
# division keeps the original truncation for non-negative values, and int64
# avoids the silent wrap-around that uint8 would cause above index 255.
gt[['start', 'end']] = (gt[['start', 'end']] // 1000).astype('int64')
gt #start and end refer to frame indices

Unnamed: 0,start,end,gt
0,0,14,sil
1,14,22,place
2,22,28,white
3,28,31,by
4,31,36,k
5,36,46,zero
6,46,54,now
7,54,74,sil


### Generate the input tensor

In [10]:
# Flattened size of one grayscale frame; must match input_dim in the model cell.
n_pixels = 5000

# Work with plain integer frame indices, whatever dtype the columns carry.
starts = gt['start'].astype(int)
ends = gt['end'].astype(int)
# Longest word segment (in frames) sets the padded time dimension.
max_len = int((ends - starts).max())

# One zero-padded (pixels, time) slab per aligned word so all samples share a shape.
x = np.zeros((len(gt), n_pixels, max_len))

for row, (start, end) in enumerate(zip(starts, ends)):
    temp = []
    for i in range(start, end):
        img = cv2.imread(im1[i], 0)  # flag 0 -> load as single-channel grayscale
        if img is None:
            # cv2.imread signals an unreadable file by returning None, not by raising
            raise IOError('could not read frame: ' + im1[i])
        if img.size != n_pixels:
            raise ValueError('frame %s has %d pixels, expected %d' % (im1[i], img.size, n_pixels))
        # TCN expects flattened input: one column vector per frame
        temp.append(img.reshape(-1, 1))
    if not temp:
        # zero-length segment: nothing to copy, the row stays all zeros
        continue
    val = np.concatenate(temp, -1)[np.newaxis]
    print(val.shape)  #notice that the last axis corresponds to length of time in gt
    x[row, ..., :val.shape[-1]] = val  # add to 0 padded tensor

# Integer label per segment, numbered in order of first appearance of each
# word; repeated words (e.g. 'sil') share a code. For this clip this yields
# [0,1,2,3,4,5,6,0]. NOTE: codes are per-clip, not a corpus-wide vocabulary.
y = pd.factorize(gt['gt'])[0]

(1, 5000, 14)
(1, 5000, 8)
(1, 5000, 6)
(1, 5000, 3)
(1, 5000, 5)
(1, 5000, 10)
(1, 5000, 8)
(1, 5000, 20)


In [4]:
# Exchange the last two axes of the 3-D input so time comes before the
# flattened pixels: (segments, pixels, time) -> (segments, time, pixels).
x = np.swapaxes(x, 1, 2)
x.shape, y.dtype

((8, 20, 5000), dtype('int64'))

In [11]:
# Take the helper from tensorflow.keras so it comes from the same Keras
# implementation as the layers imported for the model in this notebook.
from tensorflow.keras.utils import to_categorical

# One-hot encoding is a deliberately naive label representation; a richer
# word representation (maybe BERT) would be a better target.
# Only encode integer class ids: re-running this cell on labels that are
# already one-hot would otherwise add another axis.
if y.ndim == 1:
    y = to_categorical(y)

In [6]:
# The word column was renamed from 2 to 'gt' when the align file was loaded,
# so it has to be addressed by that name (gt[2] raises KeyError on a fresh run).
gt['gt'], y

(0      sil
 1    place
 2    white
 3       by
 4        k
 5     zero
 6      now
 7      sil
 Name: 2, dtype: object,
 array([[1., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 0., 0., 0.]], dtype=float32))

In [6]:
from tensorflow.keras.layers import Dense
from tensorflow.keras import Input, Model

from tcn import TCN, tcn_full_summary

# batch_size is not referenced again in this cell (fit passes its own value);
# it is kept so the name stays defined.
batch_size, timesteps, input_dim = None, 20, 5000

# Fail early with a readable message if the data cells were skipped or run
# out of order, instead of hitting a shape error inside fit().
assert y.ndim == 2, "y must be one-hot encoded before building the model"
assert len(x) == len(y), "x and y must contain the same number of samples"
assert x.shape[1:] == (timesteps, input_dim), (
    "x must have shape (samples, %d, %d), got %r" % (timesteps, input_dim, x.shape)
)

# Size the classifier from the one-hot labels instead of a literal so it
# follows the label encoding (7 classes for this clip).
n_classes = y.shape[-1]

#https://github.com/philipperemy/keras-tcn#why-temporal-convolutional-network

i = Input((timesteps, input_dim))

# return_sequences=False: the TCN emits a single vector per sample, which the
# softmax layer turns into one class distribution per word segment.
o = TCN(kernel_size=3, activation='relu',return_sequences=False)(i)  # The TCN layers are here.
o = Dense(n_classes, activation='softmax')(o)

m = Model(inputs=[i], outputs=[o])
m.compile(optimizer='adam', loss='categorical_crossentropy')

tcn_full_summary(m, expand_residual_blocks=False)

m.fit(x, y, epochs=100,batch_size=8)

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 20, 5000)]        0         
_________________________________________________________________
residual_block_0 (ResidualBl [(None, 20, 64), (None, 2 1292480   
_________________________________________________________________
residual_block_1 (ResidualBl [(None, 20, 64), (None, 2 24704     
_________________________________________________________________
residual_block_2 (ResidualBl [(None, 20, 64), (None, 2 24704     
_________________________________________________________________
residual_block_3 (ResidualBl [(None, 20, 64), (None, 2 24704     
_________________________________________________________________
residual_block_4 (ResidualBl [(None, 20, 64), (None, 2 24704     
_________________________________________________________________
residual_block_5 (ResidualBl [(None, 20, 64), (None, 2 2470

Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7f9d3c6b32e8>

In [10]:
# Run the fitted model on the same array it was trained on; each output row
# is the softmax output for one word segment. This is a fit check only, not
# an evaluation on held-out data.
m.predict(x)

array([[1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.]], dtype=float32)

In [11]:
# Display the one-hot training targets that were passed to fit().
y

array([[1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0.]], dtype=float32)

### Sanity check: reassemble the extracted frames into a video

In [None]:
import re

file2 = 'sria6n'
# glob.glob returns files in arbitrary order; sort them so the video plays the
# frames in sequence. The key compares digit runs numerically (odd positions
# of the split are always the digit runs), so frame2 precedes frame10.
im2 = sorted(
    glob.glob('/home/ecvol/data/lipread/gridcorpus/video/s1/'+file2+'/*'),
    key=lambda p: [int(t) if k % 2 else t for k, t in enumerate(re.split(r'(\d+)', p))],
)
if not im2:
    raise FileNotFoundError('no frames found for clip ' + file2)

# The first frame fixes the output resolution.
frame = cv2.imread((im2[0]))
if frame is None:
    # cv2.imread returns None for unreadable files instead of raising
    raise IOError('could not read frame: ' + im2[0])
height, width, layers = frame.shape

# Arguments: output file, fourcc code 0, 25 frames per second, frame size.
# VideoWriter takes the size as (width, height), the reverse of frame.shape.
video = cv2.VideoWriter('temp.avi', 0, 25, (width,height))
try:
    for image in im2:
        img = cv2.imread(image)
        if img is None:
            raise IOError('could not read frame: ' + image)
        video.write(img)
finally:
    # Always close the writer, even if a frame fails to load.
    video.release()
# cv2.destroyAllWindows() was dropped: this cell opens no window, and the call
# can raise on OpenCV builds without GUI support.