# VIDEO CLASSIFICATION


#### Video classification is the process of automatically assigning a video to one or more predefined categories,groups or labels based on its content. 

### Directory

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as olt
import os

# location of my training dataset
train_path=os.listdir('D:/nlp/vcdataset/train')

# labels
label=os.listdir('D:/nlp/vcdataset/train')
print(label)

['badminton', 'cricket', 'football', 'swimming', 'tennis', 'volleyball']


### Preparing Training Data

In [2]:
datas=[]

for item in train_path:
    #get all the file names
    all_datas=os.listdir('D:/nlp/vcdataset/train'+'/'+item)
 
    #Add them to the list
    for data in all_datas:
        datas.append((item,str('D:/nlp/vcdataset/train'+'/'+item)+'/'+data))
    
#Build a dataframe for training 
train_df=pd.DataFrame(data=datas,columns=['tag','video_name'])
print(train_df.head())
print(train_df.tail())

         tag                                video_name
0  badminton   D:/nlp/vcdataset/train/badminton/v1.mp4
1  badminton  D:/nlp/vcdataset/train/badminton/v10.mp4
2  badminton   D:/nlp/vcdataset/train/badminton/v2.mp4
3  badminton   D:/nlp/vcdataset/train/badminton/v3.mp4
4  badminton   D:/nlp/vcdataset/train/badminton/v4.mp4
           tag                                video_name
55  volleyball  D:/nlp/vcdataset/train/volleyball/v5.mp4
56  volleyball  D:/nlp/vcdataset/train/volleyball/v6.mp4
57  volleyball  D:/nlp/vcdataset/train/volleyball/v7.mp4
58  volleyball  D:/nlp/vcdataset/train/volleyball/v8.mp4
59  volleyball  D:/nlp/vcdataset/train/volleyball/v9.mp4


In [3]:
df=train_df.loc[:,['video_name','tag']]
df
df.to_csv('train.csv')

### Preparing Testing Dataset

In [8]:
# location of my testing dataset
test_path=os.listdir('D:/nlp/vcdataset/test')
print(test_path)

# labels
label=os.listdir('D:/nlp/vcdataset/test')
print("categorization:",len(test_path))# no of labels

datas=[]

for item in test_path:
    #get all the filenames
    all_datas=os.listdir('D:/nlp/vcdataset/test'+'/'+item)
    
    #Add them to the list
    for data in all_datas:
        datas.append((item,str('D:/nlp/vcdataset/test'+'/'+item)+'/'+data))

test_df=pd.DataFrame(data=datas,columns=['tag','video_name'])
print(test_df.head())
print(test_df.tail())

['badminton', 'cricket', 'football', 'swimming', 'tennis', 'volleyball']
categorization: 6
         tag                                         video_name
0  badminton  D:/nlp/vcdataset/test/badminton/15 Badminton R...
1    cricket  D:/nlp/vcdataset/test/cricket/Direct Hit! Some...
2   football  D:/nlp/vcdataset/test/football/Newcastle Unite...
3   swimming  D:/nlp/vcdataset/test/swimming/How to Swim Fre...
4     tennis  D:/nlp/vcdataset/test/tennis/The Most Brutal A...
          tag                                         video_name
1     cricket  D:/nlp/vcdataset/test/cricket/Direct Hit! Some...
2    football  D:/nlp/vcdataset/test/football/Newcastle Unite...
3    swimming  D:/nlp/vcdataset/test/swimming/How to Swim Fre...
4      tennis  D:/nlp/vcdataset/test/tennis/The Most Brutal A...
5  volleyball  D:/nlp/vcdataset/test/volleyball/Highlights _ ...


In [9]:
df=test_df.loc[:,['video_name','tag']]
df
df.to_csv('test.csv')

In [5]:
!pip install git+https://github.com/tensorflow/docs

Collecting git+https://github.com/tensorflow/docs
  Cloning https://github.com/tensorflow/docs to c:\users\hebli\appdata\local\temp\pip-req-build-um9dvjk5
  Resolved https://github.com/tensorflow/docs to commit 393d7b6fa6aa68a8e01013582cbdb5c0fe4fde1c
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/tensorflow/docs 'C:\Users\hebli\AppData\Local\Temp\pip-req-build-um9dvjk5'


### Importing libraries

In [10]:
from tensorflow_docs.vis import embed # used for visualizing tensorflow graphs
from tensorflow import keras
from imutils import paths # used for editing like croping,resizing the performance done by opencv

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import cv2 # opencv for image processing 
import os #read or write on an image

In [11]:
#to limit the amount of memory that tensorflow can use in gpu
#we use gpu so it saves a lot of time for training or computational works
gpus=tf.config.experimental.list_physical_devices('GPU')# to list the gpu devices that tensorflow can see
if gpus:
    try:
        tf.config.experimental.set_virtual_device_configuration(
        gpus[0],[tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)])# Limiting memory to 5120
    except RuntimeError as e:
        print(e)

### Data Preparation

In [12]:
train_df=pd.read_csv('train.csv')
test_df=pd.read_csv('test.csv')

print("Total videos for training:",len(train_df))
print("Total videos for testing:",len(test_df))

train_df.sample(10)

Total videos for training: 60
Total videos for testing: 6


Unnamed: 0.1,Unnamed: 0,video_name,tag
25,25,D:/nlp/vcdataset/train/football/v5.mp4,football
21,21,D:/nlp/vcdataset/train/football/v10.mp4,football
49,49,D:/nlp/vcdataset/train/tennis/v9.mov,tennis
39,39,D:/nlp/vcdataset/train/swimming/v9.mov,swimming
36,36,D:/nlp/vcdataset/train/swimming/v6.mov,swimming
53,53,D:/nlp/vcdataset/train/volleyball/v3.mp4,volleyball
55,55,D:/nlp/vcdataset/train/volleyball/v5.mp4,volleyball
51,51,D:/nlp/vcdataset/train/volleyball/v10.mp4,volleyball
19,19,D:/nlp/vcdataset/train/cricket/v9.mov,cricket
24,24,D:/nlp/vcdataset/train/football/v4.mp4,football


### Frame Extraction

In [13]:
IMG_SIZE=224

# to crop the image in center squared
def crop_center_square(frame):
    y,x=frame.shape[0:2]
    min_dim=min(y,x)
    start_x=(x//2)-(min_dim//2)
    start_y=(y//2)-(min_dim//2)
    return frame[start_y :start_y+min_dim , start_x :start_x+min_dim]


# frame extraction #OpenCV
def load_video(path,max_frames=0,resize=(IMG_SIZE,IMG_SIZE)):
    cap=cv2.VideoCapture(path)#capture the video
    frames=[]
    try:
        while True:
            ret, frame =cap.read() #reading frames in loop
            if not ret:
                break
            frame=crop_center_square(frame) # to crop each frame using the function
            frame =cv2.resize(frame,resize)# to resize them according to imagesize 
            frame=frame[:,:,[2,1,0]]#[width,height,color channel[rgb]]
            frames.append(frame)
            
            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)# converting the frames into numpy arrays

### Feature Extraction

In [14]:
def build_feature_extractor():
    # we have used pre-trained Inception v3 model in CNN for feature extraction 
    feature_extractor =keras.applications.InceptionV3(
    weights='imagenet',
    include_top=False,
    pooling='avg',
    input_shape=(IMG_SIZE,IMG_SIZE,3),
    )
    
    # preprocess input images provide by inception v3 model
    preprocess_input = keras.applications.inception_v3.preprocess_input
    
    inputs=keras.Input((IMG_SIZE,IMG_SIZE,3)) #[width,height,color channel]
    preprocessed = preprocess_input(inputs)
    
    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs,name='feature_extractor')

feature_extractor =build_feature_extractor()

### Label Encoding

In [15]:
# to convert a string label into integer and num_oov_indices=0 fro unknown vocabularies
label_processor=keras.layers.StringLookup(num_oov_indices=0,vocabulary=np.unique(train_df['tag']))
print(label_processor.get_vocabulary())

labels=train_df['tag'].values
labels= label_processor(labels[...,None]).numpy()#convert to numpy array
labels

['badminton', 'cricket', 'football', 'swimming', 'tennis', 'volleyball']


array([[0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [4],
       [4],
       [4],
       [4],
       [4],
       [4],
       [4],
       [4],
       [4],
       [4],
       [5],
       [5],
       [5],
       [5],
       [5],
       [5],
       [5],
       [5],
       [5],
       [5]], dtype=int64)

### Hyper parameters

In [16]:
#hhyperparameters

IMG_SIZE=224
BATCH_SIZE=64#no of examples fed to network
EPOCHS=100# no of learning
MAX_SEQ_LENGTH=20#no of frame extracted from each video
NUM_FEATURES=2048# no of features extracted from each frame of video

In [17]:
def prepare_all_videos(df,root_dir):
    num_samples=len(df)
    video_paths=df['video_name'].values.tolist()
    
    labels=df['tag'].values
    
    #label encoding the tag columns
    labels=label_processor(labels[...,None]).numpy()#convert to numpy array
    
    #frame_masks and frame_features are what we will feed to our sequence model.
    #frame_masks will contain a bunch of booleans if the timesteps is masked with padding or not.
    #so both will create an empty array with given dimension for later use
    frame_masks=np.zeros(shape=(num_samples,MAX_SEQ_LENGTH),dtype='bool')
    frame_features=np.zeros(shape=(num_samples,MAX_SEQ_LENGTH,NUM_FEATURES),dtype='float32')
    
    #for each video
    for idx,path in enumerate(video_paths):
        #Gather all its frames and add a batch dimension.
        frames = load_video(os.path.join(root_dir,path))
        frames = frames[None, ...]
        
        # Initialize placeholders to store the masks adn feature of the current video
        # so again both will create a empty array with given dimension for later use
        temp_frame_mask = np.zeros(shape=(1,MAX_SEQ_LENGTH,),dtype='bool')
        temp_frame_features=np.zeros(
        shape=(1,MAX_SEQ_LENGTH,NUM_FEATURES),dtype='float32'
        )
        
        #Extract feature  from frames of current video
        for i,batch in enumerate(frames):
            video_length = batch.shape[0]
            length =min(MAX_SEQ_LENGTH,video_length)
            for j in range(length):
                temp_frame_features[i,j,:]= feature_extractor.predict (
                  batch[None,j, :]
                )
            temp_frame_mask[i,:length]=1 # 1=not masked(not padded) ,0=masked(padded so should be ignored)
            
        frame_features[idx,]=temp_frame_features.squeeze()
        frame_masks[idx,]=temp_frame_mask.squeeze()
        
    return (frame_features,frame_masks),labels

train_data,train_labels = prepare_all_videos(train_df,'train')
test_data,test_labels= prepare_all_videos(test_df,'test')


print("Frame features in train set:",train_data[0].shape)
print("Frame masks in train set:",train_data[1].shape)

print("train_labels in train set:",train_labels.shape)
print("test_labels in train set:",test_labels.shape)
            

















Frame features in train set: (60, 20, 2048)
Frame masks in train set: (60, 20)
train_labels in train set: (60, 1)
test_labels in train set: (6, 1)


### Sequence modeling

In [18]:
def get_sequence_model():
    class_vocab=label_processor.get_vocabulary()
    #give shape of input expected by the output
    frame_features_input = keras.Input((MAX_SEQ_LENGTH,NUM_FEATURES))
    mask_input= keras.Input((MAX_SEQ_LENGTH,),dtype='bool')
    
    
    
    x=keras.layers.LSTM(16,return_sequences=True)(frame_features_input,mask=mask_input)# to return sequence of output model
    x=keras.layers.LSTM(8)(x)# to return final output of sequence
    x=keras.layers.Dropout(0.4)(x)#drop random layers to prevent overfitting
    x=keras.layers.Dense(8, activation='relu')(x)# linear transformation to input layer using relu
    output=keras.layers.Dense(len(class_vocab),activation='softmax')(x)# to compute final probabilities
    
    
    rnn_model = keras.Model([frame_features_input, mask_input], output)
    
    rnn_model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
    
    return rnn_model


EPOCHS=30
#Utility for running experiments.
def run_experiment():
    filepath='./tmp/video_classifier'
    checkpoint=keras.callbacks.ModelCheckpoint(
      filepath,save_weights_only=True,save_best_only=True,verbose=1 #to save best weights of the model
    )
    
    seq_model=get_sequence_model()
    history =seq_model.fit(
        [train_data[0],train_data[1]],
        train_labels,
        validation_split=0.3,
        epochs=EPOCHS,
        callbacks=[checkpoint],
    )
    
    seq_model.load_weights(filepath)
    _, accuracy=seq_model.evaluate([test_data[0],test_data[1]],test_labels) #evaluate accuracy
    print("Test accuracy:",round(accuracy*100,2),"%")
    
    return history ,seq_model


_, sequence_model= run_experiment()

Epoch 1/30
Epoch 1: val_loss improved from inf to 1.76802, saving model to ./tmp\video_classifier
Epoch 2/30
Epoch 2: val_loss did not improve from 1.76802
Epoch 3/30
Epoch 3: val_loss did not improve from 1.76802
Epoch 4/30
Epoch 4: val_loss did not improve from 1.76802
Epoch 5/30
Epoch 5: val_loss did not improve from 1.76802
Epoch 6/30
Epoch 6: val_loss did not improve from 1.76802
Epoch 7/30
Epoch 7: val_loss did not improve from 1.76802
Epoch 8/30
Epoch 8: val_loss did not improve from 1.76802
Epoch 9/30
Epoch 9: val_loss did not improve from 1.76802
Epoch 10/30
Epoch 10: val_loss did not improve from 1.76802
Epoch 11/30
Epoch 11: val_loss did not improve from 1.76802
Epoch 12/30
Epoch 12: val_loss did not improve from 1.76802
Epoch 13/30
Epoch 13: val_loss did not improve from 1.76802
Epoch 14/30
Epoch 14: val_loss did not improve from 1.76802
Epoch 15/30
Epoch 15: val_loss did not improve from 1.76802
Epoch 16/30
Epoch 16: val_loss improved from 1.76802 to 1.74700, saving model 

Test accuracy: 50.0 %


### Inference

In [20]:
def prepare_single_video(frames):
    frames=frames[None,...]
    frame_mask=np.zeros(shape=(1,MAX_SEQ_LENGTH,),dtype='bool')
    frame_features = np.zeros(shape=(1,MAX_SEQ_LENGTH,NUM_FEATURES),dtype='float32')
    
    
    for i,batch in enumerate(frames):
        video_length=batch.shape[0]
        length=min(MAX_SEQ_LENGTH,video_length)
        for j in range(length):
            frame_features[i,j,:]=feature_extractor.predict(batch[None, j,:])
        frame_mask[i,:length] =1   #1 = not masked , 0 = masked
        
    return frame_features, frame_mask


def sequence_prediction(path):
    class_vocab=label_processor.get_vocabulary()
    
    frames=load_video(os.path.join('test',path))
    frame_features,frame_mask = prepare_single_video(frames)
    probabilities=sequence_model.predict([frame_features, frame_mask])[0]
    
    for i in np.argsort(probabilities)[::-1]:
        print(f"{class_vocab[i]}:{probabilities[i]*100:5.2f}%")# probability of each class
    return frames

test_video=np.random.choice(test_df['video_name'].values.tolist())
print("Test video path:",test_video)

test_frames=sequence_prediction(test_video)

Test video path: D:/nlp/vcdataset/test/swimming/How to Swim Freestyle _ Expert tips from Olympic Champion Stephanie Rice..mp4
swimming:20.02%
tennis:18.20%
cricket:17.81%
football:15.43%
badminton:14.48%
volleyball:14.06%


In [52]:
from IPython.display import HTML

HTML("""
    <video alt="test" width="520" height="440" video controls>
        <source src="D:/nlp/dataset/test/dancing/video3.mp4" type="video/mp4" style="height:300px;width:300px">
    </video>
""")