In [21]:
# This notebook only preprocesses the data. The steps are:
    # 1. Importing the dataframes sourced from SQL
    # 2. Pivoting the LABEL table so that each label_value becomes a column holding its maks_probability
    # 3. Merging the clip_encode data with the pivot tables created in the previous step
    # 4. Dropping unwanted columns such as the ID columns & PSNR (VMAF is kept as the target)
    # 5. Splitting into Train and Test data, with 20% held out for Test
    # 6. Converting categorical columns to numerical ones
    # 7. Exporting the Train and Test datasets
    # /// Please use the exported datasets to build models ///

In [22]:
# Import Libraries
import pandas as pd
from sklearn import preprocessing
from joblib import dump, load
import numpy as np
from sklearn.model_selection import train_test_split

In [23]:
# Load clip & encode dataset
# Read the merged clip/encode CSV from the parent directory and preview its first row.
df_clip_encode = pd.read_csv(
    "../clip_encode_merged_data.csv",
    encoding="utf8",
)
df_clip_encode.head(n=1)

Unnamed: 0,encode_id,clip_id,encode_width,encode_height,clip_width,clip_height,clip_duration,clip_size,clip_bitrate_total,video_profile,clip_frame_rate,crf,encode_bitrate_video,nr_of_images,nr_of_image_shifts,psnr,vmaf
0,7,5,1920,1080,1920,1080,19,41249088,17350686,High 4:2:2,30000/1001,18,18667784.0,3,8,44.62155,99.406406


In [24]:
# Load Label dataset
# Read the label CSV (label_clip_id, label_value, maks_probability) and preview the first five rows.
df_label = pd.read_csv(
    "../label_data.csv",
    encoding="utf8",
)
df_label.head(n=5)

Unnamed: 0,label_clip_id,label_value,maks_probability
0,5,"chime, bell, gong",57.84
1,5,"spider web, spider's web",68.54
2,5,"flagpole, flagstaff",60.99
3,8,"mobile home, manufactured home",56.77
4,6,"African chameleon, Chamaeleo chamaeleon",81.73


In [25]:
# Load Scene Change dataset
# Read the scene-change CSV (scene_clip_id, scene_percentage, nr_of_scene_changes)
# and preview the first five rows.
df_scene = pd.read_csv(
    "../scene_change.csv",
    encoding="utf8",
)
df_scene.head(n=5)

Unnamed: 0,scene_clip_id,scene_percentage,nr_of_scene_changes
0,6,probability_0.1,11
1,6,probability_0.2,9
2,6,probability_0.3,9
3,6,probability_0.4,9
4,6,probability_0.5,5


## Pivoting the Label dataset

In [26]:
# Reshape the long label table into a wide one: one row per label_clip_id and
# one column per label_value, each cell holding the summed maks_probability
# (0 where a clip has no entry for that label).
# The aggregation is given as the string 'sum' instead of np.sum: recent pandas
# versions emit a FutureWarning when a NumPy reducer is passed as a callable,
# and the string form yields the same result.
df_label_pivot = pd.pivot_table(
    df_label,
    values='maks_probability',
    index=['label_clip_id'],
    columns=['label_value'],
    aggfunc='sum',
    fill_value=0,
)
df_label_pivot.head(5)

label_value,"African chameleon, Chamaeleo chamaeleon","American egret, great white heron, Egretta albus",Dutch oven,abaya,alp,altar,analog clock,"balance beam, beam",balloon,"ballplayer, baseball player",...,trench coat,turnstile,"vacuum, vacuum cleaner","wardrobe, closet, press","web site, website, internet site, site",wig,window screen,window shade,wine bottle,"worm fence, snake fence, snake-rail fence, Virginia fence"
label_clip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,81.73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,56.98,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,73.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Pivoting the Scene Change dataset

In [27]:
# Reshape the long scene-change table into a wide one: one row per
# scene_clip_id and one column per scene_percentage value, each cell holding
# the summed nr_of_scene_changes (0 where a clip has no entry for that value).
# The aggregation is given as the string 'sum' instead of np.sum: recent pandas
# versions emit a FutureWarning when a NumPy reducer is passed as a callable,
# and the string form yields the same result.
df_scene_pivot = pd.pivot_table(
    df_scene,
    values='nr_of_scene_changes',
    index=['scene_clip_id'],
    columns=['scene_percentage'],
    aggfunc='sum',
    fill_value=0,
)
df_scene_pivot.head(5)

scene_percentage,probability_0.1,probability_0.2,probability_0.3,probability_0.4,probability_0.5,probability_0.6,probability_0.7,probability_0.8,probability_0.9
scene_clip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
6,11,9,9,9,5,3,0,0,0
7,1826,114,88,56,24,6,2,0,0
8,1,1,1,1,1,1,1,0,0
9,11,9,9,9,5,3,0,0,0
11,11,9,9,9,5,3,0,0,0


## Merge datasets: clip & encode & scene with label on clip_id

In [28]:
# Attach the label columns and then the scene-change columns to every encode row,
# matching on the clip id. Both joins are inner joins, so only rows whose clip_id
# is present in both pivot tables are kept.
df_merged = (
    df_clip_encode
    .merge(df_label_pivot, how='inner', left_on='clip_id', right_on='label_clip_id')
    .merge(df_scene_pivot, how='inner', left_on='clip_id', right_on='scene_clip_id')
)
df_merged.head()

Unnamed: 0,encode_id,clip_id,encode_width,encode_height,clip_width,clip_height,clip_duration,clip_size,clip_bitrate_total,video_profile,...,"worm fence, snake fence, snake-rail fence, Virginia fence",probability_0.1,probability_0.2,probability_0.3,probability_0.4,probability_0.5,probability_0.6,probability_0.7,probability_0.8,probability_0.9
0,140,6,1920,1080,1920,1080,19,378608416,159254815,High 4:2:2,...,0.0,11,9,9,9,5,3,0,0,0
1,141,6,1920,1080,1920,1080,19,378608416,159254815,High 4:2:2,...,0.0,11,9,9,9,5,3,0,0,0
2,142,6,1920,1080,1920,1080,19,378608416,159254815,High 4:2:2,...,0.0,11,9,9,9,5,3,0,0,0
3,143,6,1920,1080,1920,1080,19,378608416,159254815,High 4:2:2,...,0.0,11,9,9,9,5,3,0,0,0
4,144,6,1920,1080,1920,1080,19,378608416,159254815,High 4:2:2,...,0.0,11,9,9,9,5,3,0,0,0


In [29]:
# drop unwanted columns
# Remove the two identifier columns and PSNR from the merged frame.
# Alternative kept for reference (not applied): additionally drop
# 'nr_of_images' and 'nr_of_image_shifts'.
df_merged = df_merged.drop(columns=['encode_id', 'clip_id', 'psnr'])

In [30]:
#drop NULL rows
#here I simply dropped the null values and ofcourse lost some information. Later I will try to ampute data with mean.
# Discard every row that contains at least one missing value. The shape is
# printed before and after so the number of removed rows is visible.
# This simply throws information away; imputing the gaps (e.g. with the mean)
# is a possible later improvement.
print(df_merged.shape)
df_merged = df_merged.dropna(axis=0, how='any')
print(df_merged.shape)

(5234, 160)
(4601, 160)


In [31]:
# Preview the cleaned frame (head() shows the first five rows by default).
df_merged.head()

Unnamed: 0,encode_width,encode_height,clip_width,clip_height,clip_duration,clip_size,clip_bitrate_total,video_profile,clip_frame_rate,crf,...,"worm fence, snake fence, snake-rail fence, Virginia fence",probability_0.1,probability_0.2,probability_0.3,probability_0.4,probability_0.5,probability_0.6,probability_0.7,probability_0.8,probability_0.9
0,1920,1080,1920,1080,19,378608416,159254815,High 4:2:2,30000/1001,18,...,0.0,11,9,9,9,5,3,0,0,0
1,1920,1080,1920,1080,19,378608416,159254815,High 4:2:2,30000/1001,19,...,0.0,11,9,9,9,5,3,0,0,0
2,1920,1080,1920,1080,19,378608416,159254815,High 4:2:2,30000/1001,20,...,0.0,11,9,9,9,5,3,0,0,0
3,1920,1080,1920,1080,19,378608416,159254815,High 4:2:2,30000/1001,23,...,0.0,11,9,9,9,5,3,0,0,0
4,1920,1080,1920,1080,19,378608416,159254815,High 4:2:2,30000/1001,25,...,0.0,11,9,9,9,5,3,0,0,0


## Test & Train Split

In [32]:
# The target is the 'vmaf' column, kept as a one-column frame; every other column is a feature.
df_X = df_merged.drop(columns=['vmaf'])
df_Y = df_merged[['vmaf']]

# Hold out 20% of the rows for testing; random_state=0 makes the shuffle repeatable.
# NOTE(review): the split is row-wise and clip_id is no longer available at this
# point, so encodes of the same clip may land in both train and test -- confirm
# that this is acceptable for the downstream models.
df_train, df_test, df_train_y, df_test_y = train_test_split(
    df_X,
    df_Y,
    test_size=0.2,
    random_state=0,
)
print(df_train.shape, df_test.shape)
print(df_train_y.shape, df_test_y.shape)

(3680, 159) (921, 159)
(3680, 1) (921, 1)


## Label Binarizer: Converting a set of categorical columns to numeric columns

In [33]:
# Replace each categorical column with indicator columns produced by a
# LabelBinarizer. The binarizer is fitted on the training split only, pickled
# as "<column>_label_binarizer.pkl", and then applied to both splits so that
# train and test end up with the same new columns.
categorical_columns = ['encode_width', 'encode_height', 'clip_width', 'clip_height', 'video_profile', 'clip_frame_rate']

for column in categorical_columns:
    lb = preprocessing.LabelBinarizer()
    lb.fit(df_train[column].to_numpy())
    dump(lb, f"{column}_label_binarizer.pkl")

    binarized_array_train = lb.transform(df_train[column].to_numpy())
    binarized_array_test = lb.transform(df_test[column].to_numpy())

    # LabelBinarizer emits one column per class, except that a two-class
    # feature is collapsed into a single indicator column for the second
    # class. Naming the columns from every entry of classes_ would then not
    # match the array width and the DataFrame constructor would raise, so the
    # names are trimmed to the class that the single column represents.
    encoded_classes = list(lb.classes_)
    if len(encoded_classes) == 2 and binarized_array_train.shape[1] == 1:
        encoded_classes = encoded_classes[1:]
    column_names = [f"{column}_{i}" for i in encoded_classes]

    # Build the indicator frames on the original row index so concat aligns
    # them row-for-row with the split they came from.
    binarized_df_train = pd.DataFrame(data=binarized_array_train, columns=column_names, index=df_train.index)
    binarized_df_test = pd.DataFrame(data=binarized_array_test, columns=column_names, index=df_test.index)

    # Drop the raw categorical column and append its indicators; the new
    # columns go at the end of the frame.
    df_train = pd.concat([df_train.drop(columns=[column]), binarized_df_train], axis=1, sort=False)
    df_test = pd.concat([df_test.drop(columns=[column]), binarized_df_test], axis=1, sort=False)

    print(column + ' is converted')
    print(df_train.shape, df_test.shape)

encode_width is converted
(3680, 165) (921, 165)
encode_height is converted
(3680, 170) (921, 170)
clip_width is converted
(3680, 170) (921, 170)
clip_height is converted
(3680, 173) (921, 173)
video_profile is converted
(3680, 177) (921, 177)
clip_frame_rate is converted
(3680, 182) (921, 182)


## Export datasets: Test & Train

In [34]:
# Write the training features to CSV with a header row and without the index.
# to_csv returns None when it is given a path, so export_df_train is just None.
export_df_train = df_train.to_csv("export_df_train.csv", index=False, header=True)

In [35]:
# Write the test features to CSV with a header row and without the index.
# to_csv returns None when it is given a path, so export_df_test is just None.
export_df_test = df_test.to_csv("export_df_test.csv", index=False, header=True)

In [36]:
# Write the training target (vmaf) to CSV with a header row and without the index.
# to_csv returns None when it is given a path, so export_df_train_y is just None.
export_df_train_y = df_train_y.to_csv("export_df_train_y.csv", index=False, header=True)

In [37]:
# Write the test target (vmaf) to CSV with a header row and without the index.
# to_csv returns None when it is given a path, so export_df_test_y is just None.
export_df_test_y = df_test_y.to_csv("export_df_test_y.csv", index=False, header=True)

### / please use exported datasets to build models /