In [None]:
"""
Dataset:
https://archive.ics.uci.edu/ml/datasets/Online+Video+Characteristics+and+Transcoding+Time+Dataset

Online Video Characteristics Dataset for YouTube Videos - has 2 tsv files:
The first contains 10 columns of fundamental 
video characteristics for 1.6 million youtube videos;
That file - youtube_videos.tsv - is used in this notebook

#######
Video quality is measured as video bitrate (Kbps).
Only the columns (parameters) that affect the video bitrate are retained in the dataframe; the others are dropped.
"""

In [193]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

In [194]:
# Load the tab-separated YouTube video characteristics file into a DataFrame.
vdf = pd.read_csv(
    'online_video_dataset/youtube_videos.tsv',
    sep='\t',
    encoding='utf-8',
)

In [195]:
# Show the (rows, columns) shape of the freshly loaded frame.
print("Dataframe Shape:",vdf.shape,"\n")

Dataframe Shape: (168286, 11) 



In [196]:
# Preview the first rows to check the column names and value formats.
vdf.head()

Unnamed: 0,id,duration,bitrate,bitrate(video),height,width,frame rate,frame rate(est.),codec,category,url
0,uDNj-_5ty48,267,373,274,568,320,29.97,0.0,h264,Music,http://r2---sn-ovgq0oxu-5goe.c.youtube.com/vid...
1,uDNj-_5ty48,267,512,396,480,270,29.97,29.97,h264,Music,http://r2---sn-ovgq0oxu-5goe.c.youtube.com/vid...
2,uDNj-_5ty48,267,324,263,400,226,29.97,29.97,flv1,Music,http://r2---sn-ovgq0oxu-5goe.c.youtube.com/vid...
3,uDNj-_5ty48,267,85,55,176,144,12.0,12.0,mpeg4,Music,http://r2---sn-ovgq0oxu-5goe.c.youtube.com/vid...
4,WCgt-AactyY,31,1261,1183,640,480,24.0,0.0,h264,People & Blogs,http://r1---sn-ovgq0oxu-5goe.c.youtube.com/vid...


In [197]:
# Remove the columns that are not used as model inputs.
# Rebinding `vdf` (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: executing it a second time no longer raises
# KeyError for columns that are already gone.
vdf = vdf.drop(
    columns=['id', 'duration', 'frame rate(est.)', 'url', 'category', 'codec'],
    errors='ignore',
)

In [198]:
# Keep the five modelling columns with the target, bitrate(video), first;
# then echo the resulting shape and a short preview.
vdf = vdf.loc[:, ["bitrate(video)", "bitrate", "height", "width", "frame rate"]]
print(vdf.shape)
vdf.head()

(168286, 5)


Unnamed: 0,bitrate(video),bitrate,height,width,frame rate
0,274,373,568,320,29.97
1,396,512,480,270,29.97
2,263,324,400,226,29.97
3,55,85,176,144,12.0
4,1183,1261,640,480,24.0


In [199]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
vdf.describe()

Unnamed: 0,bitrate(video),bitrate,height,width,frame rate
count,168286.0,168286.0,168286.0,168286.0,168286.0
mean,624.363025,730.62149,561.018706,368.399701,24.564592
std,860.955654,919.15473,359.071569,201.27418,7.396615
min,0.0,0.0,100.0,88.0,0.0
25%,231.0,289.0,320.0,240.0,23.98
50%,349.0,459.0,480.0,360.0,29.92
75%,640.0,826.0,640.0,480.0,29.97
max,22229.0,22421.0,2592.0,1944.0,59.94


In [200]:
# Check that every remaining column is numeric before converting to float.
print(vdf.dtypes)

bitrate(video)      int64
bitrate             int64
height              int64
width               int64
frame rate        float64
dtype: object


In [152]:
### Tried to use LabelEncoder with the codec (string) column included.
### However, the prediction for unknown data then came out as all zeros.

### The codec column is a string object, so LabelEncoder would have to be
### applied to it before it could be converted to float for normalization.

#from sklearn import preprocessing
#from sklearn.preprocessing import LabelEncoder
#from sklearn_pandas import DataFrameMapper, cross_val_score
#from collections import defaultdict
#vdf_cols=["bitrate(video)","bitrate","height","width","frame rate","codec"]
#le=preprocessing.LabelEncoder()
#cols = [(vdf_cols[i], LabelEncoder()) for i,col in enumerate(vdf_cols)]
#mapper = DataFrameMapper(cols)
#fit_array=mapper.fit_transform(vdf.copy())
#d=defaultdict(LabelEncoder)
#fit_vdf=vdf.apply(lambda x: d[x.name].fit_transform(x))

In [201]:
from sklearn import preprocessing

# Rescale every column (features and target alike) with a MinMaxScaler and
# keep the fitted scaler so new samples can be transformed and predictions
# inverse-transformed later.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so test-set minima/maxima influence the scaling -- consider
# fitting on the training rows only.
float_array = vdf.values.astype(float)
min_max_scaler = preprocessing.MinMaxScaler()
scaled_array = min_max_scaler.fit_transform(float_array)

# Take the labels and index straight from vdf instead of repeating the column
# names by hand, so the scaled frame cannot drift out of sync with its source.
vdf_normalized = pd.DataFrame(scaled_array, columns=vdf.columns, index=vdf.index)
print("Dataframe shape: ",vdf_normalized.shape,"\n")  # same (rows, columns) as vdf

Dataframe shape:  (168286, 5) 



In [202]:
# Preview the min-max scaled frame. The two commented lines below are
# leftovers from the abandoned LabelEncoder experiment (fit_vdf).
#print(fit_vdf.shape)
#fit_vdf.head()
vdf_normalized.head()

Unnamed: 0,bitrate(video),bitrate,height,width,frame rate
0,0.012326,0.016636,0.187801,0.125,0.5
1,0.017815,0.022836,0.152488,0.09806,0.5
2,0.011831,0.014451,0.120385,0.074353,0.5
3,0.002474,0.003791,0.030498,0.030172,0.2002
4,0.053219,0.056242,0.216693,0.211207,0.4004


In [203]:
#fit_vdf.apply(lambda x: d[x.name].inverse_transform(x))

In [129]:
#print(vdf_normalized[0:5])
#print(vdf_normalized[-5:])

In [204]:
# Features: every scaled column except the target.
X = vdf_normalized.drop(columns=['bitrate(video)'])
# Target: the scaled video bitrate.
y = vdf_normalized.loc[:, 'bitrate(video)']

In [205]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

# Report the shape of each part: the X parts carry the four feature columns,
# the y parts are one-dimensional.
for label, part in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(label + " shape: ", part.shape)

X_train shape:  (126214, 4)
y_train shape:  (126214,)
X_test shape:  (42072, 4)
y_test shape:  (42072,)


In [206]:
# Candidate regressors, keyed by the label that is printed in the comparison.
import math
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoLars
from sklearn.linear_model import BayesianRidge

# NOTE(review): in the recorded run Lasso and LassoLars score an identical
# RMSE of about 0.0386, which is roughly the scaled standard deviation of the
# target -- this suggests alpha=0.1 shrinks every coefficient to zero on the
# min-max scaled data. Confirm, and consider a much smaller alpha.
d_models = dict(
    Linear_Regression=LinearRegression(),
    Ridge=Ridge(alpha=0.5),
    Lasso=Lasso(alpha=0.1),
    LassoLars=LassoLars(alpha=0.1),
    BayesianRidge=BayesianRidge(),
)

# Despite its name this is a dict_keys view, not a list.
models_list = d_models.keys()
print(models_list)

dict_keys(['Linear_Regression', 'Ridge', 'Lasso', 'LassoLars', 'BayesianRidge'])


In [207]:
# Fit every candidate on the training split and print its RMSE on the test
# split. The error is in min-max scaled units, not Kbps.
for regression_model, regressor in d_models.items():
    regressor.fit(X_train, y_train)
    y_predict = regressor.predict(X_test)
    # scikit-learn metrics are documented as (y_true, y_pred). MSE happens to
    # be symmetric, so the number is unchanged, but the documented order keeps
    # the call correct if the metric is ever swapped for an asymmetric one.
    regression_model_mse = mean_squared_error(y_test, y_predict)
    print(regression_model," ",math.sqrt(regression_model_mse))

# The model with the smallest root-mean-squared error is the best fit.
# In the recorded run that is Linear Regression.

Linear_Regression   0.007415520125717773
Ridge   0.007420566178463407
Lasso   0.03857602059335691
LassoLars   0.03857602059335691
BayesianRidge   0.007415522481238007


In [208]:
# Refit the best-scoring model on its own so its parameters can be inspected.
from sklearn.linear_model import LinearRegression

lm = LinearRegression().fit(X_train, y_train)

# Intercept, per-feature coefficients, and how many coefficients there are.
print(lm.intercept_)
print(lm.coef_)
print("Number of coefficients is ", len(lm.coef_))

-0.0015579309463007798
[ 0.8645261   0.01772181  0.00653458 -0.00683659]
Number of coefficients is  4


In [209]:
import math
from sklearn.metrics import mean_squared_error

# Score the refitted linear model on the held-out rows.
y_predict = lm.predict(X_test)
# Pass the arguments in scikit-learn's documented (y_true, y_pred) order.
# The value is identical for MSE, which is symmetric.
regression_model_mse = mean_squared_error(y_test, y_predict)
print(math.sqrt(regression_model_mse))  # RMSE in min-max scaled units, not Kbps
print(y_predict)

0.007415520125717773
[0.00966152 0.02312663 0.0157859  ... 0.03263172 0.0092249  0.02128569]


In [210]:
# Try to make predictions on unknown data
#Put bitrate(video) = 1300 as dummy value
# bitrate-1087 
# height in pixels – 854
# width in pixels – 480
# frame rate – 29.97

In [211]:
# One hand-made sample, laid out in the same column order the scaler was
# fitted on. Built as a flat vector and reshaped into a single-row 2-D array.
first_test = np.array([
    1300,    # bitrate(video): dummy value, overwritten by the prediction later
    1087,    # bitrate
    854,     # height
    480,     # width
    29.97,   # frame rate
]).reshape(1, -1)
print("Starting predict request:")
first_test

Starting predict request:


array([[1300.  , 1087.  ,  854.  ,  480.  ,   29.97]])

In [212]:
####NOT FOR MIN_MAX_SCALER

#le=preprocessing.LabelEncoder()
#cols = [(first_test_cols[i], LabelEncoder()) for i,col in enumerate(first_test_cols)]
#mapper = DataFrameMapper(cols)
#first_test_fit_arr=mapper.fit_transform(first_test.copy())
#first_test_float_arr=first_test_fit_arr.astype(float)
#first_test_transform = mapper.fit_transform(first_test) # apply the same transform
#first_test_transform=first_test.apply(lambda x: d[x.name].fit_transform(x))
#first_test_transform=le.transform(first_test)
#print("Normalized new test row :",first_test_transform)

In [213]:
# Scale the sample with the scaler that was already fitted on the dataset
# (transform only, no refit) so it is on the same scale as the training data.
first_test_transform = min_max_scaler.transform(first_test) # apply the same transform
print("Normalized new test row :",first_test_transform)

Normalized new test row : [[0.05848216 0.04848133 0.30256822 0.2112069  0.5       ]]


In [214]:
# Skip column 0 (the bitrate(video) placeholder) and keep the four features.
test_1 = list(first_test_transform[0, 1:5])
# The model is given a list of rows, so wrap the single row in an outer list.
test_seq = [test_1]
test_seq

[[0.048481334463226444, 0.3025682182985554, 0.21120689655172414, 0.5]]

In [215]:
# Predict the scaled bitrate(video) for the single prepared row.
test_result = lm.predict(test_seq) # apply model prediction
print("Test result bitrate(video) (still normalized): ",test_result[0]) # still on the scaler's scale, not yet in Kbps

Test result bitrate(video) (still normalized):  0.043679360875558106


In [216]:
# Overwrite the placeholder in column 0 with the model's prediction so the
# complete row can be passed back through the scaler.
first_test_transform[0, 0] = test_result[0]
print("Test row with predicted (still normalized): ", first_test_transform)

Test row with predicted (still normalized):  [[0.04367936 0.04848133 0.30256822 0.2112069  0.5       ]]


In [218]:
# Map the row back to original units; column 0 now holds the predicted
# bitrate(video) instead of the dummy value.
result = min_max_scaler.inverse_transform(first_test_transform) # apply inverse transform
print("Final test result: ",result)
# The actual value is 1187.
# NOTE(review): the recorded run predicts about 971, roughly 18% below 1187 --
# confirm whether that counts as close enough for this use.

Final test result:  [[ 970.9485129 1087.         854.         480.          29.97     ]]
