In [None]:

import warnings
# Suppress every warning category for the rest of the session; this also hides
# deprecation and convergence warnings emitted by the libraries used below.
warnings.filterwarnings(action='ignore')

Importing necessary libraries.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
from IPython.display import display
%matplotlib inline
plt.style.use('ggplot')

# Importing data and modifying column name


In [None]:
# Training data (sheet 1) from the Kaggle input directory.
sheet1_path = '/kaggle/input/youtube-revenue-prediction/sheet1.csv'
df = pd.read_csv(sheet1_path)
df.head()

Mapping the columns with their appropriate names.

In [None]:
# Raw single-letter headers -> descriptive column names (listed alphabetically
# by raw header; 'target' is the value being predicted).
col_map = {
    'a': 'Date',
    'b': 'Average_views_per_viewer',
    'c': 'Engagement_score',
    'd': 'Unique_viewers',
    'e': 'Click_rate',
    'f': 'Impressions',
    'g': 'Impression_score',
    'h': 'Comments',
    'i': 'Shares',
    'j': 'Likes_vs_dislike',
    'k': 'Content_viewability',
    'l': 'Dislikes',
    'm': 'Subscribers_lost',
    'n': 'Subscribers_gained',
    'o': 'Like',
    'p': 'Average_viewed',
    'q': 'Videos_published',
    'r': 'Videos_added',
    's': 'Subscribers',
    't': 'Views',
    'u': 'Watch_hours',
    'v': 'Average_view_sec',
    'target': 'Revenue',
}
df = df.rename(columns=col_map)
df.head()



In [None]:
# Discard the row id and the columns that are not carried into the analysis.
df = df.drop(columns=[
    'Engagement_score',
    'Impression_score',
    'Subscribers_lost',
    'Subscribers_gained',
    'Videos_added',
    'id',
    'Likes_vs_dislike',
    'Content_viewability',
])

In [None]:

# (rows, columns) remaining after the column drop above.
df.shape


# Exploratory Data Analysis (EDA)

In [None]:
# Peek at the columns still stored as generic objects (strings).
df.select_dtypes(include='object').head(2)

Converting object-type columns into types on which the necessary operations can be performed

In [None]:

# Parse the date strings into datetime64 values.
df['Date'] = pd.to_datetime(df['Date'])
# Convert the duration strings to a length in seconds. `.dt.total_seconds()` is
# used instead of `.dt.seconds` because the latter returns only the seconds
# *component* of each timedelta and silently drops whole days.
df['Average_view_sec'] = pd.to_timedelta(df['Average_view_sec']).dt.total_seconds()


In [None]:
# Number of missing values per column, rendered as a styled table.
df.isna().sum().reset_index().style.highlight_min()

In [None]:
# Frequency of each Videos_published value, with NaN counted as its own bucket.
df['Videos_published'].value_counts(dropna=False)

Filling nan values with zero

In [None]:

# Replace missing Videos_published entries with 0.0.
df['Videos_published'] = df['Videos_published'].fillna(0.0)
# Total number of missing values left in the whole frame.
df.isna().sum().sum()

In [None]:
# Column names available after cleaning.
df.columns

In [None]:
# Correlate numeric columns only: `Date` is datetime64 at this point and
# DataFrame.corr() no longer drops non-numeric columns by default in pandas 2.x.
numeric_corr = df.select_dtypes(include='number').corr()
# Eight columns most correlated with Revenue; position 0 is Revenue itself.
top_corr_name = numeric_corr['Revenue'].sort_values(ascending=False).index[:8].values
sns.pairplot(df[top_corr_name], x_vars=top_corr_name[1:], y_vars=top_corr_name[0])
# pairplot draws a grid of axes, so plt.title would label only the last subplot;
# a figure-level title sits above the whole grid.
plt.suptitle('Top 7 Correlations with Target', fontsize=15, y=1.05)
plt.show()

> There are a few extreme values


In [None]:
# Sorted ascending, so the largest Watch_hours values appear at the end of the output.
df['Watch_hours'].sort_values()


In [None]:
# Mean Watch_hours, for comparison with the sorted values shown above.
df['Watch_hours'].mean()

In [None]:
# Keep only rows whose Watch_hours is at most 30000 (the index is not reset here).
df = df.loc[df['Watch_hours'] <= 30000]

In [None]:
# Same ranking as before, recomputed on the filtered frame. Only numeric columns
# are correlated because `Date` is datetime64 and DataFrame.corr() no longer
# drops non-numeric columns by default in pandas 2.x.
numeric_corr = df.select_dtypes(include='number').corr()
# Eight columns most correlated with Revenue; position 0 is Revenue itself.
top_corr_name = numeric_corr['Revenue'].sort_values(ascending=False).index[:8].values
sns.pairplot(df[top_corr_name], x_vars=top_corr_name[1:], y_vars=top_corr_name[0])
# Figure-level title: plt.title would label only the last subplot of the grid.
plt.suptitle('Top 7 Correlations with Target', fontsize=15, y=1.05)
plt.show()

In [None]:
# (rows, columns) after the Watch_hours filter.
df.shape

In [None]:
# Daily revenue over time with the overall mean drawn as a dashed reference line.
mean_revenue = df['Revenue'].mean()
plt.figure(figsize=(20, 6))
sns.lineplot(data=df, x='Date', y='Revenue')
plt.axhline(mean_revenue, color='c', linestyle='--', linewidth=2)
# Anchor the label by position, not by index label: rows were filtered on
# Watch_hours without resetting the index, so label 20 may no longer exist.
label_pos = min(20, len(df) - 1)
plt.annotate('Mean of Revenue: $ {}'.format(round(mean_revenue, 4)),
             (df['Date'].iloc[label_pos], 30),
             fontsize=12,
             color='c')
plt.title('YouTube Revenue by Date', fontsize=20)
plt.ylabel('Revenue (USD, $)')
plt.show()

In [None]:
# Correlation of every numeric column with Revenue, strongest first; the first
# row (Revenue with itself) is skipped. Non-numeric columns such as the datetime
# `Date` are excluded explicitly so this also runs on pandas 2.x.
df.select_dtypes(include='number').corr()['Revenue'].sort_values(ascending=False).reset_index()[1:].style.bar(align='mid')
     

In [None]:
# Scatter plus fitted regression line: Unique_viewers against Revenue.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(data=df, x='Unique_viewers', y='Revenue', ax=ax)
ax.set_title('Revenue by Unique_viewers', fontsize=20)
plt.show()

In [None]:
# Scatter plus fitted regression line: Like against Revenue.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(data=df, x='Like', y='Revenue', ax=ax)
ax.set_title('Revenue by Like', fontsize=20)
plt.show()

In [None]:
# Scatter plus fitted regression line: Watch_hours against Revenue.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(data=df, x='Watch_hours', y='Revenue', ax=ax)
ax.set_title('Revenue by Watch_hours', fontsize=20)
plt.show()

In [None]:
# Distribution of daily revenue with a kernel density overlay.
fig, ax = plt.subplots(figsize=(7, 5))
sns.histplot(data=df, x='Revenue', kde=True, ax=ax)
ax.set_title('YouTube Revenue per day', fontsize=15)
ax.set_xlabel('Revenue (USD, $)')
plt.show()

In [None]:
# Keep only rows with Revenue of at most 200 and renumber the index from 0,
# then redraw the revenue distribution.
df = df.loc[df['Revenue'] <= 200].reset_index(drop=True)
fig, ax = plt.subplots(figsize=(7, 5))
sns.histplot(data=df, x='Revenue', kde=True, ax=ax)
ax.set_title('YouTube Revenue per day', fontsize=15)
ax.set_xlabel('Revenue (USD, $)')
plt.show()

In [None]:
# (rows, columns) after the Revenue <= 200 filter.
df.shape

In [None]:
# Distribution of log-transformed revenue. log1p is used rather than log so
# zero-revenue days map to 0 instead of -inf, and so the plot shows the same
# transform (np.log1p) that the models below apply to the target.
plt.figure(figsize=(7,5))
sns.histplot(np.log1p(df['Revenue']), kde=True)
plt.title('YouTube Revenue per day', fontsize=15)
plt.xlabel('Revenue (log transformation)')
plt.show()

In [None]:
# Pairwise correlation heatmap of the numeric columns. The datetime `Date`
# column is excluded explicitly because DataFrame.corr() no longer drops
# non-numeric columns by default in pandas 2.x.
plt.figure(figsize=(10,8))
sns.heatmap(df.select_dtypes(include='number').corr(), vmin=-1, vmax=1, linewidths=.2)
plt.show()

In [None]:
# (rows, columns) of the cleaned frame.
df.shape

In [None]:
# Summary statistics restricted to mean, standard deviation, min, median and max.
df.describe().loc[['mean', 'std', 'min', '50%', 'max']]

In [None]:
# Persist the cleaned frame to the Kaggle working directory.
# NOTE(review): the row index is written as an extra unnamed first column;
# pass index=False if no consumer of this file relies on it — confirm.
df.to_csv('/kaggle/working/Data.csv')

# Feature Engineering


In [None]:
# Running total of the Subscribers column, one entry per row. A vectorised
# cumsum replaces the manual accumulation loop; the result stays a plain list
# so it can be assigned positionally as a new column.
Subs_accumulated = df['Subscribers'].cumsum().tolist()

In [None]:

# Attach the running subscriber total as a new column and preview the result.
df['Subs_accumulated'] = Subs_accumulated
df.head(2)

In [None]:

# Scatter plus fitted regression line: accumulated subscribers against Revenue.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(data=df, x='Subs_accumulated', y='Revenue', ax=ax)
ax.set_title('Revenue by Total Subscribers', fontsize=20)
plt.show()
     

In [None]:
# Show the two columns side by side. A bare last expression uses the notebook's
# rich table rendering instead of the plain-text dump that print() produces.
df[['Subs_accumulated', 'Revenue']]

In [None]:
# Scatter plus fitted regression line: Impressions against Revenue.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(data=df, x='Impressions', y='Revenue', ax=ax)
ax.set_title('Revenue by Impressions', fontsize=20)
plt.show()

In [None]:
# Scatter plus fitted regression line: Subscribers against Revenue.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(data=df, x='Subscribers', y='Revenue', ax=ax)
ax.set_title('Revenue by Subscribers', fontsize=20)
plt.show()

Impressions vs Revenue is better fitting than Subs_accumulated vs Revenue

In [None]:
# Running total of Videos_published, one entry per row. A vectorised cumsum
# replaces the manual accumulation loop; the result stays a plain list so it
# can be assigned positionally as a new column.
Videos = df['Videos_published'].cumsum().tolist()

In [None]:

# Attach the running count of published videos as a new column and preview the result.
df['Videos'] = Videos
df.head(2)

In [None]:
# How many rows carry each Videos_published value.
df['Videos_published'].value_counts()

In [None]:
# How many rows carry each cumulative Videos value.
df['Videos'].value_counts()

In [None]:

# Revenue and cumulative video count over time, with a thin vertical marker on
# every date on which at least one video was published.
fig, ax = plt.subplots(1, 1, figsize=(24, 6))

sns.lineplot(data=df, x='Date', y='Revenue', label='Revenue', lw=1.5, ax=ax)
sns.lineplot(data=df, x='Date', y='Videos', label='Videos', color='g', lw=2, linestyle='-', ax=ax)

# `> 0` also catches days with more than one upload, which `== 1` skipped.
# Only the first marker carries a legend label, so no extra line has to be
# drawn on the first date just to create the legend entry.
publish_dates = df.loc[df['Videos_published'] > 0, 'Date']
for marker_no, publish_date in enumerate(publish_dates):
    ax.axvline(publish_date, color='y', lw=0.5,
               label='Video Published' if marker_no == 0 else '_nolegend_')

mean_revenue = df['Revenue'].mean()
ax.axhline(mean_revenue, color='c', linestyle='--', linewidth=2, label='Mean of Revenue')
ax.annotate('Mean of Revenue: $ {}'.format(round(mean_revenue, 4)),
            (df['Date'].iloc[5], 30),
            fontsize=12,
            color='c')

ax.set_title('YouTube Revenue by Date', fontsize=20)
ax.set_ylabel('Revenue (USD, $)')
plt.xticks(fontsize=15)
ax.legend(fontsize='xx-large')
plt.show()

In [None]:

# Optional date cut-off, currently disabled: when enabled it keeps only rows
# dated after 2020-09-01 and renumbers the index.
#df = df[(df['Date'] > '20200901')].reset_index(drop=True)
# Display the frame as it stands before the collinearity checks.
df

# Check Variance Inflation Factor for Colinearity

In [None]:
from sklearn.linear_model import LinearRegression

def calculate_vif(df, features):
    """Compute the variance inflation factor (VIF) and tolerance per feature.

    Each feature is regressed on all the other listed features with an
    ordinary least-squares model; tolerance is ``1 - R^2`` of that fit and
    VIF is ``1 / tolerance``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``features``.
    features : list of str
        Column names to examine.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name with columns ``'VIF'`` and ``'Tolerance'``.
        A feature that is perfectly explained by the others gets a VIF of
        ``inf`` (instead of raising ``ZeroDivisionError``); a feature with no
        other feature to be regressed on gets VIF 1 and tolerance 1.
    """
    vif, tolerance = {}, {}
    for feature in features:
        # every other feature acts as a predictor of the current one
        others = [f for f in features if f != feature]
        if others:
            X, y = df[others], df[feature]
            r2 = LinearRegression().fit(X, y).score(X, y)
        else:
            # nothing to regress on: the feature cannot be collinear with anything
            r2 = 0.0

        tolerance[feature] = 1 - r2
        # guard against a zero (or rounding-induced negative) tolerance
        vif[feature] = 1 / tolerance[feature] if tolerance[feature] > 0 else float('inf')
    return pd.DataFrame({'VIF': vif, 'Tolerance': tolerance})

In [None]:
# Table style rule for pandas Styler: paint a cell green while the mouse hovers over it.
cell_hover = {  # for row hover use 'tr:hover' instead of 'td:hover'
    'selector': 'td:hover',
    'props': [('background-color', 'green')]
}

In [None]:
# Correlation of every numeric column with Revenue, strongest first, shown with
# the hover style. Non-numeric columns such as the datetime `Date` are excluded
# explicitly so this also runs on pandas 2.x.
df.select_dtypes(include='number').corr()['Revenue'].sort_values(ascending=False).reset_index().style.set_table_styles([cell_hover])

In [None]:
# Names of up to 13 numeric columns ranked by correlation with Revenue,
# skipping the first entry (Revenue itself). Non-numeric columns are excluded
# explicitly so this also runs on pandas 2.x.
df.select_dtypes(include='number').corr()['Revenue'].sort_values(ascending=False).reset_index()['index'][1:14].values

Checking VIF (if input columns are highly related with each other)


In [None]:
# Features whose mutual collinearity is checked. Commented-out names are
# candidates that are excluded from this run.
features_chosen = [
    #  'Watch_hours',
    #  'Unique_viewers',
    #  'Views',
    #  'Like',
    #  'Dislikes',
    #  'Impressions',
    'Subs_accumulated',
    #  'Subscribers',
    'Comments',
    'Shares',
    #  'Videos',
    'Average_view_sec',
    #  'Average_views_per_viewer',
    'Average_viewed',
]

# VIF is never below 1, so the previous rule (VIF < 0.2) could not highlight
# anything; 0.2 is the tolerance cut-off, which corresponds to VIF = 1 / 0.2 = 5.
VIF_LIMIT = 5
# iloc[:, 1:] skips the first column of df (presumably Date — confirm).
vif_table = calculate_vif(df.iloc[:, 1:], features=features_chosen)
# Flag the features whose VIF exceeds the limit.
vif_table.style.apply(
    lambda col: ['background: red' if v > VIF_LIMIT else '' for v in col],
    subset=['VIF'])


VIF > 5 suggests that the feature is highly correlated with the other features (multicollinearity).


In [None]:
# Columns excluded from the modelling frame. Watch_hours, Unique_viewers, Views,
# Impressions and Subscribers are deliberately not listed, so they are kept
# along with every other column that is not named here.
cols_remove = [
    'Date',
    'Dislikes',
    'Comments',
    'Shares',
    'Videos',
    'Average_view_sec',
    'Average_views_per_viewer',
    'Average_viewed',
    'Videos_published',
    'Click_rate',
    'Subs_accumulated',
]

# drop() returns a new frame, so df itself is left untouched.
df_new = df.drop(columns=cols_remove)
print('New Data Shape: ', df_new.shape)
df_new.head(2)

In [None]:
# Correlation of each retained column with Revenue.
df_new.corr()['Revenue']

In [None]:
# One scatter panel per retained feature against Revenue.
sns.pairplot(df_new, x_vars=df_new.columns.drop('Revenue'), y_vars=['Revenue'])
# pairplot draws a grid of axes, so plt.title would label only the last panel;
# a figure-level title sits above the whole grid.
plt.suptitle('Correlations with Target', fontsize=15, y=1.05)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Ordered 80/20 split: the first 80% of rows train the models, the rest test them.
split_at = int(len(df_new) * 0.8)
train = df_new.iloc[:split_at]
test = df_new.iloc[split_at:]
train.shape, test.shape

In [None]:
# Separate the predictors from the target column for both partitions.
target = 'Revenue'

X_train, y_train = train.drop(columns=target), train[target]
X_test, y_test = test.drop(columns=target), test[target]

X_train.shape, y_train.shape

**F-statistic & P-Values**

In [None]:
from sklearn.feature_selection import f_regression

# Univariate F-test of each feature against the target.
f_stats, p_vals = f_regression(X_train, y_train)
pvalue_table = pd.DataFrame({'F_statistic': f_stats, 'p_values': p_vals},
                            index=X_train.columns)
# The 0.05 significance threshold only makes sense for p-values, so the
# highlight is restricted to that column rather than also testing F-statistics.
pvalue_table.style.apply(
    lambda col: ["background: green" if v < 0.05 else "" for v in col],
    subset=['p_values'])
     

**A higher F-statistic and a lower p-value are evidence against the null hypothesis that the predictor has no effect on the target.**

**BaseLine Model**

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

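**R^2 is at most 1, and a value of 1 indicates a perfect fit. On held-out data it can be negative when the model predicts worse than always predicting the mean of the target.**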

In [None]:
def evaluate(title, y, y_):
    """Print MAE, RMSE and R2 for predictions ``y_`` against true values ``y``.

    ``title`` is used in the header line of the report. Nothing is returned.
    """
    # all metrics are computed before anything is printed
    report = (
        ('MAE Score: $', mean_absolute_error(y, y_)),
        ('RMSE Score: $', mean_squared_error(y, y_) ** 0.5),
        ('R2 Score: ', r2_score(y, y_)),
    )
    print(f'*{title} Result*')
    print('=' * 50)
    for label, value in report:
        print(label, value)
    print()

# Model Selection


**Linear Regression**

In [None]:
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


In [None]:
# Baseline: standardise the features, then fit ordinary least squares on the raw target.
lin_steps = [StandardScaler(), LinearRegression(n_jobs=-1)]
pipe_lin = make_pipeline(*lin_steps)

pipe_lin.fit(X_train, y_train)
y_pred = pipe_lin.predict(X_test)
evaluate('Linear Regression', y_test, y_pred)

**Log Transformed Linear Regression**

In [None]:
from sklearn.compose import TransformedTargetRegressor

In [None]:
# Same linear pipeline, but trained on log1p(target); predictions are mapped
# back with expm1.
tt_lin = TransformedTargetRegressor(
    regressor=pipe_lin,
    func=np.log1p,
    inverse_func=np.expm1,
)

tt_lin.fit(X_train, y_train)
y_pred = tt_lin.predict(X_test)
evaluate('Log Transformed Linear Regression', y_test, y_pred)

**Ridge**

In [None]:
# Ridge regression on the log1p target; alpha is chosen by 5-fold CV from a
# fine grid between 100 and 105.
ridge_alphas = np.arange(100, 105, 0.01)
pipe_ridge = make_pipeline(StandardScaler(), RidgeCV(alphas=ridge_alphas, cv=5))

tt_ridge = TransformedTargetRegressor(
    regressor=pipe_ridge,
    func=np.log1p,
    inverse_func=np.expm1,
)

tt_ridge.fit(X_train, y_train)
y_pred = tt_ridge.predict(X_test)
best_ridge_alpha = tt_ridge.regressor_.named_steps['ridgecv'].alpha_
print('Best alpha: ', best_ridge_alpha)
evaluate('Log Transformed Ridge', y_test, y_pred)

**Lasso**

In [None]:
# Lasso regression on the log1p target; alpha is chosen by 5-fold CV.
# The grid starts at 0.001 rather than 0: alpha = 0 is plain least squares and
# scikit-learn advises against it for Lasso for numerical reasons.
lasso_alphas = np.arange(0.001, 0.2, 0.001)
pipe_lasso = make_pipeline(
    StandardScaler(),
    LassoCV(alphas=lasso_alphas, cv=5, random_state=33)
)

tt_lasso = TransformedTargetRegressor(regressor=pipe_lasso,
                                func=np.log1p, inverse_func=np.expm1)

tt_lasso.fit(X_train, y_train)
y_pred = tt_lasso.predict(X_test)
print('Best alpha: ', tt_lasso.regressor_.named_steps['lassocv'].alpha_)
evaluate('Log Transformed Lasso', y_test, y_pred)

**ElasticNet**

In [None]:
from sklearn.linear_model import ElasticNetCV

In [None]:
# Elastic net on the log1p target, regularisation strength chosen by 5-fold CV.
elnet_model = ElasticNetCV(n_jobs=-1, cv=5, random_state=33)
pipe_elnet = make_pipeline(StandardScaler(), elnet_model)

tt_elnet = TransformedTargetRegressor(
    regressor=pipe_elnet,
    func=np.log1p,
    inverse_func=np.expm1,
)

tt_elnet.fit(X_train, y_train)
y_pred = tt_elnet.predict(X_test)
evaluate('Log Transformed ElasticNet', y_test, y_pred)

**RandomForest Regressor**

In [None]:
from sklearn.ensemble import RandomForestRegressor


In [None]:
# Random forest (300 trees, fixed seed) on the log1p target.
rf_params = dict(n_estimators=300, n_jobs=-1, random_state=33)
pipe_rfreg = make_pipeline(StandardScaler(), RandomForestRegressor(**rf_params))

tt_rfreg = TransformedTargetRegressor(
    regressor=pipe_rfreg,
    func=np.log1p,
    inverse_func=np.expm1,
)

tt_rfreg.fit(X_train, y_train)
y_pred = tt_rfreg.predict(X_test)
evaluate('Log Transformed RandomForest Regressor', y_test, y_pred)

**XGB Regressor**

In [None]:
from xgboost import XGBRegressor

In [None]:
# XGBoost regressor (500 rounds, learning rate 0.1, fixed seed) on the log1p target.
xgb_params = dict(learning_rate=0.1, n_estimators=500, n_jobs=-1, random_state=33)
pipe_xgb = make_pipeline(StandardScaler(), XGBRegressor(**xgb_params))

tt_xgb = TransformedTargetRegressor(
    regressor=pipe_xgb,
    func=np.log1p,
    inverse_func=np.expm1,
)

tt_xgb.fit(X_train, y_train)
y_pred = tt_xgb.predict(X_test)
evaluate('Log Transformed XGB Regressor', y_test, y_pred)

**Light Gradient Boost Machine Regressor**

In [None]:
from lightgbm import LGBMRegressor

In [None]:
# LightGBM regressor (500 rounds, learning rate 0.1, fixed seed) on the log1p target.
lgb_params = dict(learning_rate=0.1, n_estimators=500, n_jobs=-1, random_state=33)
pipe_lgb = make_pipeline(StandardScaler(), LGBMRegressor(**lgb_params))

tt_lgb = TransformedTargetRegressor(
    regressor=pipe_lgb,
    func=np.log1p,
    inverse_func=np.expm1,
)

tt_lgb.fit(X_train, y_train)
y_pred = tt_lgb.predict(X_test)
evaluate('Log Transformed Light GBM Regressor', y_test, y_pred)

**Model Combined**

 So far the best models are simple linear regression and Random Forest

In [None]:
# Weighted blend: 40% plain linear regression, 60% log-target random forest.
lin_pred = pipe_lin.predict(X_test)
rf_pred = tt_rfreg.predict(X_test)
y_pred = lin_pred * 0.4 + rf_pred * 0.6
evaluate('[Random forest + Simple linear regression]', y_test, y_pred)

 So far the best models are simple linear regression, Random Forest and XGB

In [None]:
# Weighted blend: 20% plain linear regression, 60% random forest, 20% XGBoost.
lin_pred = pipe_lin.predict(X_test)
rf_pred = tt_rfreg.predict(X_test)
xgb_pred = tt_xgb.predict(X_test)
y_pred = lin_pred * 0.2 + rf_pred * 0.6 + xgb_pred * 0.2
evaluate('[Random forest + Simple linear regression + XGB]', y_test, y_pred)

# Testing on sheet2

In [None]:
# Prediction data (sheet 2) from the Kaggle input directory.
sheet2_path = '/kaggle/input/youtube-revenue-prediction/sheet2.csv'
dfTest = pd.read_csv(sheet2_path)
dfTest.head()

In [None]:
# Rename sheet 2 with the same `col_map` that was defined for sheet 1 earlier
# in the notebook, instead of keeping a second copy of the 23-entry mapping
# that could drift out of sync. Keys absent from sheet 2 are simply ignored
# by rename().
dfTest = dfTest.rename(columns=col_map)
dfTest.head()


In [None]:
# Discard the row id and the same unused columns that were removed from sheet 1.
dfTest = dfTest.drop(columns=[
    'Engagement_score',
    'Impression_score',
    'Subscribers_lost',
    'Subscribers_gained',
    'Videos_added',
    'id',
    'Likes_vs_dislike',
    'Content_viewability',
])

In [None]:
# Only the last expression of a cell is displayed, so the shape is printed
# explicitly; previously it was evaluated and silently discarded.
print('Test Data Shape: ', dfTest.shape)
dfTest.head(2)

**EDA on sheet2**

In [None]:
# Peek at the sheet 2 columns still stored as generic objects (strings).
dfTest.select_dtypes(include='object').head(2)

In [None]:
# Columns to exclude from sheet 2. 'Videos' and 'Subs_accumulated' are not
# listed here, unlike in the sheet 1 list, because those derived columns were
# never added to dfTest.
cols_remove = [
    'Date',
    'Dislikes',
    'Comments',
    'Shares',
    'Average_view_sec',
    'Average_views_per_viewer',
    'Average_viewed',
    'Videos_published',
    'Click_rate',
]
dfTest = dfTest.drop(columns=cols_remove)
print('New Data Shape: ', dfTest.shape)
dfTest.head(2)

In [None]:
# Use all of sheet 1 (df_new) for training and the prepared sheet 2 frame as the prediction set.
train = df_new
test = dfTest
train.shape, test.shape

In [None]:
target = 'Revenue'

X_train = train.drop(target, axis=1)
y_train = train[target]
# Select exactly the training features, in training order. This drops any
# extra column sheet 2 may carry and fails here with a clear KeyError if a
# feature is missing, instead of failing later inside predict().
X_test = test[X_train.columns]

X_train.shape, y_train.shape, X_test.shape

**Training on the entire sheet 1**

In [None]:
# Refit the three models used in the blend on the current X_train / y_train.
pipe_lin.fit(X_train, y_train)
tt_rfreg.fit(X_train, y_train)
tt_xgb.fit(X_train, y_train)

**Prediction on sheet2**

In [None]:
# Blend the three models' predictions with weights 0.2 (linear), 0.6 (random forest) and 0.2 (XGB).
y_pred = pipe_lin.predict(X_test)*0.2 +tt_rfreg.predict(X_test)*0.6 + tt_xgb.predict(X_test)*0.2

In [None]:
# Blended revenue predictions for sheet 2.
y_pred