In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error
np.random.seed(123)

file_name = "data_new.csv"
df_data = pd.read_csv(file_name, sep=',')
df_data=df_data.drop(['title','tags'],axis=1)

df_data.rename(columns={'view_count': 'View'}, inplace=True)
df_data["View"] = df_data["View"].apply(lambda x: round(x/10000))
df_data = df_data[~(df_data['View']==0)]

df_data['trending_date']=[x[:7] for x in df_data['trending_date']]
df_data['publishedAt']=[x[:7] for x in df_data['publishedAt']]

print(df_data.shape)
df_data.head(2)

(139994, 10)


Unnamed: 0,publishedAt,channelTitle,categoryId,trending_date,View,likes,dislikes,comment_count,title_len,tag_num
0,2020-08,Brawadis,22,2020-08,151,156908,5855,35313,7,15
1,2020-08,Apex Legends,20,2020-08,238,146739,2794,16549,10,25


In [2]:
df_used = pd.get_dummies(df_data)
df_used.columns

Index(['categoryId', 'View', 'likes', 'dislikes', 'comment_count', 'title_len',
       'tag_num', 'publishedAt_2020-08', 'publishedAt_2020-09',
       'publishedAt_2020-10',
       ...
       'trending_date_2022-02', 'trending_date_2022-03',
       'trending_date_2022-04', 'trending_date_2022-05',
       'trending_date_2022-06', 'trending_date_2022-07',
       'trending_date_2022-08', 'trending_date_2022-09',
       'trending_date_2022-10', 'trending_date_2022-11'],
      dtype='object', length=5725)

In [3]:
df_X = df_used.drop(columns=['View'])
df_y = df_used[['View']]
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, 
                                                    test_size = 0.25,
                                                    random_state=123)

#Set baseline: use the median value as the prediction results
baseline = np.median(y_train)
baseline_mae = np.mean(abs(baseline - y_test))
baseline_rmse = np.sqrt(np.mean((baseline - y_test) ** 2))
print('Baseline on test data, MAE is %0.2f' %  baseline_mae)  
print('Baseline on test data, RMSE is %0.2f' %  baseline_rmse)   

Baseline on test data, MAE is 193.30
Baseline on test data, RMSE is 714.96


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [4]:
#Use linear regression model
feature_train = X_train[df_X.columns]

lr = LinearRegression()
lr.fit(feature_train, y_train)

feature_test = X_test[df_X.columns]
predictions = lr.predict(feature_test)
mae = np.mean(abs(predictions - y_test))
rmse = np.sqrt(np.mean((predictions - y_test) ** 2))
print('Using Linear Regression, MAE is %0.2f' %  mae)  
print('Using Linear Regression, RMSE is %0.2f' %  rmse) 

Using Linear Regression, MAE is 88.64
Using Linear Regression, RMSE is 269.98


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [None]:
#Use random forest regression model
from sklearn.ensemble import RandomForestRegressor
feature_train = X_train[df_X.columns]

regressor = RandomForestRegressor(n_estimators=200, random_state=0)
regressor.fit(feature_train, y_train)
feature_test = X_test[df_X.columns]
predictions = regressor.predict(feature_test)

mae = np.mean(abs(predictions - np.array(y_test)))
rmse = np.sqrt(np.mean((predictions - np.array(y_test)) ** 2))
print('Using RF Regression, MAE is %0.2f' %  mae)  
print('Using RF Regression, RMSE is %0.2f' %  rmse)

  regressor.fit(feature_train, y_train)


In [5]:
feature_train = X_train[['title_len']]

lr = LinearRegression()
lr.fit(feature_train, y_train)

feature_test = X_test[['title_len']]
predictions = lr.predict(feature_test)
mae = np.mean(abs(predictions - y_test))
rmse = np.sqrt(np.mean((predictions - y_test) ** 2))
print('Using Linear Regression, MAE is %0.2f' %  mae)  
print('Using Linear Regression, RMSE is %0.2f' %  rmse) 

Using Linear Regression, MAE is 240.52
Using Linear Regression, RMSE is 699.60


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [7]:
feature_train = X_train[['tag_num']]

lr = LinearRegression()
lr.fit(feature_train, y_train)

feature_test = X_test[['tag_num']]
predictions = lr.predict(feature_test)
mae = np.mean(abs(predictions - y_test))
rmse = np.sqrt(np.mean((predictions - y_test) ** 2))
print('Using Linear Regression, MAE is %0.2f' %  mae)  
print('Using Linear Regression, RMSE is %0.2f' %  rmse) 

Using Linear Regression, MAE is 240.68
Using Linear Regression, RMSE is 699.62


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [10]:
feature_train = X_train[['categoryId']]

lr = LinearRegression()
lr.fit(feature_train, y_train)

feature_test = X_test[['categoryId']]
predictions = lr.predict(feature_test)
mae = np.mean(abs(predictions - y_test))
rmse = np.sqrt(np.mean((predictions - y_test) ** 2))
print('Using Linear Regression, MAE is %0.2f' %  mae)  
print('Using Linear Regression, RMSE is %0.2f' %  rmse) 

Using Linear Regression, MAE is 239.59
Using Linear Regression, RMSE is 697.12


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [11]:
feature_train = X_train[['comment_count']]

lr = LinearRegression()
lr.fit(feature_train, y_train)

feature_test = X_test[['comment_count']]
predictions = lr.predict(feature_test)
mae = np.mean(abs(predictions - y_test))
rmse = np.sqrt(np.mean((predictions - y_test) ** 2))
print('Using Linear Regression, MAE is %0.2f' %  mae)  
print('Using Linear Regression, RMSE is %0.2f' %  rmse) 

Using Linear Regression, MAE is 190.80
Using Linear Regression, RMSE is 474.78


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [12]:
feature_train = X_train[['likes']]

lr = LinearRegression()
lr.fit(feature_train, y_train)

feature_test = X_test[['likes']]
predictions = lr.predict(feature_test)
mae = np.mean(abs(predictions - y_test))
rmse = np.sqrt(np.mean((predictions - y_test) ** 2))
print('Using Linear Regression, MAE is %0.2f' %  mae)  
print('Using Linear Regression, RMSE is %0.2f' %  rmse) 

Using Linear Regression, MAE is 125.03
Using Linear Regression, RMSE is 350.00


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [13]:
feature_train = X_train[['dislikes']]

lr = LinearRegression()
lr.fit(feature_train, y_train)

feature_test = X_test[['dislikes']]
predictions = lr.predict(feature_test)
mae = np.mean(abs(predictions - y_test))
rmse = np.sqrt(np.mean((predictions - y_test) ** 2))
print('Using Linear Regression, MAE is %0.2f' %  mae)  
print('Using Linear Regression, RMSE is %0.2f' %  rmse)

Using Linear Regression, MAE is 198.02
Using Linear Regression, RMSE is 613.01


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [26]:
#channelTitle
df_data2=df_data.drop(['trending_date','publishedAt'],axis=1)
df_used = pd.get_dummies(df_data2)

df_X2 = df_used2.drop(columns=['View'])
df_y2 = df_used2[['View']]
X_train2, X_test2, y_train2, y_test2 = train_test_split(df_X2, df_y2, 
                                                    test_size = 0.25,
                                                    random_state=123)
df_X2 = df_X.drop(columns=['title_len','tag_num','likes','dislikes','comment_count'])

feature_train = X_train[df_X2.columns]

lr = LinearRegression()
lr.fit(feature_train, y_train)

feature_test = X_test[df_X2.columns]
predictions = lr.predict(feature_test)
mae = np.mean(abs(predictions - y_test))
rmse = np.sqrt(np.mean((predictions - y_test) ** 2))
print('Using Linear Regression, MAE is %0.2f' %  mae)  
print('Using Linear Regression, RMSE is %0.2f' %  rmse) 

Using Linear Regression, MAE is 146.64
Using Linear Regression, RMSE is 549.76


In [21]:
#published_time
df_data2=df_data.drop(['trending_date','title_len','tag_num','likes','dislikes','comment_count','categoryId','channelTitle'],axis=1)
df_used2 = pd.get_dummies(df_data2)

df_X2 = df_used2.drop(columns=['View'])
df_y2 = df_used2[['View']]
X_train2, X_test2, y_train2, y_test2 = train_test_split(df_X2, df_y2, 
                                                    test_size = 0.25,
                                                    random_state=123)
feature_train2 = X_train2[df_X2.columns]

lr = LinearRegression()
lr.fit(feature_train2, y_train2)

feature_test2 = X_test2[df_X2.columns]
predictions2 = lr.predict(feature_test2)
mae = np.mean(abs(predictions2 - y_test2))
rmse = np.sqrt(np.mean((predictions2 - y_test2) ** 2))
print('Using Linear Regression, MAE is %0.2f' %  mae)  
print('Using Linear Regression, RMSE is %0.2f' %  rmse) 

Using Linear Regression, MAE is 240.00
Using Linear Regression, RMSE is 698.95


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [22]:
#trending_date
df_data2=df_data.drop(['publishedAt','title_len','tag_num','likes','dislikes','comment_count','categoryId','channelTitle'],axis=1)
df_used2 = pd.get_dummies(df_data2)

df_X2 = df_used2.drop(columns=['View'])
df_y2 = df_used2[['View']]
X_train2, X_test2, y_train2, y_test2 = train_test_split(df_X2, df_y2, 
                                                    test_size = 0.25,
                                                    random_state=123)
feature_train2 = X_train2[df_X2.columns]

lr = LinearRegression()
lr.fit(feature_train2, y_train2)

feature_test2 = X_test2[df_X2.columns]
predictions2 = lr.predict(feature_test2)
mae = np.mean(abs(predictions2 - y_test2))
rmse = np.sqrt(np.mean((predictions2 - y_test2) ** 2))
print('Using Linear Regression, MAE is %0.2f' %  mae)  
print('Using Linear Regression, RMSE is %0.2f' %  rmse)

Using Linear Regression, MAE is 240.04
Using Linear Regression, RMSE is 699.18


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
