In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib


In [2]:
# Spreadsheet location (the path looks like a Colab Google Drive mount).
df_path = '/content/drive/MyDrive/cv/modified_news_share_data.xlsx'

# Load the workbook into a DataFrame; with no sheet specified pandas reads the first sheet.
df = pd.read_excel(io=df_path)


In [3]:
# Sanity-check the load: print (row count, column count), then preview the first five rows.
print((len(df.index), len(df.columns)))
df.head(n=5)

(1003, 29)


Unnamed: 0,article_id,title,text,published_date,unique_tokens_rate,num_hrefs,num_imgs,num_videos,average_token_length,num_keywords,...,title_subjectivity,title_sentiment_polarity,shares,article_length,published_day,text_sentiment,title_sentiment,text_length,month,day_of_week
0,7014291,It's Time for a YouTube Competitor,It was reported last week that Yahoo is gettin...,2014-03-31,0.959501,-0.442917,-0.257743,-0.052813,-0.034452,5,...,0.25,0.25,1600,-0.626876,Monday,0.121354,0.0,1037,3,0
1,7014327,The Ultimate Roundup of 2014's Biggest April F...,April Fools' marks the day when pranking goes ...,2014-04-01,0.191416,-0.508819,-0.493293,-0.298049,0.046973,7,...,0.0,0.0,1500,-0.835055,Tuesday,0.155208,0.0,504,4,1
2,7014304,A Selfie and One Garish Jacket: How the Red So...,The 2013 World Series champion Boston Red Sox ...,2014-04-01,0.424019,-0.245211,-0.493293,-0.298049,0.033781,7,...,0.0,0.0,1400,-0.058972,Tuesday,0.196855,0.0,2491,4,1
3,7031995,African Telecom Proposes More Diverse Emoticons,Less than a week after Apple pledged to make i...,2014-04-01,-0.008375,-0.508819,-0.493293,-0.298049,0.684209,9,...,0.0,0.0,1300,-0.484314,Tuesday,0.039671,0.25,1402,4,1
4,6541359,"Google to Launch Improved Android Camera App, ...",Android users may soon see a series of new fea...,2014-04-01,0.522163,-0.377015,0.448907,-0.298049,-0.381712,7,...,0.0,0.0,3000,-0.446818,Tuesday,0.103598,0.0,1498,4,1


In [4]:
# Dropping 'title' and 'text' columns
df = df.drop(['title', 'text', 'data_channel','published_day'], axis=1)

# Splitting the dataset into features and target variable
X = df.drop('shares', axis=1)
y = df['shares']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Make sure 'published_date' is a datetime column in both splits so the `.dt`
# accessor can be used on it afterwards.
for _frame in (X_train, X_test):
    _frame['published_date'] = pd.to_datetime(_frame['published_date'])


In [6]:
# DISABLED alternative feature engineering (kept for reference only).
# It would expand 'published_date' into year / month / day columns and then drop
# the raw date from both splits. Do not re-enable it as-is: the next cell still
# reads 'published_date' to derive 'day_of_week' and 'quarter'.
# X_train['year'] = X_train['published_date'].dt.year
# X_train['month'] = X_train['published_date'].dt.month
# X_train['day'] = X_train['published_date'].dt.day
# X_train = X_train.drop('published_date', axis=1)

# X_test['year'] = X_test['published_date'].dt.year
# X_test['month'] = X_test['published_date'].dt.month
# X_test['day'] = X_test['published_date'].dt.day
# X_test = X_test.drop('published_date', axis=1)


In [7]:
# Derive calendar features from the publication date for both splits.
# 'day_of_week' already exists in the loaded data (see the preview), so it is
# overwritten here; 'quarter' is added as a new column.
for _frame in (X_train, X_test):
    _published = _frame['published_date'].dt
    _frame['day_of_week'] = _published.dayofweek
    _frame['quarter'] = _published.quarter


In [8]:
# Collect the names of every training column whose dtype is not numeric; these
# are removed from both splits in the following cell.
_non_numeric_part = X_train.select_dtypes(exclude='number')
non_numeric_columns = _non_numeric_part.columns
print("Non-numeric columns:", non_numeric_columns)


Non-numeric columns: Index(['published_date'], dtype='object')


In [9]:
# Remove the non-numeric columns (detected on X_train) from both splits so that
# train and test keep the same feature set.
X_train, X_test = (
    frame.drop(columns=non_numeric_columns) for frame in (X_train, X_test)
)


**Linear Regression**

In [10]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Baseline: ordinary least-squares regression trained on the training split.
# fit() returns the estimator itself, so construction and training are chained.
linear_model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out test rows.
y_pred_linear = linear_model.predict(X_test)

# Test-set metrics. RMSE is the square root of MSE, i.e. expressed in the same
# units as the target (`shares`).
mse_linear = mean_squared_error(y_true=y_test, y_pred=y_pred_linear)
rmse_linear = np.sqrt(mse_linear)
r2_linear = r2_score(y_true=y_test, y_pred=y_pred_linear)

print(f"Linear Regression MSE: {mse_linear}")
print(f"Linear Regression RMSE: {rmse_linear}")
print(f"Linear Regression R2: {r2_linear}")


Linear Regression MSE: 2184262540.9190993
Linear Regression RMSE: 46736.09462630676
Linear Regression R2: -0.0038400935335030084


**Ridge Regression**

In [12]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Ridge (L2-penalised) linear regression with penalty strength alpha=1.0,
# trained on the training split; fit() returns the fitted estimator.
ridge_model = Ridge(alpha=1.0).fit(X_train, y_train)

# Predict on the held-out rows and score those predictions.
y_pred_ridge = ridge_model.predict(X_test)

r2_ridge = r2_score(y_true=y_test, y_pred=y_pred_ridge)
mse_ridge = mean_squared_error(y_true=y_test, y_pred=y_pred_ridge)
rmse_ridge = np.sqrt(mse_ridge)  # back in the units of the target

print(f"Ridge Regression MSE: {mse_ridge}")
print(f"Ridge Regression RMSE: {rmse_ridge}")
print(f"Ridge Regression R2: {r2_ridge}")


Ridge Regression MSE: 2181260394.8242106
Ridge Regression RMSE: 46703.965514977535
Ridge Regression R2: -0.0024603717462923225


**Lasso Regression**

In [15]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Lasso (L1-penalised) linear regression with penalty strength alpha=1.0.
lasso_model = Lasso(alpha=1.0)
lasso_model = lasso_model.fit(X_train, y_train)  # fit() hands back the same estimator

# Held-out predictions and their error metrics.
y_pred_lasso = lasso_model.predict(X_test)

mse_lasso = mean_squared_error(y_true=y_test, y_pred=y_pred_lasso)
r2_lasso = r2_score(y_true=y_test, y_pred=y_pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)  # root of MSE, same units as the target

print(f"Lasso Regression MSE: {mse_lasso}")
print(f"Lasso Regression RMSE: {rmse_lasso}")
print(f"Lasso Regression R2: {r2_lasso}")


Lasso Regression MSE: 2182637009.946576
Lasso Regression RMSE: 46718.700858934164
Lasso Regression R2: -0.0030930344538684373


**Ensemble (Voting Regressor: Ridge + Lasso)**

In [18]:
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Fresh, unfitted base estimators for the ensemble (same hyper-parameters as the
# standalone Ridge and Lasso cells).
ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=1.0)

# VotingRegressor trains its own copies of the base estimators in fit() and
# averages their predictions.
_base_estimators = [('ridge', ridge), ('lasso', lasso)]
voting_regressor = VotingRegressor(estimators=_base_estimators)
voting_regressor.fit(X_train, y_train)

# Held-out predictions and metrics.
# `np` is not imported in this cell; it comes from the `import numpy as np`
# executed in the earlier model cells.
y_pred_voting = voting_regressor.predict(X_test)

r2_voting = r2_score(y_true=y_test, y_pred=y_pred_voting)
mse_voting = mean_squared_error(y_true=y_test, y_pred=y_pred_voting)
rmse_voting = np.sqrt(mse_voting)

print(f"Voting Regressor MSE: {mse_voting}")
print(f"Voting Regressor RMSE: {rmse_voting}")
print(f"Voting Regressor R2: {r2_voting}")


Voting Regressor MSE: 2181908667.3477764
Voting Regressor RMSE: 46710.90522937632
Voting Regressor R2: -0.002758303857703037
