In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectKBest

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings("ignore")

#Load the data
data = pd.read_csv("C:\\Users\\croma\\Dexlab python\\Atul_Projects\\OnlineNewsPopularity.csv")

#drop Url column
data.drop(['url'],axis=1,inplace=True)

#Feature Selection.
data_new=data.copy()
A=data_new.iloc[:,0:59]
B=data_new.iloc[:,-1]


bestfeature = SelectKBest(score_func=f_regression, k=10)
fit = bestfeature.fit(A,B)
dfscore= pd.DataFrame(fit.scores_)
dfcolumns= pd.DataFrame(A.columns)
featurescores = pd.concat([dfcolumns,dfscore],axis=1)
featurescores.columns =['features','score']
featurescores.nlargest(30,"score")

#make New dataset
df=data_new[["kw_avg_avg","LDA_03","kw_max_avg","LDA_02","self_reference_avg_sharess","self_reference_min_shares","data_channel_is_world",
      "self_reference_max_shares","num_hrefs","kw_avg_max","kw_min_avg","num_imgs","avg_negative_polarity","global_subjectivity",
        "kw_avg_min","kw_max_min","abs_title_sentiment_polarity","num_videos","average_token_length","title_subjectivity","num_keywords",
        "max_negative_polarity","min_negative_polarity","data_channel_is_entertainment","is_weekend","LDA_04","weekday_is_saturday",
        "data_channel_is_tech","rate_positive_words","title_sentiment_polarity","shares"]]

#Train_Test_Split
Y=df['shares']
X=df.drop("shares",axis=1)
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.20,random_state=42)

#Making the model with the Linear Regression.
pipelines = []
names = []
pipelines.append(('ScaledLR', Pipeline([('Scaler', StandardScaler()),('LR',
 LinearRegression())])))
for name, model in pipelines:
    kfold = KFold(n_splits=10, random_state=7)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring="neg_mean_squared_error")
    score=np.sqrt(-cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, score.mean(), score.std())
    print(msg)

ScaledLR: 10824.586082 (4393.181728)


tune model

In [9]:
from sklearn.model_selection import GridSearchCV
sc= StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
param_grid={'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False]}
model = LinearRegression()
kfold = KFold(n_splits=10, random_state=7)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring="neg_mean_squared_error", cv=kfold)
grid_result = grid.fit(X_train, Y_train)
print(grid_result.best_estimator_)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)


In [11]:
go=grid_result.predict(X_test)
root=mean_squared_error(Y_test,go)
root1=np.sqrt(root)
root1

10836.410654792713

In [16]:
Y_test


32340     2900
10480     1300
15370    17700
31592     1500
198       1400
         ...  
31669     2200
34677     1400
15785      528
23738     3200
38254     1400
Name: shares, Length: 7929, dtype: int64

In [13]:
from joblib import dump, load
dump(grid, 'filename.pkl') 

['filename.pkl']

In [18]:
grid = load('filename.pkl') 
grid.predict(X_test)

array([2677.22211721, 2965.74033953, 5963.61196052, ..., 2204.77499727,
       2648.16655918, 2754.89072964])