# Importing Dependencies

In [2]:
import numpy as np
import pandas as pd
import warnings
import copy
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings("ignore")

# Read in Dataset

In [3]:
# Load the Online News Popularity CSV from the notebook's working directory
# and print a per-column summary (non-null counts and dtypes).
# NOTE(review): the summary output shows column names with a leading space
# (e.g. " timedelta", " shares"); label-based lookups must include that space
# unless the names are stripped first.
df = pd.read_csv(filepath_or_buffer="./OnlineNewsPopularity.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39644 entries, 0 to 39643
Data columns (total 61 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   url                             39644 non-null  object 
 1    timedelta                      39644 non-null  float64
 2    n_tokens_title                 39644 non-null  float64
 3    n_tokens_content               39644 non-null  float64
 4    n_unique_tokens                39644 non-null  float64
 5    n_non_stop_words               39644 non-null  float64
 6    n_non_stop_unique_tokens       39644 non-null  float64
 7    num_hrefs                      39644 non-null  float64
 8    num_self_hrefs                 39644 non-null  float64
 9    num_imgs                       39644 non-null  float64
 10   num_videos                     39644 non-null  float64
 11   average_token_length           39644 non-null  float64
 12   num_keywords                   

# Preprocessing

### Split into data and target labels

In [4]:
# The target (shares) is the last column of the frame and every column before
# it is a candidate feature. Slicing relative to the end selects the same
# columns as the literal positions 60 and 0:60 on this 61-column dataset,
# without hard-coding the column count.
y = df.iloc[:, -1]
x = df.iloc[:, :-1]

### Independent Variables

In [5]:
# Summarize the feature frame (column names, non-null counts, dtypes) to confirm the split.
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39644 entries, 0 to 39643
Data columns (total 60 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   url                             39644 non-null  object 
 1    timedelta                      39644 non-null  float64
 2    n_tokens_title                 39644 non-null  float64
 3    n_tokens_content               39644 non-null  float64
 4    n_unique_tokens                39644 non-null  float64
 5    n_non_stop_words               39644 non-null  float64
 6    n_non_stop_unique_tokens       39644 non-null  float64
 7    num_hrefs                      39644 non-null  float64
 8    num_self_hrefs                 39644 non-null  float64
 9    num_imgs                       39644 non-null  float64
 10   num_videos                     39644 non-null  float64
 11   average_token_length           39644 non-null  float64
 12   num_keywords                   

### Dependent Variable

In [6]:
# Summarize the target series (name, non-null count, dtype).
y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 39644 entries, 0 to 39643
Series name:  shares
Non-Null Count  Dtype
--------------  -----
39644 non-null  int64
dtypes: int64(1)
memory usage: 309.8 KB


### Drop Unused Features

In [7]:
# Positional indices of the columns to discard: url (0), timedelta (1),
# n_unique_tokens (4), n_non_stop_words (5), n_non_stop_unique_tokens (6),
# plus the contiguous run 13..38 inclusive.
vals = [0, 1, 4, 5, 6, *range(13, 39)]

# x is rebound to the reduced frame, so this cell is meant to run once per
# kernel session (the positions above refer to the original 60-column layout).
x = x.drop(columns=x.columns[vals])
x.head()

Unnamed: 0,n_tokens_title,n_tokens_content,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,LDA_00,LDA_01,...,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity
0,12.0,219.0,4.0,2.0,1.0,0.0,4.680365,5.0,0.500331,0.378279,...,0.378636,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875
1,9.0,255.0,3.0,1.0,1.0,0.0,4.913725,4.0,0.799756,0.050047,...,0.286915,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0
2,9.0,211.0,3.0,1.0,1.0,0.0,4.393365,6.0,0.217792,0.033334,...,0.495833,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0
3,9.0,531.0,9.0,0.0,1.0,0.0,4.404896,7.0,0.028573,0.4193,...,0.385965,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0
4,13.0,1072.0,19.0,19.0,20.0,0.0,4.682836,7.0,0.028633,0.028794,...,0.411127,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364


### Shuffle Data

In [8]:
# Seed NumPy's global random generator so the shuffle below yields the same order on a fresh run.
np.random.seed(42)

In [9]:
# Swap the pandas containers for plain NumPy arrays; the cells that follow
# index rows positionally with an integer permutation and with slices.
x, y = x.to_numpy(), y.to_numpy()

In [11]:
# Draw one random ordering of the row positions from NumPy's global generator
# (permutation(n) shuffles arange(n)), then apply that same ordering to both
# arrays so every feature row stays aligned with its target value.
arr = np.random.permutation(len(df))
x, y = x[arr], y[arr]

### Splitting Data

In [12]:
# Carve three consecutive, non-overlapping 1000-row partitions out of the
# shuffled arrays: rows 0-999 train, 1000-1999 validation, 2000-2999 test.
# Each partition is copied so later in-place changes cannot reach x or y.
x_train, y_train = x[0:1000].copy(), y[0:1000].copy()
x_val, y_val = x[1000:2000].copy(), y[1000:2000].copy()
x_test, y_test = x[2000:3000].copy(), y[2000:3000].copy()

### Standardize Data

In [13]:
# Fit the scaler on the training split only, so no statistics from the
# validation or test rows influence the preprocessing.
scaler = StandardScaler()
scaler.fit(x_train)

# Keep the fitted statistics under their original names. They are read from
# the scaler so they always match what transform() applies: mean_ is the
# per-column training mean, and scale_ is the per-column training standard
# deviation with zero-variance columns replaced by 1.0. Copies are taken so
# editing these arrays cannot alter the fitted scaler.
training_mean = scaler.mean_.copy()
training_std = scaler.scale_.copy()

# Standardize all three splits with the same fitted transform. Dividing by a
# raw np.std would produce inf/NaN for any column that is constant within the
# training split, whereas the scaler leaves such a column centred at 0.
x_train = scaler.transform(x_train)
x_val = scaler.transform(x_val)
x_test = scaler.transform(x_test)