## Import Dependencies and the Data 

In [None]:
# Seed NumPy's global random generator so NumPy-driven randomness is repeatable
# NOTE(review): only NumPy is seeded here; TensorFlow presumably keeps its own
# random state, so the Keras training runs below may still vary — confirm
from numpy.random import seed
seed(11)

In [None]:
# import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Display the version string exposed by tensorflow.keras (shown as cell output)
import tensorflow
tensorflow.keras.__version__

In [None]:
# bring in the data
df = pd.read_csv('../Data/threecleaneddata.csv')
# Remove pandas' cap on displayed columns so previews show every column
pd.set_option('display.max_columns', None)
df.head()

## Data Pre-Processing

In [None]:
# Split the frame into the feature matrix and the label column.
# 'Category' is the label; the other listed columns are kept out of the inputs
# (NOTE(review): 'Shares' presumably underlies 'Category' — confirm).
non_feature_columns = ['Unnamed: 0', 'url', 'Category', 'Shares']
X = df.drop(columns=non_feature_columns)
y = df['Category']
print(X.shape, y.shape)
X.head()

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical

In [None]:
# Hold out a test split; random_state is fixed so the same split is produced each run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Standardize the features: the scaler is fitted on the training rows only,
# and that same fitted transformation is then applied to the held-out rows.
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
print(X_train_scaled.shape, X_test_scaled.shape)

In [None]:
# Convert the labels with to_categorical for the categorical_crossentropy loss
# NOTE(review): each call infers its own width — assumes both splits contain
# the highest class label; confirm
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)
y_test_categorical

## Create a Deep Learning Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Create model and add layers: two 100-unit ELU hidden layers feeding a
# 3-unit softmax output.
# The input width is read from the scaled training matrix so the network
# always matches the current feature set instead of a hard-coded column count.
n_features = X_train_scaled.shape[1]
model = Sequential()
model.add(Dense(units=100, activation='elu', input_dim=n_features))
model.add(Dense(units=100, activation='elu'))
model.add(Dense(units=3, activation='softmax'))

In [None]:
# Compile the model (fitting happens in a later cell): Adam optimizer,
# categorical cross-entropy loss, and accuracy as the tracked metric
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Print the summary of the model built above
model.summary()

In [None]:
# Train on the scaled training features against the one-hot labels for 60 epochs
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=60,
    shuffle=True,
    verbose=2
)

In [None]:
# Score the trained model on the held-out split; evaluate returns the loss
# followed by the compiled accuracy metric
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

## Feature Importance

In [None]:
# Credit to https://romainlhardy.medium.com/estimating-feature-importance-the-easy-way-2ebe970c600c
# for this feature importance code
# Baseline: predicted class per test row and the resulting accuracy.
# Sequential.predict_classes is not available in newer TensorFlow releases, so
# the class index is taken as the argmax over the softmax outputs instead.
y_pred = np.argmax(model.predict(X_test_scaled), axis=-1)
acc = accuracy_score(y_test, y_pred)

In [None]:
def get_feature_importance(j, n, estimator=None, features=None, labels=None):
    """Estimate the permutation importance of feature column ``j``.

    The column is shuffled ``n`` times; the importance is the baseline
    accuracy minus the mean accuracy obtained with the shuffled column.

    Args:
        j: Index of the feature column to shuffle.
        n: Number of shuffles to average over (must be at least 1).
        estimator: Object whose ``predict(inputs, verbose=0)`` returns
            per-class scores, one row per sample. Defaults to the
            notebook-level ``model``.
        features: 2-D array of evaluation inputs. Defaults to the
            notebook-level ``X_test_scaled``.
        labels: Integer class labels aligned with ``features``. Defaults to
            the notebook-level ``y_test``.

    Returns:
        Baseline accuracy minus the average shuffled-column accuracy.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    # Fall back to the notebook-level objects when no overrides are supplied.
    estimator = model if estimator is None else estimator
    features = np.asarray(X_test_scaled if features is None else features)
    labels = np.asarray(y_test if labels is None else labels)

    def _accuracy(inputs):
        # predict_classes is gone from newer TensorFlow releases; the argmax
        # over the class scores yields the same class indices.
        predicted = np.argmax(estimator.predict(inputs, verbose=0), axis=-1)
        return float(np.mean(predicted == labels))

    # The baseline is computed from the same estimator and data used for the
    # shuffled scores, rather than from a separately maintained global.
    baseline = _accuracy(features)

    # One working copy is enough: only column j is overwritten each round.
    shuffled = features.copy()
    total = 0.0
    for _ in range(n):
        order = np.random.permutation(features.shape[0])
        shuffled[:, j] = features[order, j]
        total += _accuracy(shuffled)
    return baseline - total / n

In [None]:
# fair warning: This cell took about 25 minutes to run
# Score every feature column, averaging over 100 shuffles per column.
feature_count = X_test_scaled.shape[1]
f = [get_feature_importance(column, 100) for column in range(feature_count)]

# Plot one bar per feature column and save the chart before showing it
bar_positions = range(feature_count)
plt.figure(figsize=(16, 5))
plt.bar(bar_positions, f, color="r", alpha=0.7)
plt.xticks(ticks=range(X_test.shape[1]))
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("Feature importances")
plt.savefig('../graphics/NN_graphics/feature_importance.png')
plt.show()


In [None]:
# list of features and importance score
# Pairs each column name of X with the score at the same position in f
cols = list(X.columns)
feat_imp = list(zip(cols,f))
feat_imp

## Model with Top 40 Features

In [None]:
# Rebuild the feature matrix for the top-40 model: leave out the non-feature
# columns plus the 18 features excluded from this section.
non_feature_columns = ['Unnamed: 0', 'url', 'Category', 'Shares']
excluded_features = [
    'Unique_Word_Rate',
    'Non_Stop_Word_Rate',
    'Unique_Non_Stop_Word_Rate',
    'Number_of_Mashable_Links',
    'Images',
    'Lifestyle',
    'Business',
    'Best_Keyword_Min_Shares',
    'Avg_Keyword_Max_Shares',
    'Max_Ref_Mashable_Article_Shares',
    'Monday',
    'Tuesday',
    'Closeness_to_LDA_4',
    'Text_Subjectivity',
    'Global_Positive_Word_Rate',
    'Minimum_Positive_Polarity',
    'Minimum_Negative_Polarity',
    'Absolute_Polarity_Level',
]
X = df.drop(columns=non_feature_columns + excluded_features)

y = df['Category']
print(X.shape, y.shape)

In [None]:
# Display the reduced feature table with no cap on the number of columns shown
pd.set_option('display.max_columns', None)
X

In [None]:
# Split the reduced feature set with the same fixed random_state as before
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Standardize the reduced features: the scaler is fitted on the training rows
# only, and that same fitted transformation is applied to the held-out rows.
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
print(X_train_scaled.shape, X_test_scaled.shape)

In [None]:
# Convert the labels with to_categorical for the categorical_crossentropy loss
# NOTE(review): each call infers its own width — assumes both splits contain
# the highest class label; confirm
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)
y_test_categorical

In [None]:
# Create model and add layers: two 100-unit ELU hidden layers feeding a
# 3-unit softmax output.
# The input width is read from the scaled training matrix so the network
# stays in step with the column list above instead of a hard-coded count.
n_features = X_train_scaled.shape[1]
model = Sequential()
model.add(Dense(units=100, activation='elu', input_dim=n_features))
model.add(Dense(units=100, activation='elu'))
model.add(Dense(units=3, activation='softmax'))

In [None]:
# Compile the model (fitting happens in the next cell): Adam optimizer,
# categorical cross-entropy loss, and accuracy as the tracked metric
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train the reduced-feature model on the scaled training data for 60 epochs
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=60,
    shuffle=True,
    verbose=2
)

In [None]:
# Score the reduced-feature model on the held-out split; evaluate returns the
# loss followed by the compiled accuracy metric
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

## Model with Top 6 Features

In [None]:
# Keep only the six selected feature columns; the label column is unchanged
X = df[['Title_Word_Count', 'Content_Word_Count', 'Number_of_Links', 'Entertainment', 'Social_Media', 'Avg_Keyword_Avg_Shares']]
y = df['Category']
print(X.shape, y.shape)

In [None]:
# Split the six-feature set with the same fixed random_state as before
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Standardize the six features: the scaler is fitted on the training rows
# only, and that same fitted transformation is applied to the held-out rows.
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
print(X_train_scaled.shape, X_test_scaled.shape)

In [None]:
# Convert the labels with to_categorical for the categorical_crossentropy loss
# NOTE(review): each call infers its own width — assumes both splits contain
# the highest class label; confirm
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)
y_test_categorical

In [None]:
# Create model and add layers: two 100-unit ELU hidden layers feeding a
# 3-unit softmax output.
# The input width is read from the scaled training matrix so the network
# stays in step with the selected columns instead of a hard-coded count.
n_features = X_train_scaled.shape[1]
model = Sequential()
model.add(Dense(units=100, activation='elu', input_dim=n_features))
model.add(Dense(units=100, activation='elu'))
model.add(Dense(units=3, activation='softmax'))

In [None]:
# Compile the model (fitting happens in the next cell): Adam optimizer,
# categorical cross-entropy loss, and accuracy as the tracked metric
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train the six-feature model on the scaled training data for 60 epochs
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=60,
    shuffle=True,
    verbose=2
)

In [None]:
# Score the six-feature model on the held-out split; evaluate returns the
# loss followed by the compiled accuracy metric
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

## Accuracy and Loss Curves

In [None]:
# Fit while keeping the returned history object, passing the test split as
# validation data so validation metrics are recorded alongside training ones
# NOTE(review): `model` was already fitted for 60 epochs in an earlier cell, so
# this looks like it continues training rather than starting from fresh
# weights — confirm that is what the curves are meant to show
history = model.fit(
    X_train_scaled,
    y_train_categorical,
    validation_data=(X_test_scaled, y_test_categorical),
    epochs=60,
    shuffle=True,
    verbose=2,
    batch_size = 20
)

In [None]:
# credit to https://vitalflux.com/python-keras-learning-validation-curve-classification-model/
# for the code to create these graphs
history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
# The accuracy metric is logged as 'accuracy'/'val_accuracy' by TF 2.x Keras
# but as 'acc'/'val_acc' by older releases; accept whichever is present
# instead of failing with a KeyError.
acc_key = 'accuracy' if 'accuracy' in history_dict else 'acc'
accuracy = history_dict[acc_key]
val_accuracy = history_dict['val_' + acc_key]

epochs = range(1, len(loss_values) + 1)
fig, ax = plt.subplots(1, 2, figsize=(14, 6))
#
# Plot the model accuracy vs Epochs
#
ax[0].plot(epochs, accuracy, 'b', label='Training')
ax[0].plot(epochs, val_accuracy, 'r', label='Testing')
ax[0].set_title('Training & Testing Accuracy', fontsize=16)
ax[0].set_xlabel('Epochs', fontsize=16)
ax[0].set_ylabel('Accuracy', fontsize=16)
ax[0].legend()
#
# Plot the loss vs Epochs
#
ax[1].plot(epochs, loss_values, 'b', label='Training')
ax[1].plot(epochs, val_loss_values, 'r', label='Testing')
ax[1].set_title('Training & Testing Loss', fontsize=16)
ax[1].set_xlabel('Epochs', fontsize=16)
ax[1].set_ylabel('Loss', fontsize=16)
ax[1].legend()
plt.savefig('../graphics/NN_graphics/accuracy_loss_curves.png')

In [None]:
from sklearn.metrics import classification_report

# Predict with the model currently in `model`; the y_pred left over from the
# feature-importance section belongs to the first (all-features) model.
y_pred = np.argmax(model.predict(X_test_scaled), axis=-1)
report = classification_report(y_test, y_pred,
                               target_names=["Unpopular", "Neutral", "Popular"])
print(report)

# The report is plain text, so it is drawn onto its own figure before saving;
# a bare plt.savefig here would write whatever figure happens to be current.
report_fig = plt.figure(figsize=(6, 2.5))
report_fig.text(0.01, 0.99, report, family='monospace', va='top')
report_fig.savefig('../graphics/NN_graphics/classification_report.png')