In [25]:
import numpy as np
import pandas as pd
import tensorflow as tf
from imblearn.over_sampling import SMOTE 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, GlobalMaxPool1D, Dropout
from gensim.models import Word2Vec
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pickle

In [26]:
# Load the dataset from its gzip-compressed CSV file into a DataFrame.
# The later cells read its `category`, `cleaned` and `combined` columns.
df = pd.read_csv('finaldf.csv.gzip', compression='gzip')

In [27]:
# Merge overlapping category labels into a smaller set of canonical labels.
# Each key is the label that is kept; the list holds the labels folded into it.
category_aliases = {
    'ARTS & CULTURE': ['ARTS', 'CULTURE & ARTS'],
    'WORLD NEWS': ['THE WORLDPOST', 'WORLDPOST'],
    'STYLE & BEAUTY': ['STYLE'],
    'FOOD & DRINK': ['TASTE'],
    'HEALTHY LIVING': ['WELLNESS'],
    'HOME & LIVING': ['PARENTING', 'PARENTS'],
    'BUSINESS': ['MONEY'],
    'EDUCATION': ['COLLEGE'],
    'TECH & SCIENCE': ['TECH', 'SCIENCE', 'ENVIRONMENT', 'GREEN'],
    'WEDDINGS & DIVORCE': ['WEDDINGS', 'DIVORCE'],
    'DIVERSE VOICES': ['WOMEN', 'BLACK VOICES', 'QUEER VOICES', 'LATINO VOICES'],
    'ENTERTAINMENT': ['COMEDY'],
}
# Flatten to {old_label: canonical_label}. No old label is also a canonical
# label, so one replace pass gives the same result as replacing step by step.
old_to_canonical = {
    old_label: canonical
    for canonical, old_labels in category_aliases.items()
    for old_label in old_labels
}
df.category = df.category.replace(old_to_canonical)

# Discard the rows belonging to categories that are excluded from the task.
excluded_categories = ['WEIRD NEWS', 'IMPACT', 'GOOD NEWS', 'FIFTY']
df = df[~df.category.isin(excluded_categories)]

# The `combined` column is not used by any later cell.
df = df.drop(columns='combined')

In [28]:
# Target labels (`category`) and input text (`cleaned`) as separate Series.
X, y = df['cleaned'], df['category']


In [29]:
from sklearn.preprocessing import LabelEncoder

# Encode the category strings as integer class ids.
# `encoder` is kept so ids can be mapped back to names via encoder.classes_.
encoder = LabelEncoder()
encoder.fit(df['category'])
encoded_Y = encoder.transform(df['category'])

# Convert the integer ids to one-hot rows (one column per class).
# Uses the tf.keras utility rather than `keras.utils.np_utils`: the model below
# is built with tensorflow.keras, and the standalone `np_utils` module is not
# importable in recent Keras releases.
dummy_y = tf.keras.utils.to_categorical(encoded_Y)

In [30]:
# Split the cleaned text and the one-hot labels into train and test parts.
# The fixed random_state makes the split reproducible between runs.
rnn_X_train, rnn_X_test, rnn_y_train, rnn_y_test = train_test_split(
    df['cleaned'],
    dummy_y,
    random_state=42,
)

In [31]:
# Load the pickled training input; it is passed to model.fit together with
# rnn_y_train further down.
# NOTE(review): presumably built from rnn_X_train using the same split order —
# verify against the code that produced the pickle.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('article_seq.pkl','rb') as f:
    article_seq = pickle.load(f)

In [32]:
# Load the pickled embedding matrix; it is used below as the fixed
# (trainable=False) weights of the model's Embedding layer.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('embedding_matrix.pkl','rb') as f:
    embedding_matrix = pickle.load(f)

In [33]:
# Load the pickled validation input; it is passed to model.fit as
# validation_data together with rnn_y_test further down.
# NOTE(review): presumably built from rnn_X_test using the same split order —
# verify against the code that produced the pickle.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('val_seq.pkl','rb') as f:
    val_seq = pickle.load(f)

In [None]:
from tensorflow.keras.metrics import FalseNegatives
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras import Sequential

# Take the layer sizes from the data instead of hard-coding them, so the model
# cannot silently drift out of sync with the loaded artifacts:
#   * the Embedding layer must match the shape of the pretrained matrix
#     (rows = top_words + 1, columns = embedding_size);
#   * the softmax layer needs one unit per one-hot label column.
pretrained_embeddings = np.asarray(embedding_matrix)
top_words = pretrained_embeddings.shape[0] - 1
embedding_size = pretrained_embeddings.shape[1]
max_len = 1000  # sequence length given to the Embedding layer as input_length
num_classes = rnn_y_train.shape[1]

model = Sequential()
# Pretrained embeddings, not updated during training.
model.add(Embedding(top_words + 1, embedding_size, input_length=max_len,
                    weights=[pretrained_embeddings], trainable=False))
# First recurrent layer returns the full sequence so a second one can follow.
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.25))
# Second recurrent layer returns only its final output.
model.add(Bidirectional(LSTM(64, return_sequences=False)))
model.add(Dense(128, activation='tanh'))
model.add(Dropout(0.25))
model.add(Dense(64, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

# The per-epoch metrics are read back later from model.history for plotting.
model.fit(article_seq, rnn_y_train, validation_data=(val_seq, rnn_y_test), epochs=5, batch_size=128)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5

In [None]:
# Serialize the model architecture to a text file.
# Model.to_yaml() raises RuntimeError in TensorFlow >= 2.6 (and ImportError on
# older versions when PyYAML is missing). Where YAML is unavailable, fall back
# to the JSON architecture format so the notebook still saves the model.
try:
    model_yaml = model.to_yaml()
    architecture_path = "model3.yaml"
except (RuntimeError, ImportError):
    model_yaml = model.to_json()
    architecture_path = "model3.json"
with open(architecture_path, "w") as yaml_file:
    yaml_file.write(model_yaml)
# serialize weights to HDF5
model.save_weights("model3.h5")
print("Saved model to disk")

In [None]:
import matplotlib.pyplot as plt
def plot_results(model):
    """Plot training and validation curves for loss and categorical accuracy.

    Draws two side-by-side panels from the values recorded on
    ``model.history``: the left panel shows 'loss' / 'val_loss', the right
    panel shows 'categorical_accuracy' / 'val_categorical_accuracy'.

    Args:
        model: object whose ``history`` attribute exposes ``epoch`` (x values)
            and a ``history`` dict holding the four series named above.

    Returns:
        None. The figure is drawn through the module-level ``plt``.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    run = model.history

    # (axis, metric key, panel title); the validation key is 'val_' + metric.
    panels = (
        (ax1, 'loss', 'Loss'),
        (ax2, 'categorical_accuracy', 'Categorical accuracy'),
    )
    for ax, metric, title in panels:
        ax.plot(run.epoch, run.history[metric], label='train')
        ax.plot(run.epoch, run.history['val_' + metric], label='test')
        # Label every panel so the figure can be read on its own.
        ax.set_title(title)
        ax.set_xlabel('epoch')
        ax.set_ylabel(metric)
        ax.legend()
    
    
# Draw the loss and accuracy curves recorded on the model fitted above.
plot_results(model)

In [None]:
# Number of entries in the training input.
# NOTE(review): should equal len(rnn_y_train) — confirm.
len(article_seq)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.base import ClassifierMixin

class SKWrapper (ClassifierMixin):
    """Minimal scikit-learn style adapter around a Keras classifier.

    Args:
        model: fitted Keras model whose ``predict`` returns one row of class
            scores per sample.
        n_classes: number of output classes; ``classes_`` becomes
            ``[0, ..., n_classes - 1]``. Defaults to 18, the value used
            throughout this notebook.
    """
    def __init__(self, model, n_classes=18):
        self.model = model
        self.classes_ = list(range(0, n_classes))
    def predict(self, X):
        """Return the highest-scoring class id for each row of ``X``."""
        # Sequential.predict_classes was removed in TensorFlow 2.6; taking the
        # argmax of the predicted scores is the documented replacement for a
        # softmax output.
        return np.argmax(self.model.predict(X), axis=-1)

sk_model = SKWrapper(model)
fig, ax = plt.subplots(figsize=(20,20))

# Confusion matrix on the first 1000 validation samples.
# val_seq is paired with rnn_y_test (see validation_data in model.fit), so the
# true labels must come from rnn_y_test, not from rnn_y_train.
val_true = np.argmax(rnn_y_test[:1000], axis=1)
val_pred = sk_model.predict(val_seq[:1000])
# Pass the labels explicitly so the matrix always has one row/column per class,
# even when a class does not occur in this slice.
val_cm = confusion_matrix(val_true, val_pred, labels=sk_model.classes_)
ax.set_title('Confusion matrix - validation data (first 1000 samples)')
# plot_confusion_matrix was removed in scikit-learn 1.2; ConfusionMatrixDisplay
# is available in every release that shipped plot_confusion_matrix.
ConfusionMatrixDisplay(confusion_matrix=val_cm, display_labels=sk_model.classes_).plot(ax=ax)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix on the first 1000 training samples.
# The SKWrapper class from the previous cell is not redefined here; the
# predictions are computed directly from the Keras model, so this cell has no
# dependency on the removed predict_classes / plot_confusion_matrix APIs.
train_class_ids = list(range(rnn_y_train.shape[1]))  # one id per one-hot column
fig, ax = plt.subplots(figsize=(20,20))
train_true = np.argmax(rnn_y_train[:1000], axis=1)
train_pred = np.argmax(model.predict(article_seq[:1000]), axis=-1)
# Pass the labels explicitly so the matrix always has one row/column per class,
# even when a class does not occur in this slice.
train_cm = confusion_matrix(train_true, train_pred, labels=train_class_ids)
ax.set_title('Confusion matrix - training data (first 1000 samples)')
ConfusionMatrixDisplay(confusion_matrix=train_cm, display_labels=train_class_ids).plot(ax=ax)

In [None]:
# Integer class id of every training label: the position of the maximum along
# axis 1 of the one-hot rows.
np.argmax(rnn_y_train, axis=1)