In [33]:
import pickle
import re
import string

import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models import Word2Vec
from imblearn.over_sampling import SMOTE
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, GlobalMaxPool1D, Dropout, GRU

In [18]:
import numpy as np
import pandas as pd
from ratelimit import limits, RateLimitException
from backoff import on_exception, expo
import requests
from newspaper import Article
from newspaper.article import ArticleException

##   Dataset imported from [Kaggle](https://www.kaggle.com/rmisra/news-category-dataset)

In [19]:
import pandas as pd

# Load the Kaggle news-category dump. `lines=True` tells pandas to parse the
# file as JSON Lines (one JSON record per line).
df = pd.read_json('News_Category_Dataset_v2.json', lines=True)

In [13]:
# NOTE(review): this empty DataFrame appears to be unused — `article_text` is
# rebound to a plain list by the scraping cell further down. Confirm and remove.
article_text = pd.DataFrame()

Updating saved json file


## Used [Newspaper3k](https://newspaper.readthedocs.io/en/latest/) to scrape each URL's full article text

### this step takes approximately 48 hours on one machine

In [15]:
import time

# Scrape the text, authors and keywords of every article URL in `df.link`.
# The three lists stay index-aligned with `df`: a failed download/parse
# contributes NaN to each list instead of being skipped.
article_text = []
article_authors = []
article_keywords = []
for x, url in enumerate(df.link, start=1):
    try:
        article = Article(url)
        article.download()
        article.parse()
        # Read every field before appending anything, so an exception can
        # never leave the three lists with different lengths.
        text = article.text.replace('\n', ' ')
        authors = article.authors
        # NOTE(review): newspaper seems to populate `keywords` only after
        # `article.nlp()` has been called, so this is presumably always
        # empty — confirm before relying on it.
        keywords = article.keywords
    except ArticleException:
        text = authors = keywords = np.nan
        print('nan found')
    article_text.append(text)
    article_authors.append(authors)
    article_keywords.append(keywords)
    # Throttle: pause two seconds after every odd-numbered article.
    # (The original wrote `x % 2 == True`, which only worked because
    # True == 1.)
    if x % 2 == 1:
        time.sleep(2)
        print(x, 'done')

1 done


KeyboardInterrupt: 

## assigning scraped text to new column in dataframe

In [22]:
# Attach the scraped text as a new column. `article_text` must hold exactly one
# entry per row of `df`; pandas raises a length-mismatch error otherwise
# (for example when the scraping loop was interrupted part-way).
df['full_text'] = article_text

NameError: name 'dffulltext' is not defined

## Keeping only the Target, Full Text, and Headline columns (for export to a .csv)

In [None]:
# Keep only the label column (`category`) and the two text columns; every other
# column is dropped from `df` from here on.
df = df[['category','full_text', 'headline']]

## Combining article categories to reduce number of classes to predict for

In [12]:
# Fold near-duplicate or closely related labels into broader classes.
# Keys are the labels found in the raw dataset, values the merged class names.
category_merges = {
    'ARTS': 'ARTS & CULTURE',
    'CULTURE & ARTS': 'ARTS & CULTURE',
    'THE WORLDPOST': 'WORLD NEWS',
    'WORLDPOST': 'WORLD NEWS',
    'STYLE': 'STYLE & BEAUTY',
    'TASTE': 'FOOD & DRINK',
    'WELLNESS': 'HEALTHY LIVING',
    'PARENTING': 'HOME & LIVING',
    'PARENTS': 'HOME & LIVING',
    'MONEY': 'BUSINESS',
    'COLLEGE': 'EDUCATION',
    'TECH': 'TECH & SCIENCE',
    'SCIENCE': 'TECH & SCIENCE',
    'ENVIRONMENT': 'TECH & SCIENCE',
    'GREEN': 'TECH & SCIENCE',
    'WEDDINGS': 'WEDDINGS & DIVORCE',
    'DIVORCE': 'WEDDINGS & DIVORCE',
    'WOMEN': 'DIVERSE VOICES',
    'BLACK VOICES': 'DIVERSE VOICES',
    'QUEER VOICES': 'DIVERSE VOICES',
    'LATINO VOICES': 'DIVERSE VOICES',
    'COMEDY': 'ENTERTAINMENT',
}
# Labels that are removed from the dataset entirely rather than merged.
dropped_categories = ['WEIRD NEWS', 'IMPACT', 'GOOD NEWS', 'FIFTY']

df.category = df.category.replace(category_merges)
keep_mask = ~df.category.isin(dropped_categories)
df = df[keep_mask]

In [13]:
# Rows per class after merging; the counts show how imbalanced the targets are.
df.category.value_counts()

HEALTHY LIVING        20049
POLITICS              17239
HOME & LIVING         13413
ENTERTAINMENT         11985
STYLE & BEAUTY        10050
DIVERSE VOICES         8691
TRAVEL                 8354
FOOD & DRINK           6983
WEDDINGS & DIVORCE     5852
BUSINESS               4770
WORLD NEWS             4761
TECH & SCIENCE         4723
ARTS & CULTURE         2390
SPORTS                 2201
CRIME                  1645
MEDIA                  1429
RELIGION               1281
EDUCATION              1089
Name: category, dtype: int64

## Identifying text containing Twitter picture links and URLs to remove from the corpus

In [None]:
# Collect the texts that look like link residue; they are appended to the stop
# list in the next cell.
# NOTE(review): `df.combined` is not created in any cell above — presumably it is
# headline + full_text built in a cell that was removed. Confirm.
# Texts that begin with "pictwitter".
pictwitter = df.combined.loc[df.combined.str.startswith("pictwitter")]
# Texts that contain "http" anywhere.
http = df.combined.loc[df.combined.str.contains("http")]
# Texts that begin with "www"; the `== True` also turns any NaN in the mask into False.
www = df.combined.loc[df.combined.str.startswith("www") == True]

In [None]:
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm

# The stop-word corpus is read right below, so make sure it is imported and
# downloaded here; the original cell relied on a later cell having run first.
nltk.download('stopwords', quiet=True)

# Stop list = NLTK English stop words + every text flagged as link residue
# (texts starting with "pictwitter"/"www" or containing "http").
stop = stopwords.words('english') + list(pictwitter) + list(http) + list(www)
# Built once at definition time: the pattern and translation tables do not
# depend on the input text.
# The pattern must be a raw string — in a plain literal "\b" is the backspace
# character, so the original pattern could never match an anchor tag.
_ANCHOR_TAG_RE = re.compile(r'<a\b[^>]*>(.*?)</a>')
# Deletes all ASCII punctuation except '.', plus a few typographic marks.
# '.' is kept, presumably so the '.com' filter below can still spot domains.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation.replace('.', '') + '―“”’')
_DIGIT_TABLE = str.maketrans('', '', string.digits)


def clean_history(history, stop_words=None, lemmatizer=None):
    """Normalise one document for vectorisation.

    Steps: remove HTML anchor elements, delete punctuation (except '.') and
    digits, split on whitespace, drop stop words and any token containing
    '.com', then lower-case and lemmatise what remains.

    Parameters
    ----------
    history : str
        Raw document text.
    stop_words : iterable of str, optional
        Lower-case tokens to drop. Defaults to the notebook-level `stop`.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to the notebook-level `wn`.
        NOTE(review): `wn` is not defined in the visible cells — presumably an
        ``nltk.stem.WordNetLemmatizer`` instance; confirm.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = stop
    if lemmatizer is None:
        lemmatizer = wn
    # Set membership is O(1); the default stop list can hold thousands of
    # entries, which made the original per-word list scan very slow.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = set(stop_words)

    history = _ANCHOR_TAG_RE.sub('', history)
    history = history.translate(_PUNCT_TABLE).translate(_DIGIT_TABLE)
    return " ".join(
        lemmatizer.lemmatize(word.lower())
        for word in history.split()
        if word.lower() not in stop_words and '.com' not in word
    )

# Clean every document in a single pass with a progress bar.
# Assigning a list fills the column positionally, so this avoids the per-row
# label lookups (`df.combined[idx]` / `df.at[...]`) of the original loop, which
# were slow and would break on duplicate index labels.
df['cleaned'] = [clean_history(text) for text in tqdm(df.combined, total=len(df))]

## Replacing combined column with cleaned column

In [None]:
# The raw `combined` text is no longer needed once `cleaned` exists.
df = df.drop('combined', axis=1)

## Generating Baseline RFC

In [None]:
# Target: the merged category label of each article.
y = df.category
# Features: the cleaned text of each article.
X = df.cleaned


## TFIDF vectorizer for Random Forest

In [6]:
def tfidf(X, y, stop, test_size=None, random_state=42):
    """Split the corpus and TF-IDF-vectorise it.

    The vectoriser is fitted on the training portion only and then applied
    to the test portion, so no test-set vocabulary or document frequencies
    leak into training.

    Parameters
    ----------
    X : iterable of str
        Documents.
    y : array-like
        Labels aligned with `X`.
    stop : list of str
        Stop words passed to ``TfidfVectorizer(stop_words=...)``.
    test_size : float or int, optional
        Forwarded to ``train_test_split``; ``None`` keeps its default split.
    random_state : int, default 42
        Seed for the shuffle performed by ``train_test_split``.

    Returns
    -------
    tuple
        ``(tf_idf_train, tf_idf_test, y_train, y_test, vectorizer)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    vectorizer = TfidfVectorizer(stop_words=stop)
    tf_idf_train = vectorizer.fit_transform(X_train)
    tf_idf_test = vectorizer.transform(X_test)
    return tf_idf_train, tf_idf_test, y_train, y_test, vectorizer

## Calling vectorizer and removing stopwords

In [7]:
import nltk
# Fetch the NLTK stop-word corpus; the log below shows it is a no-op when the
# package is already up to date.
nltk.download('stopwords')
from nltk.corpus import stopwords
# NOTE: this rebinds `stop` to the plain English stop-word list, discarding the
# link-residue entries that were appended to it in the cleaning section.
stop = stopwords.words('english')
# Split the data, fit TF-IDF on the training part and transform both parts.
idf_train, idf_test, y_tr, y_test, vectorizer = tfidf(X, y, stop)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/johnsimmons/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## SMOTE using 3 nearest neighbors to address class imbalance

In [53]:
# SMOTE oversampler configured with 3 nearest neighbours and a fixed seed.
sm = SMOTE(k_neighbors = 3, random_state=42)
# Resample the training split only; the test split (idf_test / y_test) is left
# untouched so it keeps its original class distribution.
X_tr_res, y_tr_res = sm.fit_resample(idf_train, y_tr)


## Random Forest Classifier with 50 estimators, 3k max leaf nodes, 30 sample minimum, and a max depth of 1000

In [54]:
# Seeded with the same value as the train/test split and SMOTE so the forest
# gives the same scores on every run.
rfc = RandomForestClassifier(class_weight='balanced', n_estimators=50, max_leaf_nodes=3000, min_samples_split=30, max_depth=1000, n_jobs=-1, random_state=42)


In [55]:
def classify_text(classifier, tf_idf_train, tf_idf_test, y_train):
    """Fit `classifier` on the training data and predict both splits.

    `classifier` only needs ``fit(X, y)`` and ``predict(X)`` methods.
    Returns the tuple ``(train_preds, test_preds)``; the training split is
    predicted first, then the test split.
    """
    classifier.fit(tf_idf_train, y_train)
    splits = (tf_idf_train, tf_idf_test)
    return tuple(classifier.predict(features) for features in splits)

def score_preds(y_test,y_train,test_preds, train_preds):
    """Print the accuracy on the training split, then on the test split.

    Note the argument order: the test labels/predictions come before the
    training ones in each pair. Returns None.
    """
    report = (
        ("Train Acc: ", y_train, train_preds),
        ("Test Acc: ", y_test, test_preds),
    )
    for label, truth, predicted in report:
        print(label, accuracy_score(truth, predicted))

In [56]:
# Fit the random forest on the SMOTE-resampled training matrix and predict both
# the resampled training data and the untouched test matrix.
rf_train_preds, rf_test_preds = classify_text(rfc, X_tr_res, idf_test, y_tr_res)

# Training accuracy is measured against the resampled labels (y_tr_res), test
# accuracy against the original test labels.
score_preds(y_test, y_tr_res, rf_test_preds, rf_train_preds)

Train Acc:  0.8804823348014371
Test Acc:  0.687458631449554


## Using Keras 

### Label Encoding the target classes

In [15]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Map every category name to an integer id; `encoder.classes_` keeps the
# id -> name mapping for decoding predictions later.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(df['category'])
# One-hot encode the ids, giving one column per class.
# `keras.utils.np_utils` is no longer available in recent Keras releases;
# `tensorflow.keras.utils.to_categorical` is the supported entry point.
dummy_y = to_categorical(encoded_Y)

In [16]:
# Train/validation split for the neural model: cleaned text as input, one-hot
# labels as target. Uses the same seed (42) as the TF-IDF split.
rnn_X_train, rnn_X_test, rnn_y_train, rnn_y_test = train_test_split(df['cleaned'],
                                                                   dummy_y, random_state=42)

## Tokenizing each article's text and padding each article to 1000 words

In [17]:
articles = rnn_X_train.to_list()
val_articles = rnn_X_test.to_list()
target = rnn_y_train
val_target = rnn_y_test

# Vocabulary cap: only the most frequent words get a token id in the
# sequences; rarer words are dropped.
top_words = 1500
tokenizer = Tokenizer(num_words=top_words)

# Build the vocabulary from the training articles only, so nothing leaks from
# the validation split.
tokenizer.fit_on_texts(articles)

# Turn every article into a sequence of integer token ids.
article_seq = tokenizer.texts_to_sequences(articles)
val_seq = tokenizer.texts_to_sequences(val_articles)

# The network needs fixed-length input: sequences longer than max_len lose
# their tail (truncating='post'); shorter ones are padded.
max_len = 1000
article_seq = pad_sequences(article_seq, maxlen=max_len, truncating='post')
val_seq = pad_sequences(val_seq, maxlen=max_len, truncating='post')

# Word2Vec expects a list of token lists. Tokenise with the same rules the
# Keras Tokenizer applies (lower-casing, punctuation filtering) so the Word2Vec
# vocabulary lines up with `tokenizer.word_index`. A plain `split(' ')` kept
# tokens such as "word." that the Tokenizer never produces.
documents = [text_to_word_sequence(doc) for doc in articles]
    

## Using gensim Word2Vec to translate each word of each article into a 128-dimensional vector

### This step takes approximately an hour to run

In [18]:
# Train a gensim Word2Vec model on the tokenised training articles; every
# vocabulary word is represented by a vector of `embedding_size` numbers.

embedding_size = 128
window_size = 15
w2v = Word2Vec(sentences=documents, # input: list of token lists, one per article
               min_count=3 # a word must appear at least 3 times to be trained
#                , workers=3 # (disabled) CPU cores to be used for training the model
               , vector_size=embedding_size # length of each word vector
               , window=window_size # how many neighbours on either side of a word are used for learning
               , epochs=20 # number of passes over the documents
              )

## Creating embedding matrix from vectors to determine weights used in embedding layer of model

In [19]:
w2v.wv.vectors.shape # (vocabulary size, embedding_size); not displayed, since it is not the cell's last expression
# Weight matrix for the Keras Embedding layer: row i holds the Word2Vec vector
# of the word whose Tokenizer id is i. Rows for ids without a trained vector
# (including row 0) stay all-zero.
embedding_matrix = np.zeros((top_words + 1, embedding_size))
trained_vocab = w2v.wv.key_to_index
for word, i in tokenizer.word_index.items():
    # Only ids up to top_words have a row in the matrix.
    if i <= top_words and word in trained_vocab:
        embedding_matrix[i] = w2v.wv[word]

## Creating multi-layer sequential model with bidirectional, dense, and dropout layers

### this step takes approximately 3 hours to run

In [43]:
from tensorflow.keras.metrics import FalseNegatives
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras import Sequential

# One output unit per class, taken from the one-hot targets instead of a
# hard-coded 18 so the model follows any change to the category merging.
n_classes = rnn_y_train.shape[1]

model = Sequential()
# Embedding layer initialised with the Word2Vec matrix and frozen
# (trainable=False), so the pre-trained vectors are not updated during training.
model.add(Embedding(top_words + 1, embedding_size, input_length=max_len, weights=[embedding_matrix], trainable=False))
# return_sequences=False: only the final state of each direction is passed on.
model.add(Bidirectional(LSTM(128, return_sequences=False)))

model.add(Dropout(0.3))
# Hidden layers use ReLU. The original used softmax here, which forces the 32
# activations to sum to 1 and squeezes the signal before it reaches the
# classifier head; softmax belongs on the output layer only.
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(64, activation='relu'))

# Output layer: probability distribution over the classes.
model.add(Dense(n_classes, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

model.fit(article_seq, rnn_y_train, validation_data=(val_seq, rnn_y_test), epochs=5, batch_size=128)

Epoch 1/5
 12/744 [..............................] - ETA: 26:37 - loss: 2.8396 - categorical_accuracy: 0.0970

KeyboardInterrupt: 

## Plots of the model's loss and accuracy over its epochs

In [None]:
import matplotlib.pyplot as plt
def plot_results(model):
    """Plot training vs. validation loss and accuracy per epoch, side by side.

    Parameters
    ----------
    model : fitted Keras model
        Its ``history`` attribute must expose ``epoch`` and a ``history``
        dict with the keys 'loss', 'val_loss', 'categorical_accuracy' and
        'val_categorical_accuracy'.

    Returns
    -------
    None
        The figure is drawn on the current matplotlib backend.
    """
    history = model.history
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    panels = (
        (ax1, 'loss', 'Loss'),
        (ax2, 'categorical_accuracy', 'Categorical accuracy'),
    )
    for ax, metric, title in panels:
        ax.plot(history.epoch, history.history[metric], label='train')
        # The validation series is stored under the same name with a 'val_' prefix.
        ax.plot(history.epoch, history.history['val_' + metric], label='test')
        # Title and axis labels so each panel can be read on its own.
        ax.set(title=title, xlabel='epoch', ylabel=metric)
        ax.legend()
    
    
# Draw the learning curves recorded during the last `model.fit` run.
plot_results(model)

In [112]:
# Number of padded training sequences (one per training article).
len(article_seq)

95178

## Confusion matrix of validation data

In [None]:
from sklearn.metrics import plot_confusion_matrix
from sklearn.base import ClassifierMixin

class SKWrapper (ClassifierMixin):
    """Minimal scikit-learn-style adapter around a fitted Keras classifier.

    It exposes just what the confusion-matrix plotting helper uses: a
    `classes_` attribute and a `predict` method returning integer class ids.

    Parameters
    ----------
    model : fitted Keras model
        Its ``predict`` must return one row of class probabilities per sample.
    n_classes : int, default 18
        Number of target classes; ids are ``0 .. n_classes - 1``.
    """
    def __init__(self, model, n_classes=18):
        self.model = model
        # Integer class ids, in the order of the model's output units.
        self.classes_ = list(range(n_classes))
    def predict(self, X):
        """Return the most probable class id for every row of `X`."""
        # `Sequential.predict_classes` no longer exists in current TensorFlow;
        # arg-max over the predicted probabilities is the equivalent.
        return np.argmax(self.model.predict(X), axis=-1)
sk_model = SKWrapper(model)
fig, ax = plt.subplots(figsize=(20,20))
# Validation inputs must be scored against the validation labels (rnn_y_test).
# The original paired val_seq with rnn_y_train, comparing predictions with the
# labels of unrelated training articles.
plot_confusion_matrix(sk_model, val_seq[:1000], np.argmax(rnn_y_test[:1000], axis=1), ax=ax)

## Confusion matrix of training data

In [None]:
from sklearn.metrics import plot_confusion_matrix
from sklearn.base import ClassifierMixin

class SKWrapper (ClassifierMixin):
    """Minimal scikit-learn-style adapter around a fitted Keras classifier.

    It exposes just what the confusion-matrix plotting helper uses: a
    `classes_` attribute and a `predict` method returning integer class ids.

    Parameters
    ----------
    model : fitted Keras model
        Its ``predict`` must return one row of class probabilities per sample.
    n_classes : int, default 18
        Number of target classes; ids are ``0 .. n_classes - 1``.
    """
    def __init__(self, model, n_classes=18):
        self.model = model
        # Integer class ids, in the order of the model's output units.
        self.classes_ = list(range(n_classes))
    def predict(self, X):
        """Return the most probable class id for every row of `X`."""
        # `Sequential.predict_classes` no longer exists in current TensorFlow;
        # arg-max over the predicted probabilities is the equivalent.
        return np.argmax(self.model.predict(X), axis=-1)
# Wrap the Keras model so the scikit-learn plotting helper can call it.
sk_model = SKWrapper(model)
fig, ax = plt.subplots(figsize=(20,20))
# Confusion matrix on the first 1000 training sequences; the one-hot training
# labels are converted back to integer class ids with argmax.
plot_confusion_matrix(sk_model, article_seq[:1000], np.argmax(rnn_y_train[:1000], axis=1), ax=ax)

In [127]:
# Integer class id of every training article, recovered from the one-hot labels.
np.argmax(rnn_y_train, axis=1)

array([14,  8,  4, ...,  2,  4,  9])