In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Libraries

In [None]:
# Install keras-tcn by itself (--no-dependencies) and silence pip's output (--quiet).
!pip install keras-tcn --no-dependencies --quiet

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import re
import nltk
import string
import time
import pickle
import tqdm
import emoji
import io

# Do not truncate long cell contents when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)

from wordcloud import WordCloud
from collections import Counter
from mlxtend.plotting import plot_confusion_matrix
from scikitplot.metrics import plot_roc_curve

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from keras.models import Model
from tensorflow.keras import layers
from keras.callbacks import EarlyStopping
from tcn import TCN

from IPython.display import Markdown

def bold(string):
    """Display `string` in the notebook output as bold Markdown text."""
    # Wrapping the text in double asterisks is Markdown's bold syntax.
    bold_markup = "".join(("**", string, "**"))
    display(Markdown(bold_markup))

import warnings
from sklearn.exceptions import ConvergenceWarning
# Silence the noisy warning categories, registered in the same order as before.
for ignored_category in (UserWarning, FutureWarning, ConvergenceWarning):
    warnings.filterwarnings("ignore", category=ignored_category)

# Load Data

In [None]:
# Read the news dataset from the Kaggle input directory and preview the first rows.
df = pd.read_csv("/kaggle/input/news-texts/news.csv")
df.head()

In [None]:
def df_stats(data):
    """Print a short report about a DataFrame.

    The report has five sections, each introduced by a bold banner padded
    with '#' to 50 characters: shape, column dtypes, missing values per
    column, number of duplicated rows, and the memory-usage figure taken
    from `DataFrame.info()`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to describe.
    """
    def banner(title):
        # Every section header is centred in a 50-character line of '#'.
        bold(title.center(50, "#"))

    banner(" SHAPE ")
    print(f"ROWS: {data.shape[0]}")
    print(f"COLS: {data.shape[1]}")

    banner(" TYPES ")
    print(data.dtypes)

    banner(" MISSING VALUES ")
    print(data.isnull().sum())

    banner(" DUPLICATED VALUES ")
    print(f"NUMBER OF DUPLICATED VALUES: {data.duplicated().sum()}")

    banner(" MEMORY USAGE ")
    # `info()` only writes text, so capture it and pick out the second-to-last
    # line (the last element after splitting is the empty string that follows
    # the final newline); the value sits after the colon.
    info_buffer = io.StringIO()
    data.info(buf=info_buffer)
    info_lines = info_buffer.getvalue().split("\n")
    memory_line = info_lines[-2]
    info = memory_line.split(":")[1].strip()
    print(f"Memory Usage: {info}")

In [None]:
# Print the summary report for the freshly loaded frame.
df_stats(df)

In [None]:
# Keep only rows that have text, remove exact duplicate rows, then renumber the index.
df = (
    df.dropna(subset=["text"])
      .drop_duplicates()
      .reset_index(drop=True)
)

# EDA

In [None]:
# Share of each label as a pie chart, with percentages shown to one decimal place.
label_counts = df["label"].value_counts()
label_counts.plot.pie(autopct="%.1f%%")

In [None]:
# Tally the labels once and reuse the same counter for names and counts.
label_counter = Counter(df["label"])
tags = label_counter.keys()
tags_len = label_counter.values()
tag_df = pd.DataFrame(zip(tags, tags_len), columns=["Class", "Count"])

# One bar per class.
tag_df.plot(
    x="Class",
    y="Count",
    kind="bar",
    legend=False,
    grid=False,
    figsize=(15, 5),
    cmap='viridis',
)
plt.title("Class / Count", fontsize=18)
plt.xlabel("Class", fontsize=15)
plt.ylabel("Count", fontsize=15)
plt.show()

# Preprocess

In [None]:
# Text cleaning, done in one ordered pass.
#
# Order matters: HTML tags, URLs and e-mail addresses are recognised by their
# punctuation ('<', '>', ':', '/', '@'), so they must be removed BEFORE
# punctuation is stripped. Stripping punctuation first leaves the HTML and
# e-mail patterns with nothing to match.
_HTML_TAG_RE = re.compile(r'<.*?>')
_URL_RES = (re.compile(r'http\S+'), re.compile(r'www\S+'))
_EMAIL_RE = re.compile(r'\S*@\S*\s?')
_DIGITS_RE = re.compile(r'[0-9]+')
# ASCII punctuation plus the typographic quotes (opening and closing) that
# string.punctuation does not cover.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation + "’”‘“")


def clean_text(text):
    """Normalise one raw document for tokenisation.

    Steps, in order: lowercase; drop HTML tags; drop URLs (``http...`` and
    ``www...`` tokens); drop tokens containing ``@``; drop emoji; drop digit
    runs; drop punctuation and typographic quotes.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text.
    """
    text = text.lower()
    text = _HTML_TAG_RE.sub('', text)
    for url_re in _URL_RES:
        text = url_re.sub('', text)
    text = _EMAIL_RE.sub('', text)
    text = emoji.replace_emoji(text)
    text = _DIGITS_RE.sub('', text)
    return text.translate(_PUNCT_TABLE)


# Always derive the cleaned column from the raw 'text' column so that
# re-running this cell gives the same result.
df['Text Cleaned'] = df['text'].apply(clean_text)

In [None]:
# Load the SMART stop-word list: every whitespace-separated word on each line
# that does not start with '#'.
smart_words = "/kaggle/input/smartstoplists/SmartStoplist.txt"
smart_stoplist = []
# The context manager guarantees the file handle is closed, even on error.
with open(smart_words, "r") as stoplist_file:
    for line in stoplist_file:
        if line.strip()[0:1] != "#":
            smart_stoplist.extend(line.split())

In [None]:
# Remove stop words. Membership is tested against a set built once, instead of
# scanning the whole stop-word list for every token of every document.
smart_stopset = set(smart_stoplist)
df['Text Cleaned'] = df['Text Cleaned'].apply(
    lambda text: ' '.join(word for word in text.split() if word.lower() not in smart_stopset)
)

# N-Grams

In [None]:
def count_ngrams(corpus, ngram, n):
    """Return the `n` most frequent n-grams of a fixed size in `corpus`.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count n-grams in.
    ngram : int
        N-gram size; used as both the lower and upper bound of `ngram_range`.
    n : int
        Number of top entries to return.

    Returns
    -------
    list of (str, count) tuples
        Sorted by count, highest first.
    """
    vectorizer = CountVectorizer(ngram_range=(ngram, ngram)).fit(corpus)
    # Column sums of the document-term matrix: one total per vocabulary entry.
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = [(term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

In [None]:
def plot_ngrams(ngram_df, ngram_name):
    """Show a bar chart of n-gram counts.

    Parameters
    ----------
    ngram_df : pandas.DataFrame
        Must have a "Text" column (bar positions) and a "Count" column (bar heights).
    ngram_name : str
        Used for both the figure title and the x-axis label.
    """
    plt.figure(figsize=(12, 6))
    plt.bar(x="Text", height="Count", data=ngram_df)
    plt.title(ngram_name)
    plt.xlabel(ngram_name)
    plt.ylabel("Count")
    # Rotate the tick labels to vertical.
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# 30 most frequent single-word terms as (text, count) pairs, wrapped in a frame for plotting.
unigrams = count_ngrams(df["Text Cleaned"], 1, 30)
top_unigram = pd.DataFrame(unigrams, columns=['Text', "Count"])
top_unigram.head()

In [None]:
# Bar chart of the top unigrams.
plot_ngrams(top_unigram, "Unigrams")

In [None]:
# 30 most frequent two-word sequences as (text, count) pairs, wrapped in a frame for plotting.
bigrams = count_ngrams(df["Text Cleaned"], 2, 30)
top_bigram = pd.DataFrame(bigrams, columns=['Text', "Count"])
top_bigram.head()

In [None]:
# Bar chart of the top bigrams.
plot_ngrams(top_bigram, "Bigrams")

In [None]:
# 30 most frequent three-word sequences as (text, count) pairs, wrapped in a frame for plotting.
trigrams = count_ngrams(df["Text Cleaned"], 3, 30)
top_trigram = pd.DataFrame(trigrams, columns=['Text', "Count"])
top_trigram.head()

In [None]:
# Bar chart of the top trigrams.
plot_ngrams(top_trigram, "Trigrams")

# Train/Test Split, Tokenization and Embeddings

In [None]:
# Model input is the cleaned text; the target is the label column.
X = df["Text Cleaned"]
y = df["label"]

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
# NOTE(review): no `stratify=` is passed, so class proportions may differ between the two splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Encode the labels with LabelEncoder: fitted on the training labels only,
# then the same mapping is applied to the test labels.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [None]:
# Word-level tokenizer limited to the 10,000 most frequent words.
tokenizer = Tokenizer(num_words=10000,
    filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower = True,
    split = " ")
# Build the vocabulary from the training texts only, so that no information
# from the held-out test set leaks into preprocessing.
tokenizer.fit_on_texts(X_train)

In [None]:
# Word -> integer index mapping learned by the tokenizer; the cell displays its size.
word_index = tokenizer.word_index
len(word_index)

In [None]:
# Convert each document into its sequence of integer token ids.
X_train_tokenizer = tokenizer.texts_to_sequences(X_train)
X_test_tokenizer = tokenizer.texts_to_sequences(X_test)

In [None]:
# Sequence length cap: mean document length plus two standard deviations,
# measured over the train and test sequences together.
num_tokens = np.array([len(tokens) for tokens in X_train_tokenizer + X_test_tokenizer])
maxlen = int(num_tokens.mean() + (2 * num_tokens.std()))
print(maxlen)

In [None]:
# Bring every sequence to exactly `maxlen` ids, using pad_sequences' default
# padding/truncating settings.
X_train_tokenizer = pad_sequences(X_train_tokenizer, maxlen=maxlen)
X_test_tokenizer = pad_sequences(X_test_tokenizer, maxlen=maxlen)

In [None]:
# Number of embedding rows: one per word in the index plus one extra row
# (presumably index 0, which pad_sequences uses as filler — confirm).
input_dim = len(tokenizer.word_index) + 1
input_dim

In [None]:
# Load the pre-trained GloVe vectors into a word -> vector dictionary.
embedding_dim = 300
embeddings_index = {}
# The context manager closes the file even if parsing a line fails.
with open("/kaggle/input/glove840b300dtxt/glove.840B.300d.txt", "r", encoding="utf-8") as f:
    for line in tqdm.tqdm(f):
        values = line.rstrip().split(" ")
        # Some tokens in the 840B file contain spaces themselves, so the vector
        # is taken as the LAST `embedding_dim` fields and everything before it
        # is the word. Taking values[0] as the word would feed word fragments
        # to float() and raise ValueError.
        word = " ".join(values[:-embedding_dim])
        # float32 halves the memory needed for the vectors.
        coefs = np.asarray(values[-embedding_dim:], dtype="float32")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

In [None]:
# Build the embedding weight matrix: row i holds the pre-trained vector of the
# word with tokenizer index i. Words without a pre-trained vector keep an
# all-zero row.
embedding_matrix = np.zeros((input_dim, 300))
for word, i in tqdm.tqdm(tokenizer.word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

# Temporal Convolutional Network

In [None]:
# Classifier: frozen pre-trained embeddings -> bidirectional TCN -> softmax.
# One output unit per class known to the label encoder.
num_classes = len(le.classes_)

# The inputs are integer token ids, so declare an integer dtype.
inputs = layers.Input(shape=(None,), dtype="int32")
# The embedding output must be bound to `x`: the next layer consumes `x`, and
# binding it to a different name leaves `x` undefined (NameError).
x = layers.Embedding(input_dim=input_dim, output_dim=300, weights=[embedding_matrix], input_length=maxlen, trainable=False)(inputs)
x = layers.Bidirectional(TCN(64))(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)
model = Model(inputs, outputs)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# The sparse loss variant is used because the labels are integer-encoded
# (LabelEncoder output), not one-hot vectors.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer="rmsprop",
              metrics=["accuracy"])

In [None]:
# Render the model graph, showing layer names and tensor shapes.
tf.keras.utils.plot_model(model, show_layer_names=True, show_shapes=True)

# Train

In [None]:
# Stop training once validation accuracy has not improved for 3 epochs, and
# roll the model back to the weights of the best epoch. Without
# restore_best_weights the model keeps the weights of the last epoch run,
# i.e. from after validation accuracy stopped improving.
early_stopping = EarlyStopping(monitor="val_accuracy", patience=3, restore_best_weights=True)

In [None]:
# Train for at most 25 epochs, holding out 10% of the training data for
# validation; the early-stopping callback may end training sooner.
history = model.fit(X_train_tokenizer, y_train, epochs=25, batch_size=32, validation_split=0.1, callbacks=[early_stopping])

In [None]:
# Loss and accuracy on the held-out test set.
test_loss, test_acc = model.evaluate(X_test_tokenizer, y_test)
for metric_label, metric_value in (("Test Loss:", test_loss), ("Test Accuracy:", test_acc)):
    print(metric_label, metric_value)

# Results

In [None]:
# Predicted class = index of the highest-scoring output for each test sample.
class_probabilities = model.predict(X_test_tokenizer, verbose=0)
model_predictions = list(np.argmax(class_probabilities, axis=1))

In [None]:
# Weighted-average precision / F1 / recall, plus plain accuracy, on the test set.
weighted = {"average": "weighted"}
model_precision_score = precision_score(y_test, model_predictions, **weighted)
model_f1_score = f1_score(y_test, model_predictions, **weighted)
model_recall_score = recall_score(y_test, model_predictions, **weighted)
model_accuracy_score = accuracy_score(y_test, model_predictions)

# Report every score as a percentage with two decimals.
score_report = (
    ("Precision Score", model_precision_score),
    ("F1 Score", model_f1_score),
    ("Recall Score", model_recall_score),
    ("Accuracy Score", model_accuracy_score),
)
for score_name, score_value in score_report:
    print(f"{score_name} = {score_value * 100:.2f}%")

In [None]:
# Text report comparing the predictions with the true test labels.
print(classification_report(y_test, model_predictions))

In [None]:
# Confusion matrix of true vs. predicted classes, drawn with both absolute
# counts and normalised values; axis names come from the label encoder.
cm = confusion_matrix(y_test, model_predictions)
fig, ax = plot_confusion_matrix(conf_mat=cm, show_absolute=True, show_normed=True, colorbar=True, class_names=le.classes_)
plt.show()