In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.api._v2.keras.layers import Input, Embedding, SimpleRNN, Dense, TextVectorization, GRU, Dropout
from keras.api._v2.keras.models import Sequential
from keras.api._v2.keras.losses import SparseCategoricalCrossentropy
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import string
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from load_data import load_l2_data


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
df = pd.read_csv('news-classification.csv', encoding='utf-8')
df.head()

Unnamed: 0,data_id,id,date,source,title,content,author,url,published,published_utc,collection_utc,category_level_1,category_level_2
0,1809,abcnews--2019-10-31--Virginia mom charged with...,2019-10-31,abcnews,Virginia mom charged with murder in 2-year-old...,The Virginia woman whose 2-year-old son was fo...,,https://abcnews.go.com/US/wireStory/virginia-m...,"Thu, 31 Oct 2019 16:49:56 -0400",1572554996,1572559512,"crime, law and justice",crime
1,1980,abcnews--2019-11-07--2 escaped murder suspects...,2019-11-07,abcnews,2 escaped murder suspects arrested at US-Mexic...,Authorities are trying to determine if anyone ...,,https://abcnews.go.com/US/wireStory/escaped-mu...,"Thu, 07 Nov 2019 00:13:12 -0500",1573103592,1573131986,"crime, law and justice",crime
2,1995,abcnews--2019-11-07--Family turns in escaped b...,2019-11-07,abcnews,"Family turns in escaped boy, 13, suspected in ...",A 13-year-old suspect in a double homicide who...,,https://abcnews.go.com/US/wireStory/family-tur...,"Thu, 07 Nov 2019 07:39:54 -0500",1573130394,1573131982,"crime, law and justice",crime
3,2740,abcnews--2019-12-02--Mother charged with murde...,2019-12-02,abcnews,Mother charged with murder in deaths of 2 youn...,The mother of two young children found hanging...,,https://abcnews.go.com/US/wireStory/mother-cha...,"Mon, 02 Dec 2019 11:30:59 -0500",1575304259,1575308811,"crime, law and justice",crime
4,7038,ageofautism--2019-04-12--Physician Father and ...,2019-04-12,ageofautism,"Physician, Father and Caretaker of 29 Year Old...","""One family member said Derek “can be violent ...",Age of Autism,http://feedproxy.google.com/~r/ageofautism/~3/...,2019-04-12 09:00:00+00:00,1555074000,1567543083,"crime, law and justice",crime


In [13]:


default_stemmer = PorterStemmer()
default_stopwords = stopwords.words('english')
default_stopwords = default_stopwords + ['said', 'would','even','according','could','year',
                                         'years','also','new','people','old,''one','two','time',
                                         'first','last','say','make','best','get','three','make',
                                         'year old','told','made','like','take','many','set','number',
                                         'month','week','well','back']
shortword = re.compile(r'\W*\b\w{1,4}\b\d')
BAD_SYMBOLS_RE = re.compile("[^a-zA-Z,\\d]")
REPLACE_IP_ADDRESS = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b')
REPLACE_BY_SPACE_RE = re.compile('[/(){}\\[\\]\\|@,;]')
def clean_text(text, ):

    def tokenize_text(text):
        return [w for s in sent_tokenize(text) for w in word_tokenize(s) if len(w)>=3]

    def preprocessing_text(text):
        text = text.lower()
        text=text.replace('\n',' ').replace('\xa0',' ').replace('-',' ').replace('ó','o').replace('ğ','g').replace('á','a').replace("'"," ")
        text=re.sub(r'\d+','', text)
        text=re.sub(r'http\S+', '', text)
        text=BAD_SYMBOLS_RE.sub(' ', text)
        text=REPLACE_IP_ADDRESS.sub('', text)
        text=REPLACE_BY_SPACE_RE.sub(' ', text)
        text=' '.join(word for word in text.split() if len(word)>3)

        return text

    def remove_special_characters(text, characters=string.punctuation.replace('-', '')):
        tokens = tokenize_text(text)
        pattern = re.compile('[{}]'.format(re.escape(characters + '0123456789')))
        return ' '.join(filter(None, [pattern.sub('', t) for t in tokens]))

    def stem_text(text, stemmer=default_stemmer):
        tokens = tokenize_text(text)
        return ' '.join([stemmer.stem(t) for t in tokens])

    def lemm_text(text, lemm=WordNetLemmatizer()):
        tokens = tokenize_text(text)
        return ' '.join([lemm.lemmatize(t) for t in tokens])

    def remove_stopwords(text, stop_words=default_stopwords):
        tokens = [w for w in tokenize_text(text) if w not in stop_words]
        return ' '.join(tokens)

    text = text.strip(' ') # strip whitespaces
    text = text.lower() # lowercase
    text = preprocessing_text(text)
    text = remove_special_characters(text) # remove punctuation and symbols
    text = lemm_text(text) # lemmatizer
    text = remove_stopwords(text) # remove stopwords

    return text

In [14]:
label_encoder = LabelEncoder()
df['category_level_1'] = label_encoder.fit_transform(df['category_level_1'])

In [15]:
l2_features = []
l2_target = []
# Iterate over unique values in 'category_level_1'
for category_level_1_value in range(0, 17):
    # Filter DataFrame based on 'category_level_1'
    subset_df = df[df['category_level_1'] == category_level_1_value]

    # Create a new DataFrame with selected columns
    subset_features = subset_df[['source', 'title', 'content']]
    subset_features = subset_features.apply(lambda row: ' '.join(row.astype(str)), axis=1)
    subset_features = subset_features.apply(clean_text)
    
    subset_target = subset_df['category_level_2']
    # Append the new DataFrame to the list
    l2_features.append(subset_features)
    l2_target.append(subset_target)

# Now, list_of_dataframes contains DataFrames for each 'category_level_1'

In [16]:
l2_x_train, l2_x_test, l2_x_val, l2_y_train, l2_y_test, l2_y_val = load_l2_data(l2_features, l2_target)

In [17]:
def build_model_dnn(x_train, x_val, x_test, nClasses):
    max_features = 20000
    vectorizer_x = TfidfVectorizer(max_features=max_features, ngram_range=(1,2))

    x_train = vectorizer_x.fit_transform(x_train).toarray()
    x_val = vectorizer_x.transform(x_val).toarray()
    x_test = vectorizer_x.transform(x_test).toarray()

    model = Sequential()
    model.add(Dense(400, input_dim=max_features))  # Specify input_dim here
    model.add(Dropout(0.5))

    model.add(Dense(nClasses, activation='softmax'))

    model.compile(loss=SparseCategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'])

    return model, x_train, x_val, x_test

In [18]:
l2_dnn_models = []
for i in range(0, 17):
    l2_dnn_models.append([])

for i in range(0, 17):
    print("Model: ",i)
    with tf.device("CPU"):
        l2_dnn_models[i], l2_x_train[i], l2_x_val[i], l2_x_test[i] = build_model_dnn(l2_x_train[i], l2_x_val[i], l2_x_test[i], len(np.unique(l2_target[i])))



Model:  0
Model:  1
Model:  2
Model:  3
Model:  4
Model:  5
Model:  6
Model:  7
Model:  8
Model:  9
Model:  10
Model:  11
Model:  12
Model:  13
Model:  14
Model:  15
Model:  16


In [19]:
callback = tf.keras.callbacks.EarlyStopping(patience=1)

for i in range(0, 17):
    print("Model: ", i)
    with tf.device("CPU"):
        l2_dnn_models[i].fit(
        l2_x_train[i], l2_y_train[i],
        validation_data=(l2_x_val[i], l2_y_val[i]),
        epochs=6,
        batch_size=16,
        callbacks=[callback]
        )

Model:  0


Epoch 1/6
Epoch 2/6
Model:  1
Epoch 1/6
Epoch 2/6
Model:  2
Epoch 1/6
Epoch 2/6
Model:  3
Epoch 1/6
Epoch 2/6
Model:  4
Epoch 1/6
Epoch 2/6
Model:  5
Epoch 1/6
Epoch 2/6
Model:  6
Epoch 1/6
Epoch 2/6
Model:  7
Epoch 1/6
Epoch 2/6
Model:  8
Epoch 1/6
Epoch 2/6
Model:  9
Epoch 1/6
Epoch 2/6
Model:  10
Epoch 1/6
Epoch 2/6
Model:  11
Epoch 1/6
Epoch 2/6
Model:  12
Epoch 1/6
Epoch 2/6
Model:  13
Epoch 1/6
Epoch 2/6
Model:  14
Epoch 1/6
Epoch 2/6
Model:  15
Epoch 1/6
Epoch 2/6
Model:  16
Epoch 1/6
Epoch 2/6


In [20]:
accuracies = []
for i in range(0, 17):
    loss, accuracy = l2_dnn_models[i].evaluate(l2_x_test[i], l2_y_test[i])
    print("Model: ", i)
    print("Loss: ", loss)
    print("Accuracy: ", accuracy)
    accuracies.append(accuracy)

print(accuracies)

Model:  0
Loss:  0.5862075686454773
Accuracy:  0.8444444537162781
Model:  1
Loss:  0.6519753932952881
Accuracy:  0.8833333253860474
Model:  2
Loss:  1.0737907886505127
Accuracy:  0.7066666483879089
Model:  3
Loss:  1.2212083339691162
Accuracy:  0.6666666865348816
Model:  4
Loss:  1.094390630722046
Accuracy:  0.7333333492279053
Model:  5
Loss:  1.4224733114242554
Accuracy:  0.5604395866394043
Model:  6
Loss:  0.9994742274284363
Accuracy:  0.8333333134651184
Model:  7
Loss:  1.107757568359375
Accuracy:  0.7714285850524902
Model:  8
Loss:  1.0327329635620117
Accuracy:  0.7777777910232544
Model:  9
Loss:  1.1829743385314941
Accuracy:  0.6761904954910278
Model:  10
Loss:  0.49868836998939514
Accuracy:  1.0
Model:  11
Loss:  1.3113579750061035
Accuracy:  0.6518518328666687
Model:  12
Loss:  1.5496000051498413
Accuracy:  0.6166666746139526
Model:  13
Loss:  1.407043218612671
Accuracy:  0.625
Model:  14
Loss:  0.9935981631278992
Accuracy:  0.8666666746139526
Model:  15
Loss:  0.969155728816986