In [1]:
import numpy as np 
import pandas as pd 
import pickle

from keras.layers import Input, Dense, Embedding, LSTM, SpatialDropout1D
from keras.models import Sequential
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import class_weight
from sklearn.metrics import classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from hyperopt import hp, tpe, fmin

from tensorflow.keras.models import load_model

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

2024-04-20 10:44:08.462925: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-20 10:44:08.463079: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-20 10:44:08.616166: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


/kaggle/input/news-aggregator-data-set/valid.txt
/kaggle/input/news-aggregator-data-set/test.txt
/kaggle/input/news-aggregator-data-set/train.txt


# **Data Preparation**

In [2]:
# Read the three tab-separated splits (the files carry no header row),
# then attach the column headers to each resulting DataFrame.
split_paths = (
    '/kaggle/input/news-aggregator-data-set/train.txt',
    '/kaggle/input/news-aggregator-data-set/valid.txt',
    '/kaggle/input/news-aggregator-data-set/test.txt',
)

train_df, valid_df, test_df = (
    pd.read_csv(path, delimiter='\t', header=None, encoding='utf-8')
    for path in split_paths
)

# Column headers shared by all splits
columns = ['CATEGORY', 'TITLE']

for frame in (train_df, valid_df, test_df):
    frame.columns = columns

In [3]:
def label_encode(df):
    """
    Encode the 'CATEGORY' column of the DataFrame to numerical labels.

    The mapping is 'e' -> 0, 'b' -> 1, 't' -> 2, 'm' -> 3.

    The input DataFrame is left untouched; a new DataFrame is returned. If the
    frame has already been encoded (it has 'LABEL' but no 'CATEGORY'), it is
    returned as a copy with 'LABEL' cast to int, so re-running the calling
    cell does not fail.

    Args:
    df (DataFrame): The DataFrame containing the 'CATEGORY' column to be encoded.

    Returns:
    DataFrame: A new DataFrame with the 'CATEGORY' column removed and an integer 'LABEL' column holding the numerical labels.

    Raises:
    KeyError: If the DataFrame has neither a 'CATEGORY' nor a 'LABEL' column.
    ValueError: If 'CATEGORY' contains a value that is not one of 'e', 'b', 't', 'm'.
    """

    # Map category labels to numerical labels
    category_to_label = {'e': 0, 'b': 1, 't': 2, 'm': 3}

    # Already encoded: nothing to map, just guarantee the integer dtype
    if 'CATEGORY' not in df.columns:
        if 'LABEL' not in df.columns:
            raise KeyError("DataFrame must contain a 'CATEGORY' or a 'LABEL' column")
        return df.assign(LABEL=df['LABEL'].astype(int))

    labels = df['CATEGORY'].map(category_to_label)

    # Unmapped categories become NaN; report them explicitly instead of
    # failing later with an opaque NaN-to-int cast error
    unknown_mask = labels.isna()
    if unknown_mask.any():
        unknown_values = sorted(set(df.loc[unknown_mask, 'CATEGORY'].astype(str)))
        raise ValueError(f"Unknown CATEGORY values: {unknown_values}")

    # drop() returns a new frame, so the caller's DataFrame is not modified
    encoded = df.drop(columns='CATEGORY')
    encoded['LABEL'] = labels.astype(int)
    return encoded

# Encode the labels of all three splits, then preview the training frame
train_df, valid_df, test_df = map(label_encode, (train_df, valid_df, test_df))

train_df

Unnamed: 0,TITLE,LABEL
0,update 1 yellen prepares wall st for more whol...,1
1,kanye raps about how awesome kim is on future ...,0
2,update 1 facebook to use satellites drones to ...,2
3,garth ancier counter sues michael egan over se...,0
4,update 1 mercedes recalls 284000 cars in us ca...,2
...,...,...
10679,mick jagger issues single statement regarding ...,0
10680,rpt fitch affirms thailand s pttgc at aa tha o...,1
10681,deutsche bank says health checks pose big unkn...,1
10682,what kids actually think about sheryl sandberg...,0


In [4]:
def create_balanced_dataset(df, label_col='LABEL', random_state=42):
    """
    Create a balanced dataset by randomly sampling an equal number of samples for each class.

    Every distinct value found in the label column is treated as a class, and
    each class is down-sampled to the size of the smallest one. With the
    default arguments and labels 0-3 the result is identical to sampling those
    four classes explicitly.

    Args:
    df (DataFrame): The DataFrame containing the dataset to be balanced.
    label_col (str): Name of the column holding the class labels.
    random_state (int): Seed used for both shuffles, so the result is reproducible.

    Returns:
    DataFrame: The balanced DataFrame with an equal number of samples for each class
               (an empty DataFrame if the input contains no labelled rows).
    """

    class_counts = df[label_col].value_counts()

    # Nothing to balance: return an empty frame with the same columns
    if class_counts.empty:
        return df.iloc[0:0].reset_index(drop=True)

    # Find the minimum number of samples among all classes
    min_samples_count = int(class_counts.min())

    # Shuffle the DataFrame so the per-class head() below is a random sample
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Extract an equal number of samples for each class, in sorted label order
    per_class = [
        shuffled[shuffled[label_col] == label].head(min_samples_count)
        for label in sorted(class_counts.index)
    ]

    # Concatenate the sampled dataframes to create the balanced dataset
    balanced_df = pd.concat(per_class, ignore_index=True)

    # Shuffle the balanced dataset so the classes are interleaved
    return balanced_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [5]:
def create_tokenizer_and_data_set(train_df, valid_df, test_df, n_most_common_words=8000,
                                  num_classes=4, tokenizer_path='tokenizer.pickle'):
    """
    Create tokenizer and prepare the dataset for training.

    Args:
    train_df (DataFrame): DataFrame containing the training data.
    valid_df (DataFrame): DataFrame containing the validation data.
    test_df (DataFrame): DataFrame containing the test data.
    n_most_common_words (int): Value passed to the tokenizer as `num_words`.
    num_classes (int): Number of classes used for the one-hot label encoding.
    tokenizer_path (str): File the fitted tokenizer is pickled to.

    Returns:
    Tuple: A tuple containing tokenizer, vocabulary size, maximum sequence length,
           training data, validation data, test data, training labels, validation labels, and test labels.
           The vocabulary size is the number of distinct indices the tokenizer can emit
           (including the padding index 0), i.e. it is capped by `n_most_common_words`.
    """

    frames = (train_df, valid_df, test_df)

    # Combine titles from all datasets for tokenizer fitting
    # NOTE(review): this lets validation/test vocabulary influence the tokenizer;
    # fit on train_df only if that leakage matters for the evaluation.
    texts_to_fit = [title for frame in frames for title in frame['TITLE']]

    # Define tokenizer with vocabulary size (backslash escaped explicitly so the
    # filter string is a valid literal; its value is unchanged)
    tokenizer = Tokenizer(num_words=n_most_common_words, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
    tokenizer.fit_on_texts(texts_to_fit)

    # Save Tokenizer to file
    with open(tokenizer_path, 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Convert each split to sequences once and find the maximum sequence length
    train_seqs, valid_seqs, test_seqs = (
        tokenizer.texts_to_sequences(frame['TITLE'].values) for frame in frames
    )
    max_len = max(len(seq) for seqs in (train_seqs, valid_seqs, test_seqs) for seq in seqs)

    # Define vocabulary size: the tokenizer only emits indices below `num_words`,
    # so a larger embedding table would contain rows that are never used
    vocabulary_size = len(tokenizer.word_index) + 1
    if n_most_common_words:
        vocabulary_size = min(vocabulary_size, n_most_common_words)

    # Pad sequences for train, valid, and test datasets
    X_train = pad_sequences(train_seqs, maxlen=max_len)
    X_valid = pad_sequences(valid_seqs, maxlen=max_len)
    X_test = pad_sequences(test_seqs, maxlen=max_len)

    # Convert labels to one-hot encoded format
    y_train = to_categorical(train_df['LABEL'], num_classes=num_classes)
    y_valid = to_categorical(valid_df['LABEL'], num_classes=num_classes)
    y_test = to_categorical(test_df['LABEL'], num_classes=num_classes)

    return tokenizer, vocabulary_size, max_len, X_train, X_valid, X_test, y_train, y_valid, y_test

# **Create and Train the Model**

In [6]:
def create_and_train_model(vocabulary_size, max_len, X_train, X_valid, y_train, y_valid, class_weight, num_layers, num_units, dropout, verbose=0,
                           epochs=30, emb_dim=200, batch_size=256, checkpoint_path="best_model.keras"):
    """
    Create and train a sequential model using LSTM layers.

    The model is Embedding -> SpatialDropout1D -> `num_layers` LSTM layers ->
    4-way softmax. Training stops early when the validation loss stops
    improving, the best weights are restored before evaluation, and the best
    model is also written to `checkpoint_path`.

    Args:
    vocabulary_size (int): Size of the vocabulary.
    max_len (int): Maximum sequence length.
    X_train (numpy.ndarray): Training data.
    X_valid (numpy.ndarray): Validation data.
    y_train (numpy.ndarray): Training labels.
    y_valid (numpy.ndarray): Validation labels.
    class_weight (dict): Dictionary containing class weights.
    num_layers (int): Number of LSTM layers (must be at least 1).
    num_units (int): Number of units in each LSTM layer.
    dropout (float): Dropout rate.
    verbose (int): Verbose for model fit.
    epochs (int): Maximum number of training epochs.
    emb_dim (int): Dimension of the embedding vectors.
    batch_size (int): Training batch size.
    checkpoint_path (str): File the best model (lowest validation loss) is saved to.

    Returns:
    list: `[validation loss, validation accuracy]` as returned by `model.evaluate`,
          measured with the best (lowest validation loss) weights.

    Raises:
    ValueError: If `num_layers` is smaller than 1.
    """

    if num_layers < 1:
        raise ValueError("num_layers must be at least 1")

    # Define the model architecture
    model = Sequential()
    model.add(Input(shape=(max_len,)))
    model.add(Embedding(input_dim=vocabulary_size, output_dim=emb_dim))
    model.add(SpatialDropout1D(dropout))
    for i in range(num_layers):
        # Every LSTM except the last must emit the full sequence for the next one
        is_last = i == num_layers - 1
        model.add(LSTM(num_units, dropout=dropout, recurrent_dropout=dropout, return_sequences=not is_last))

    model.add(Dense(4, activation='softmax'))

    # Compile the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

    # Initialize a checkpoint to save the model with the best validation loss
    checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True, mode='min')
    # Restore the best weights so the metrics returned below describe the same
    # model that was checkpointed, not the weights of the last (worse) epoch
    early_stop = EarlyStopping(monitor='val_loss', patience=7, min_delta=0.0001, restore_best_weights=True)

    # Train the model
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_valid, y_valid),
              class_weight=class_weight, verbose=verbose, callbacks=[checkpoint, early_stop])

    # Evaluate the model on the validation set
    val_metrics = model.evaluate(X_valid, y_valid, verbose=verbose)

    # Return the validation loss and accuracy
    return val_metrics

# **Hyperparameter Tuning**

In [7]:
# Down-sample the validation split to the same number of rows per class
valid_df = create_balanced_dataset(valid_df)

# Fit the tokenizer and build padded inputs / one-hot labels for every split
(tokenizer, vocabulary_size, max_len,
 X_train, X_valid, X_test,
 y_train, y_valid, y_test) = create_tokenizer_and_data_set(train_df, valid_df, test_df)

# Per-class weights for imbalance correction: total / (number of classes * class count)
# NOTE: this rebinds `class_weight`, hiding the sklearn module imported under that name
class_counts = train_df['LABEL'].value_counts().to_dict()
total_samples = sum(class_counts.values())
n_classes = len(class_counts)
class_weight = {
    label: total_samples / (n_classes * n_rows)
    for label, n_rows in class_counts.items()
}


def train_fn(params):
    """Objective for hyperopt: train one model and return its validation loss."""
    val_loss, _val_accuracy = create_and_train_model(
        vocabulary_size, max_len,
        X_train, X_valid, y_train, y_valid,
        class_weight,
        int(params['num_layers']),
        int(params['num_units']),
        params['dropout'],
    )
    return val_loss


# Search space explored by the optimizer
search_space = {
    'num_layers': hp.quniform('num_layers', 1, 4, 1),
    'num_units': hp.quniform('num_units', 8, 32, 8),
    'dropout': hp.uniform('dropout', 0.1, 0.8),
}

# Run the search with the TPE algorithm and keep the best parameters found
best = fmin(fn=train_fn, space=search_space, algo=tpe.suggest, max_evals=50)

# Display the best hyperparameters found during optimization
best


100%|██████████| 50/50 [36:15<00:00, 43.51s/trial, best loss: 0.36765602231025696]


{'dropout': 0.5456938443766727, 'num_layers': 1.0, 'num_units': 8.0}

# **Retrain with the Best Hyperparameters**

In [8]:
# Fit the model once more using the best configuration from the search.
# The quniform values come back as floats (e.g. 1.0), hence the int casts.
num_layers, num_units = int(best['num_layers']), int(best['num_units'])
dropout = best['dropout']

loss, accuracy = create_and_train_model(
    vocabulary_size, max_len,
    X_train, X_valid, y_train, y_valid,
    class_weight, num_layers, num_units, dropout,
    verbose=1,
)

Epoch 1/30
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 38ms/step - acc: 0.3297 - loss: 1.3772 - val_acc: 0.6756 - val_loss: 1.3313
Epoch 2/30
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - acc: 0.6690 - loss: 1.3090 - val_acc: 0.7321 - val_loss: 1.1604
Epoch 3/30
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - acc: 0.8010 - loss: 1.1000 - val_acc: 0.7917 - val_loss: 0.9220
Epoch 4/30
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - acc: 0.8397 - loss: 0.8812 - val_acc: 0.8244 - val_loss: 0.7410
Epoch 5/30
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - acc: 0.8860 - loss: 0.6568 - val_acc: 0.8423 - val_loss: 0.6057
Epoch 6/30
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - acc: 0.9036 - loss: 0.5085 - val_acc: 0.8661 - val_loss: 0.5179
Epoch 7/30
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 32ms/step - acc: 0.

# **Evaluate the Model on the Test Set**

In [9]:
# Path to the saved model file: the training checkpoint is written to
# "best_model.keras" relative to the working directory, so load it the same way
model_path = 'best_model.keras'

# Load the model
model = load_model(model_path)

# Check the architecture of the loaded model
model.summary()

# Predict probabilities for each class for the test set
y_pred_prob = model.predict(X_test)

# Convert probabilities / one-hot labels to class indices
y_pred = np.argmax(y_pred_prob, axis=1)
y_true = y_test.argmax(axis=1)

# Class names in label order (0..3)
target_names = ['entertainment', 'business', 'science and technology', 'health']

# Compute the classification report; passing `labels` keeps the names aligned
# with the label indices even if a class is missing from the test predictions
report = classification_report(
    y_true,
    y_pred,
    labels=list(range(len(target_names))),
    target_names=target_names,
    output_dict=True,
)


def _print_scores(heading, scores):
    """Print precision, recall and F1-score of one classification_report entry."""
    print(heading)
    print(f"Precision: {scores['precision']:.3f}")
    print(f"Recall: {scores['recall']:.3f}")
    print(f"F1-score: {scores['f1-score']:.3f}")
    print()


# Print precision, recall, and F1-score for each class only; the averaged
# entries of the report are printed once, under their own headings, below
for label in target_names:
    _print_scores(f"Label: {label}", report[label])

# Print macro-average scores
macro_precision = report['macro avg']['precision']
macro_recall = report['macro avg']['recall']
macro_f1_score = report['macro avg']['f1-score']
_print_scores("Macro-average:", report['macro avg'])

# Print weighted-average scores
_print_scores("Weighted-average:", report['weighted avg'])


[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step
Label: entertainment
Precision: 0.973
Recall: 0.943
F1-score: 0.958

Label: business
Precision: 0.954
Recall: 0.946
F1-score: 0.950

Label: science and technology
Precision: 0.761
Recall: 0.825
F1-score: 0.792

Label: health
Precision: 0.728
Recall: 0.798
F1-score: 0.761

Label: macro avg
Precision: 0.854
Recall: 0.878
F1-score: 0.865

Label: weighted avg
Precision: 0.925
Recall: 0.921
F1-score: 0.923

Macro-average:
Precision: 0.854
Recall: 0.878
F1-score: 0.865
