In [1]:
import pandas as pd
import matplotlib.pyplot as plt
# Load the tab-separated news corpus; the printed frame shows the columns
# category, title and content.
data = pd.read_csv('news-dataset.csv', sep='\t')

# Working frame used by every later cell.
df = pd.DataFrame(data=data)
print(df)

           category                              title  \
0          business    UK house prices dip in November   
1          business  LSE 'sets date for takeover deal'   
2             sport    Harinordoquy suffers France axe   
3          business  Barclays shares up on merger talk   
4          politics   Campaign 'cold calls' questioned   
...             ...                                ...   
1552       business  Hariri killing hits Beirut shares   
1553       politics  MPs issued with Blackberry threat   
1554  entertainment  Bollywood DVD fraudster is jailed   
1555          sport                Ireland v USA (Sat)   
1556           tech  Row brewing over peer-to-peer ads   

                                                content  
0      UK house prices dipped slightly in November, ...  
1      The London Stock Exchange (LSE) is planning t...  
2      Number eight Imanol Harinordoquy has been dro...  
3      Shares in UK banking group Barclays have rise...  
4      Labour

In [2]:
import random
import numpy as np
import tensorflow as tf

# One shared seed for the three random number generators used in this notebook:
# Python's `random`, NumPy's legacy global RNG and TensorFlow.
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Feed Forward Neural Network

In [3]:
import os

# Directory in which the hyperparameter tuners persist their trial state.
base_dir = os.path.abspath('C:/temp/my_dir')

# Create-first (EAFP) instead of "check, then create": there is no window in
# which another process can create the directory between the check and the
# makedirs call, and a regular *file* sitting at this path is reported as an
# error instead of being mistaken for an existing directory.
try:
    os.makedirs(base_dir)
except FileExistsError:
    if not os.path.isdir(base_dir):
        # Something that is not a directory occupies the path; the tuners
        # could not write there, so fail here rather than later.
        raise
    print(f"Directory {base_dir} already exists.")
else:
    print(f"Directory {base_dir} created successfully.")

Directory C:\temp\my_dir already exists.


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, Flatten
from tensorflow.keras.optimizers import Adam
from keras_tuner import Hyperband
from sklearn.metrics import classification_report, f1_score
import random


# Inputs are the article bodies; targets are the category names.
X, y = df['content'], df['category']

# Turn category names into integer ids; the encoder keeps the id -> name mapping.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the articles for the final evaluation (seeded, so repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, random_state=42, test_size=0.2
)

# Vocabulary cap and fixed sequence length shared by tokenizer, padding and model.
max_words = 10000
max_len = 100

# The vocabulary is fitted on the training texts only, with "<OOV>" as the
# out-of-vocabulary token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Texts -> integer id sequences (train first, then test).
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad / cut every sequence at its end so all rows have exactly max_len ids.
X_train_padded, X_test_padded = (
    pad_sequences(seqs, maxlen=max_len, padding='post', truncating='post')
    for seqs in (X_train_seq, X_test_seq)
)

# Define the model builder function for hyperparameter tuning
def build_model(hp):
    """Build and compile the feed-forward text classifier for one tuner trial.

    Layer stack: Embedding -> Flatten -> 1 to 3 (Dense + Dropout) pairs ->
    softmax output. Vocabulary size, sequence length and the number of output
    classes are read from the notebook-level ``max_words``, ``max_len`` and
    ``label_encoder``.

    Args:
        hp: Hyperparameter container handed in by the tuner; this function
            calls its ``Int``, ``Float`` and ``Choice`` methods.

    Returns:
        The compiled ``Sequential`` model.
    """
    num_classes = len(label_encoder.classes_)
    net = Sequential()

    # Embedding width is searched over 50, 100, 150, 200.
    # NOTE(review): Keras 3 deprecates Embedding's `input_length` argument —
    # confirm against the installed version before removing it.
    embedding_dim = hp.Int('embedding_dim', min_value=50, max_value=200, step=50)
    net.add(Embedding(input_dim=max_words,
                      output_dim=embedding_dim,
                      input_length=max_len))

    # Flatten the embedded sequence so the dense layers can consume it.
    net.add(Flatten())

    # Each hidden layer gets its own tunable width and dropout rate.
    hidden_layers = hp.Int('num_dense_layers', 1, 3)
    for idx in range(hidden_layers):
        width = hp.Int(f'dense_units_{idx}', min_value=64, max_value=256, step=64)
        net.add(Dense(units=width, activation='relu'))
        drop = hp.Float(f'dropout_rate_{idx}', min_value=0.2, max_value=0.5, step=0.1)
        net.add(Dropout(rate=drop))

    # One softmax unit per category.
    net.add(Dense(num_classes, activation='softmax'))

    # Integer-encoded targets -> sparse categorical cross-entropy.
    lr = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    net.compile(
        optimizer=Adam(learning_rate=lr),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return net

# Hyperband search over build_model, ranked by validation accuracy.
# Trial state is stored under base_dir/mlp_text_classification; the cell output
# shows an existing project there being reloaded ("Reloading Tuner from ...").
tuner = Hyperband(
    build_model,
    project_name='mlp_text_classification',
    directory=base_dir,
    factor=3,
    max_epochs=10,
    objective='val_accuracy',
)

# Search on the training split, holding back 20% of it for validation.
tuner.search(X_train_padded, y_train, validation_split=0.2, epochs=10)

# Take the single best model found by the tuner and persist it.
best_model_nn = tuner.get_best_models(num_models=1)[0]
# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that emitted a stray "None" line after the table).
best_model_nn.summary()
best_model_nn.save('best_modelnn.h5')

# Predict class probabilities for the held-out test set and pick the most
# probable class per article.
y_pred = best_model_nn.predict(X_test_padded)
y_pred_classes = y_pred.argmax(axis=1)

# Per-class precision / recall / F1, labelled with the original category names.
print(classification_report(y_test, y_pred_classes, target_names=label_encoder.classes_))

# Single headline metric: F1 averaged over classes, weighted by class support.
f1 = f1_score(y_test, y_pred_classes, average='weighted')
print(f"Weighted F1-score: {f1:.3f}")


Reloading Tuner from C:\temp\my_dir\mlp_text_classification\tuner0.json



  saveable.load_own_variables(weights_store.get(inner_path))




None
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
               precision    recall  f1-score   support

     business       0.85      0.76      0.80        70
entertainment       0.70      0.80      0.74        40
     politics       0.83      0.90      0.87        61
        sport       0.99      0.93      0.96        81
         tech       0.79      0.82      0.80        60

     accuracy                           0.85       312
    macro avg       0.83      0.84      0.83       312
 weighted avg       0.85      0.85      0.85       312

Weighted F1-score: 0.847


# CNN

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from keras_tuner import Hyperband
from sklearn.metrics import classification_report, f1_score

# Inputs are the article bodies; targets are the category names.
X, y = df['content'], df['category']

# Turn category names into integer ids; the encoder keeps the id -> name mapping.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the articles for the final evaluation (seeded, so repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, random_state=42, test_size=0.2
)

# Vocabulary cap and fixed sequence length shared by tokenizer, padding and model.
max_words = 10000
max_len = 100

# The vocabulary is fitted on the training texts only, with "<OOV>" as the
# out-of-vocabulary token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Texts -> integer id sequences (train first, then test).
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad / cut every sequence at its end so all rows have exactly max_len ids.
X_train_padded, X_test_padded = (
    pad_sequences(seqs, maxlen=max_len, padding='post', truncating='post')
    for seqs in (X_train_seq, X_test_seq)
)

# Define the model builder function for hyperparameter tuning
def build_model(hp):
    """Build and compile the 1-D convolutional text classifier for one tuner trial.

    Layer stack: Embedding -> Conv1D -> GlobalMaxPooling1D -> Dense -> Dropout ->
    softmax output. Vocabulary size, sequence length and the number of output
    classes are read from the notebook-level ``max_words``, ``max_len`` and
    ``label_encoder``.

    Args:
        hp: Hyperparameter container handed in by the tuner; this function
            calls its ``Int``, ``Float`` and ``Choice`` methods.

    Returns:
        The compiled ``Sequential`` model.
    """
    num_classes = len(label_encoder.classes_)
    net = Sequential()

    # NOTE(review): Keras 3 deprecates Embedding's `input_length` argument —
    # confirm against the installed version before removing it.
    embedding_dim = hp.Int('embedding_dim', min_value=50, max_value=200, step=50)
    net.add(Embedding(input_dim=max_words,
                      output_dim=embedding_dim,
                      input_length=max_len))

    # Convolution over the token axis; filter count and window size are tuned.
    n_filters = hp.Int('filters', min_value=32, max_value=256, step=32)
    window = hp.Choice('kernel_size', values=[3, 5, 7])
    net.add(Conv1D(filters=n_filters, kernel_size=window, activation='relu'))

    net.add(GlobalMaxPooling1D())

    # Tunable hidden layer followed by tunable dropout.
    width = hp.Int('dense_units', min_value=32, max_value=128, step=32)
    net.add(Dense(units=width, activation='relu'))
    drop = hp.Float('dropout_rate', min_value=0.2, max_value=0.5, step=0.1)
    net.add(Dropout(rate=drop))

    # One softmax unit per category.
    net.add(Dense(num_classes, activation='softmax'))

    # Integer-encoded targets -> sparse categorical cross-entropy.
    lr = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    net.compile(
        optimizer=Adam(learning_rate=lr),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return net

# Hyperband search over build_model, ranked by validation accuracy.
# Trial state is stored under base_dir/cnn_text_classification; the cell output
# shows an existing project there being reloaded ("Reloading Tuner from ...").
tuner = Hyperband(
    build_model,
    project_name='cnn_text_classification',
    directory=base_dir,
    factor=3,
    max_epochs=10,
    objective='val_accuracy',
)

# Search on the training split, holding back 20% of it for validation.
tuner.search(X_train_padded, y_train, validation_split=0.2, epochs=10)

# Take the single best model found by the tuner, show it and persist it.
best_model_cnn = tuner.get_best_models(num_models=1)[0]
best_model_cnn.summary()
best_model_cnn.save('best_modelcnn.h5')

# Predict class probabilities for the held-out test set and pick the most
# probable class per article.
y_pred = best_model_cnn.predict(X_test_padded)
y_pred_classes = y_pred.argmax(axis=1)

# Per-class precision / recall / F1, labelled with the original category names.
print(classification_report(y_true=y_test,
                            y_pred=y_pred_classes,
                            target_names=label_encoder.classes_))

# Single headline metric: F1 averaged over classes, weighted by class support.
f1 = f1_score(y_true=y_test, y_pred=y_pred_classes, average='weighted')
print(f"Weighted F1-score: {f1:.3f}")


Reloading Tuner from C:\temp\my_dir\cnn_text_classification\tuner0.json


  saveable.load_own_variables(weights_store.get(inner_path))




[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
               precision    recall  f1-score   support

     business       0.82      0.97      0.89        70
entertainment       0.87      0.85      0.86        40
     politics       0.98      0.77      0.86        61
        sport       0.99      0.95      0.97        81
         tech       0.83      0.88      0.85        60

     accuracy                           0.89       312
    macro avg       0.90      0.89      0.89       312
 weighted avg       0.90      0.89      0.89       312

Weighted F1-score: 0.894


# LSTM

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from keras_tuner import Hyperband
from sklearn.metrics import classification_report, f1_score

# Inputs are the article bodies; targets are the category names.
X, y = df['content'], df['category']

# Turn category names into integer ids; the encoder keeps the id -> name mapping.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the articles for the final evaluation (seeded, so repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, random_state=42, test_size=0.2
)

# Vocabulary cap and fixed sequence length shared by tokenizer, padding and model.
max_words = 10000
max_len = 100

# The vocabulary is fitted on the training texts only, with "<OOV>" as the
# out-of-vocabulary token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Texts -> integer id sequences (train first, then test).
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad / cut every sequence at its end so all rows have exactly max_len ids.
X_train_padded, X_test_padded = (
    pad_sequences(seqs, maxlen=max_len, padding='post', truncating='post')
    for seqs in (X_train_seq, X_test_seq)
)

# Define the model builder function for hyperparameter tuning
def build_model(hp):
    """Build and compile the LSTM text classifier for one tuner trial.

    Layer stack: Embedding -> LSTM -> Dropout -> Dense -> softmax output.
    Vocabulary size, sequence length and the number of output classes are read
    from the notebook-level ``max_words``, ``max_len`` and ``label_encoder``.

    Args:
        hp: Hyperparameter container handed in by the tuner; this function
            calls its ``Int``, ``Float`` and ``Choice`` methods.

    Returns:
        The compiled ``Sequential`` model.
    """
    num_classes = len(label_encoder.classes_)
    net = Sequential()

    # NOTE(review): Keras 3 deprecates Embedding's `input_length` argument —
    # confirm against the installed version before removing it.
    embedding_dim = hp.Int('embedding_dim', min_value=50, max_value=200, step=50)
    net.add(Embedding(input_dim=max_words,
                      output_dim=embedding_dim,
                      input_length=max_len))

    # Single recurrent layer; return_sequences=False is passed explicitly
    # because the layers that follow are plain Dropout / Dense.
    lstm_units = hp.Int('lstm_units', min_value=64, max_value=256, step=64)
    net.add(LSTM(units=lstm_units, return_sequences=False))

    # Tunable dropout, then a tunable hidden layer.
    drop = hp.Float('dropout_rate', min_value=0.2, max_value=0.5, step=0.1)
    net.add(Dropout(rate=drop))
    width = hp.Int('dense_units', min_value=32, max_value=128, step=32)
    net.add(Dense(units=width, activation='relu'))

    # One softmax unit per category.
    net.add(Dense(num_classes, activation='softmax'))

    # Integer-encoded targets -> sparse categorical cross-entropy.
    lr = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    net.compile(
        optimizer=Adam(learning_rate=lr),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return net

# Hyperband search over build_model, ranked by validation accuracy.
# Trial state is stored under base_dir/lstm_text_classification; the cell output
# shows an existing project there being reloaded ("Reloading Tuner from ...").
tuner = Hyperband(
    build_model,
    project_name='lstm_text_classification',
    directory=base_dir,
    factor=3,
    max_epochs=10,
    objective='val_accuracy',
)

# Search on the training split, holding back 20% of it for validation.
tuner.search(X_train_padded, y_train, validation_split=0.2, epochs=10)

# Take the single best model found by the tuner, show it and persist it.
best_model_lstm = tuner.get_best_models(num_models=1)[0]
best_model_lstm.summary()
best_model_lstm.save('best_modellstm.h5')

# Predict class probabilities for the held-out test set and pick the most
# probable class per article.
y_pred = best_model_lstm.predict(X_test_padded)
y_pred_classes = y_pred.argmax(axis=1)

# Per-class precision / recall / F1, labelled with the original category names.
print(classification_report(y_true=y_test,
                            y_pred=y_pred_classes,
                            target_names=label_encoder.classes_))

# Single headline metric: F1 averaged over classes, weighted by class support.
f1 = f1_score(y_true=y_test, y_pred=y_pred_classes, average='weighted')
print(f"Weighted F1-score: {f1:.3f}")


Reloading Tuner from C:\temp\my_dir\lstm_text_classification\tuner0.json


  saveable.load_own_variables(weights_store.get(inner_path))




[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
               precision    recall  f1-score   support

     business       0.84      0.90      0.87        70
entertainment       0.88      0.95      0.92        40
     politics       0.90      0.85      0.87        61
        sport       0.99      0.98      0.98        81
         tech       0.86      0.80      0.83        60

     accuracy                           0.90       312
    macro avg       0.89      0.90      0.89       312
 weighted avg       0.90      0.90      0.90       312

Weighted F1-score: 0.897


# Competition

In [7]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Reload the LSTM from the file written by the LSTM cell, so the predictions
# below come from the saved artefact.
best_model_lstm = load_model(filepath='best_modellstm.h5')

# Unlabelled, tab-separated challenge articles to classify.
new_data = pd.read_csv('news-challenge.csv', sep='\t')
new_data.head(n=5)



Unnamed: 0,ID,title,content
0,0,Virgin Radio offers 3G broadcast,UK broadcaster Virgin Radio says it will beco...
1,1,Police chief backs drinking move,A chief constable has backed the introduction...
2,2,Labour seeks to quell feud talk,Labour's leadership put on a show of unity at...
3,3,Edwards tips Idowu for Euro gold,World outdoor triple jump record holder and B...
4,4,Ivanovic seals Canberra victory,Serbia's Ana Ivanovic captured her first WTA ...


In [8]:
max_len = 100  # same as training
max_words = 10000

# Take the class names from the fitted encoder instead of a hand-typed list:
# the model's output index i corresponds to label_encoder.classes_[i], so the
# index -> name mapping can no longer drift from the one used for training.
labels = label_encoder.classes_.tolist()

# Encode the challenge texts with the tokenizer fitted on the training texts,
# then pad / cut them exactly as the training data was.
new_data_seq = tokenizer.texts_to_sequences(new_data['content'])
new_data_padded = pad_sequences(new_data_seq, maxlen=max_len, padding='post', truncating='post')

# Class probabilities per article -> index of the most probable class.
predictions = best_model_lstm.predict(new_data_padded)
predicted_classes = np.argmax(predictions, axis=1)

# Translate class indices back to category names and attach them to the frame.
predicted_labels = [labels[i] for i in predicted_classes]
new_data['category'] = predicted_labels

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step


In [9]:
# Spot-check the first rows now that the predicted `category` column is attached.
new_data.head(n=5)

Unnamed: 0,ID,title,content,category
0,0,Virgin Radio offers 3G broadcast,UK broadcaster Virgin Radio says it will beco...,tech
1,1,Police chief backs drinking move,A chief constable has backed the introduction...,politics
2,2,Labour seeks to quell feud talk,Labour's leadership put on a show of unity at...,politics
3,3,Edwards tips Idowu for Euro gold,World outdoor triple jump record holder and B...,sport
4,4,Ivanovic seals Canberra victory,Serbia's Ana Ivanovic captured her first WTA ...,sport


In [10]:
# The submission file keeps everything except the article text columns
# (the printed preview shows ID and category remaining).
# The variable name's spelling is kept as-is in case other cells reference it.
competiton_data = new_data.drop(['content', 'title'], axis=1)
print(competiton_data.head())

# Write the submission without the DataFrame index.
competiton_data.to_csv('Group05QBUS6850_2024S2Com.csv', index=False)

   ID  category
0   0      tech
1   1  politics
2   2  politics
3   3     sport
4   4     sport
