In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Locations of the pickled train and test splits.
SAVE_TRAIN_PATH = '/kaggle/input/training-the-model/train.pkl'
SAVE_TEST_PATH = "/kaggle/input/training-the-model/test.pkl"

import pickle


def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# NOTE(review): pickle.load can run arbitrary code embedded in the file, so
# these paths should only ever point at files from a trusted source.
train_data = _read_pickle(SAVE_TRAIN_PATH)
test_data = _read_pickle(SAVE_TEST_PATH)

print("Data successfully loaded!")
# Show which fields each split provides.
for loaded_split in (train_data, test_data):
    print(loaded_split.keys())


In [None]:
import gensim

# Location of the ConceptNet Numberbatch embeddings file.
file_path = '/kaggle/input/conceptnet/numberbatch-en-19.08.txt'

# Parse the file as text-format (non-binary) word2vec data.
load_vectors = gensim.models.KeyedVectors.load_word2vec_format
word_vectors = load_vectors(file_path, binary=False)

# Quick sanity check: look up and print the vector stored for 'apple'.
apple_vector = word_vectors['apple']
print(apple_vector)


In [None]:
def embedding_generator(text_data, word_vectors, batch_size=32, embed_fn=None):
    """Yield float32 arrays of text embeddings, ``batch_size`` texts at a time.

    Args:
        text_data: Sized, sliceable collection of texts (``len()`` and
            ``text_data[a:b]`` are used).
        word_vectors: Embedding lookup object; passed unchanged as the second
            argument to the embedding function.
        batch_size: Number of texts per yielded batch; must be positive. The
            final batch may be smaller.
        embed_fn: Optional callable ``embed_fn(text, word_vectors)`` returning
            the embedding of one text. When omitted, the module-level
            ``get_embedding_vector`` is used.
            NOTE(review): ``get_embedding_vector`` is not defined in the
            visible part of this notebook -- confirm it exists before any
            batch is requested, or pass ``embed_fn`` explicitly.

    Yields:
        ``numpy.ndarray`` of dtype float32 holding one embedding per text in
        the batch.

    Raises:
        ValueError: If ``batch_size`` is not positive. As with any generator,
            the error surfaces when the first batch is requested.
    """
    # A negative step would make range() empty and silently yield nothing;
    # zero would fail with an unrelated-looking range() error.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    for start in range(0, len(text_data), batch_size):
        # Resolve the default lazily so that empty input never touches the
        # module-level helper.
        embed = embed_fn if embed_fn is not None else get_embedding_vector
        chunk = text_data[start : start + batch_size]
        yield np.array([embed(text, word_vectors) for text in chunk], dtype=np.float32)

In [None]:
# One lazy embedding generator per split (default batch size); nothing is
# computed until a generator is iterated.
train_gen, test_gen = (
    embedding_generator(split["text"], word_vectors)
    for split in (train_data, test_data)
)

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.utils import Sequence
import gensim

In [None]:
# Optimized Data Generator
class TextDataGenerator(Sequence):
    """Keras ``Sequence`` serving zero-padded embedding batches with one-hot labels.

    Each text is split on whitespace and truncated to ``max_length`` tokens;
    tokens missing from ``word_vectors`` are then dropped. The remaining tokens
    are stored as row indices into ``word_vectors.vectors`` so a batch can be
    filled with one array lookup per text.

    Args:
        texts: Sized iterable of strings, one per sample.
        labels: Sliceable sequence of integer class ids in ``1..num_classes``,
            aligned with ``texts``.
        word_vectors: Embedding table exposing ``vector_size``,
            ``key_to_index``, ``vectors`` and ``in`` membership tests.
        batch_size: Samples per batch; must be positive.
        max_length: Time steps per sample. Longer texts are truncated, shorter
            ones are right-padded with zero vectors.
        num_classes: Width of the one-hot label vectors (default 4).
        **kwargs: Forwarded to the ``Sequence`` base-class constructor.

    Raises:
        ValueError: If ``batch_size`` is not positive, ``texts`` and ``labels``
            differ in length, or a label lies outside ``1..num_classes``.
    """

    def __init__(self, texts, labels, word_vectors, batch_size=32, max_length=200,
                 num_classes=4, **kwargs):
        # The base class expects its constructor to run (newer Keras versions
        # warn when it does not).
        super().__init__(**kwargs)

        if batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        # Labels are shifted by -1 before one-hot encoding, so a label of 0
        # would become index -1 and silently select the last class.
        label_values = np.asarray(labels)
        if label_values.size and (label_values.min() < 1 or label_values.max() > num_classes):
            raise ValueError(f"labels must lie in 1..{num_classes}")

        self.texts = texts
        self.labels = labels
        self.word_vectors = word_vectors
        self.batch_size = batch_size
        self.max_length = max_length
        self.num_classes = num_classes
        self.embedding_dim = word_vectors.vector_size

        # Convert every text to embedding row indices once, up front, so that
        # batch assembly only has to do array lookups.
        self.preprocessed = [self.text_to_embeddings(text) for text in texts]

    def text_to_embeddings(self, text):
        """Return embedding row indices for the in-vocabulary tokens of ``text``.

        Truncation to ``max_length`` happens before out-of-vocabulary tokens
        are dropped, so the result never exceeds ``max_length`` entries.
        """
        tokens = text.split()[:self.max_length]
        return [self.word_vectors.key_to_index[word] for word in tokens if word in self.word_vectors]

    def __len__(self):
        """Return the number of batches; the last one may be partial."""
        return int(np.ceil(len(self.texts) / self.batch_size))

    def __getitem__(self, idx):
        """Return ``(X_batch, y_batch)`` for batch number ``idx``.

        ``X_batch`` is a float32 array of shape
        ``(batch, max_length, embedding_dim)``; ``y_batch`` holds the one-hot
        labels with ``num_classes`` columns.
        """
        start = idx * self.batch_size
        stop = start + self.batch_size
        batch_texts = self.preprocessed[start:stop]
        batch_labels = self.labels[start:stop]

        # Start from all zeros so positions past a text's last token stay padded.
        X_batch = np.zeros((len(batch_texts), self.max_length, self.embedding_dim), dtype=np.float32)
        for i, text_indices in enumerate(batch_texts):
            if text_indices:
                X_batch[i, :len(text_indices)] = self.word_vectors.vectors[text_indices]

        # Class ids are 1-based; shift to 0-based before one-hot encoding.
        y_batch = tf.keras.utils.to_categorical(
            np.array(batch_labels) - 1, num_classes=self.num_classes
        )

        return X_batch, y_batch

In [None]:
# Build the batch generators for both splits, 64 samples per batch.
# NOTE(review): a later cell rebuilds both generators with batch_size=32 just
# before training, so these instances are not the ones passed to model.fit.
train_generator = TextDataGenerator(
    texts=train_data["text"],
    labels=train_data["class_index"],
    word_vectors=word_vectors,
    batch_size=64,
)

test_generator = TextDataGenerator(
    texts=test_data["text"],
    labels=test_data["class_index"],
    word_vectors=word_vectors,
    batch_size=64,
)

In [None]:
# Model architecture: two stacked bidirectional LSTMs followed by a small
# dense classifier head with a 4-way softmax output.

# Take the input shape from the training generator so the network always
# matches the (time steps, embedding size) of the batches it is fed, instead of
# repeating those two numbers as literals here.
sequence_length = train_generator.max_length
embedding_dim = train_generator.embedding_dim

model = Sequential([
    # Declare the input shape with an explicit Input object rather than an
    # input_shape= argument on the first layer.
    tf.keras.Input(shape=(sequence_length, embedding_dim)),
    # First recurrent layer returns the full sequence for the next LSTM.
    Bidirectional(LSTM(128, return_sequences=True)),
    # Second recurrent layer returns only its final output.
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(4, activation='softmax')
])

# One-hot targets, hence categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [None]:
# Rebuild both generators with 32 samples per batch; these replace any
# train_generator/test_generator objects created earlier.
shared_generator_options = {"word_vectors": word_vectors, "batch_size": 32}
train_generator = TextDataGenerator(
    train_data["text"], train_data["class_index"], **shared_generator_options
)
test_generator = TextDataGenerator(
    test_data["text"], test_data["class_index"], **shared_generator_options
)

# Train for 20 epochs on the training generator.
# NOTE(review): the test split is used as the validation data here.
model.fit(x=train_generator, validation_data=test_generator, epochs=20)

In [None]:
# Write the trained model to an .h5 file in the Kaggle working directory.
model.save(filepath='/kaggle/working/news_classification_model.h5')