In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag, ne_chunk
import json

# Fetch every NLTK resource this notebook relies on. nltk.download is called
# once per resource name, in the order listed.
for _resource in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(_resource)

# Function to tokenize and chunk text
def tokenize_and_chunk(text, skip_lines=7):
    """Split *text* into sentences and run NLTK named-entity chunking on each.

    The first ``skip_lines`` lines of *text* are discarded before processing
    (7 by default, which is what this function previously hard-coded).

    Args:
        text: Raw article text. ``None``, an empty string, or text that is
            empty/whitespace-only once the leading lines are removed yields
            an empty list without calling into NLTK.
        skip_lines: Number of leading lines to drop; must be >= 0.

    Returns:
        A list with one ``ne_chunk`` result per sentence, in sentence order.

    Raises:
        ValueError: If ``skip_lines`` is negative (a negative slice start
            would silently keep only the tail of the text instead).
    """
    if skip_lines < 0:
        raise ValueError("skip_lines must be non-negative")
    if not text:
        return []

    # Drop the leading lines and rebuild the remaining text as one string
    body = '\n'.join(text.split('\n')[skip_lines:])
    if not body.strip():
        # Nothing left to tokenize
        return []

    # Sentence split -> word tokenize -> POS tag -> named-entity chunk
    sentences = sent_tokenize(body)
    tagged_sentences = [pos_tag(word_tokenize(sentence)) for sentence in sentences]
    return [ne_chunk(tagged_sentence) for tagged_sentence in tagged_sentences]

# Load JSON file
# Reset `data` first: in a notebook a value left over from an earlier run
# would otherwise make a failed load look successful further down.
data = None
try:
    # JSON is UTF-8 by specification, so do not depend on the locale default
    with open('/content/drive/MyDrive/articles_data (1).json', 'r', encoding='utf-8') as f:
        data = json.load(f)
    print("JSON file loaded successfully.")
except FileNotFoundError:
    print("File not found. Please make sure the file exists and try again.")
except json.JSONDecodeError as e:
    print("Error decoding JSON file:", e)
    # Handle the error as needed

# Tokenize and chunk each article and save to a new file.
# The output is plain text despite its .json name: an "Article ID: <title>"
# header line, the string form of each chunked sentence, then a blank line.
if data is not None:
    with open('/content/drive/MyDrive/chunked_articles.json', 'w', encoding='utf-8') as f:
        for article in data:
            article_id = article['Title']
            article_text = article['Article Content']
            chunked_article = tokenize_and_chunk(article_text)
            f.write(f'Article ID: {article_id}\n')
            for sentence_tree in chunked_article:
                f.write(str(sentence_tree) + '\n')
            f.write('\n')
else:
    print("No data loaded due to errors. Please check the JSON file and try again.")


In [None]:
pip install tensorflow


In [None]:
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Read the chunked-articles file as raw text and define where the model is saved
json_file_path = '/content/drive/MyDrive/chunked_articles.json'
model_save_path = '/content/drive/MyDrive/tensorflow_lstm_model'

try:
    with open(json_file_path, 'r', encoding='utf-8') as f:
        # The whole file is kept as a single string
        data = f.read()
        print("JSON file loaded successfully.")
        print("Data:", data)  # Print the loaded data
except FileNotFoundError:
    print(f"File '{json_file_path}' not found. Please make sure the file exists and try again.")
    # Raise instead of calling exit(): this stops execution right here, so
    # the code below never runs with `data` undefined, and it signals failure
    # with a non-zero status when run as a script.
    raise SystemExit(1)

# Process the data: group the non-empty lines of `data` by article.
# Each "Article ID:" line starts a new article; the header line itself is not
# stored, only the stripped content lines that follow it.
article_contents = []
article_content = []  # Lines collected for the article currently being read
# `data` is one string, so it must be split into lines first; iterating the
# string directly yields single characters and the header is never matched.
for line in data.splitlines():
    if line.startswith("Article ID:"):
        # Start of a new article: flush what was collected for the previous one
        if article_content:
            article_contents.append(article_content)
            article_content = []  # Reset article content for the new article
    elif line.strip():  # Non-empty line
        article_content.append(line.strip())  # Add tokenized/chunked content

# Append the last article's content if present
if article_content:
    article_contents.append(article_content)

# Hyperparameters for the tf.keras model (placeholder / example values)
vocab_size = 10000  # Placeholder value
embedding_dim = 128  # Placeholder value
max_seq_length = 100  # Placeholder value
hidden_size = 64  # Example hidden size
num_classes = 2  # Example number of classes

# Assemble the network one layer at a time: embedding -> LSTM -> softmax output
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_seq_length))
model.add(tf.keras.layers.LSTM(units=hidden_size))
model.add(tf.keras.layers.Dense(units=num_classes, activation='softmax'))

# Configure optimizer, loss and the reported metric
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Make sure the parent directory of the save path exists, then write the
# model to disk (no training happens in this cell).
# NOTE(review): newer Keras releases may require a `.keras` or `.h5` suffix on
# the save path - confirm against the installed TensorFlow version.
save_dir = os.path.dirname(model_save_path)
os.makedirs(save_dir, exist_ok=True)
model.save(model_save_path)
