In [5]:
# !python -m ipykernel install --user --name=.venv
# !pip install keras
# !pip install tensorflow

In [6]:
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

In [15]:
def function_topic_modeling(url):
    # Step 1: Fetch the web page
    response = requests.get(url)
    html = response.text

    # Step 2: Parse the HTML
    soup = BeautifulSoup(html, 'html.parser')
    class_name = "content"
    research_div = soup.find('div', {'class': class_name})

    # Extract text from the <div> tag
    research_text = re.sub(r'<.*?>', '', str(research_div))
    
    # Preprocess and tokenize the research text
    remove_words = ['for', 'and', 'research', 'explores','group', 'focuses', 'of', 'the', 'to', 'in', 'as', 'to', 'this', 'is', 'not', 'we', 'a', 'his', 'on'
                    'impact', 'environment', 'environmental', 'pollution', 'includes', 'air', 'pollution', 'water', 'soil', 'are', 'indispensable', 'every', 'lakes', 'pollutant', 'untreated', 'thereby',
                     'law', 'energy', 'degradation', 'by', 'former', 'harmful', 'molecule', 'modify', 'into', 'electronic', 'such', 'dyes', 'design']
    
    research_text = ' '.join([word for word in re.findall(r'\w+', research_text.lower()) if word not in remove_words])

    return research_text

In [16]:
# Get preprocessed research areas as a single text
url = "https://facultyweb.kennesaw.edu/bbaruah/research.php"
research_text = function_topic_modeling(url)


In [17]:
# Tokenize and vectorize the text using Keras Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts([research_text])
research_sequences = tokenizer.texts_to_sequences([research_text])
vocab_size = len(tokenizer.word_index) + 1

In [18]:
# Create sequences of n-grams for input to LSTM
sequence_length = 10
sequences = []
for i in range(sequence_length, len(research_sequences[0])):
    sequences.append(research_sequences[0][i - sequence_length:i])

In [19]:
X = np.array(sequences)
y = X[:, -1]
X = X[:, :-1]

In [20]:
# Build the LSTM model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=sequence_length - 1))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [21]:
# Train the model
model.fit(X, y, epochs=50, verbose=2)

Epoch 1/50
14/14 - 7s - loss: 5.8401 - accuracy: 0.0023 - 7s/epoch - 514ms/step
Epoch 2/50
14/14 - 0s - loss: 5.8326 - accuracy: 0.0207 - 258ms/epoch - 18ms/step
Epoch 3/50
14/14 - 0s - loss: 5.8226 - accuracy: 0.0161 - 238ms/epoch - 17ms/step
Epoch 4/50
14/14 - 0s - loss: 5.7613 - accuracy: 0.0138 - 240ms/epoch - 17ms/step
Epoch 5/50
14/14 - 0s - loss: 5.4972 - accuracy: 0.0161 - 249ms/epoch - 18ms/step
Epoch 6/50
14/14 - 0s - loss: 5.1946 - accuracy: 0.0230 - 266ms/epoch - 19ms/step
Epoch 7/50
14/14 - 0s - loss: 4.9071 - accuracy: 0.0368 - 287ms/epoch - 20ms/step
Epoch 8/50
14/14 - 0s - loss: 4.6419 - accuracy: 0.0483 - 344ms/epoch - 25ms/step
Epoch 9/50
14/14 - 0s - loss: 4.4021 - accuracy: 0.0506 - 309ms/epoch - 22ms/step
Epoch 10/50
14/14 - 0s - loss: 4.1701 - accuracy: 0.0782 - 289ms/epoch - 21ms/step
Epoch 11/50
14/14 - 0s - loss: 3.9811 - accuracy: 0.1057 - 297ms/epoch - 21ms/step
Epoch 12/50
14/14 - 0s - loss: 3.7907 - accuracy: 0.1356 - 296ms/epoch - 21ms/step
Epoch 13/50
14/

<keras.src.callbacks.History at 0x1ea33003610>

In [22]:
# Generate topic predictions
num_topics = 5
num_words_per_topic = 10
topics = []

# Initial input sequence
current_input = X[0]

for _ in range(num_topics):
    topic = []
    for _ in range(num_words_per_topic):
        # Predict next word
        next_word = np.argmax(model.predict(np.array([current_input])))

        # Append the predicted word to the topic
        topic.append(list(tokenizer.word_index.keys())[list(tokenizer.word_index.values()).index(next_word)])

        # Append the predicted word to the current input and reshape
        current_input = np.append(current_input, next_word)
        current_input = current_input[1:]

    topics.append(topic)

# Print generated topics
print("Generated Topics:")
for idx, topic in enumerate(topics, start=1):
    print(f"Topic {idx}: {', '.join(topic)}")

Generated Topics:
Topic 1: impact, on, among, these, primarily, related, release, christopher, rivers, several
Topic 2: industries, food, drug, agyeman, rivers, tendency, michael, leather, cosmetics, extensively
Topic 3: use, properties, oxygen, optical, optical, james, discharge, due, kristen, household
Topic 4: conductivity, composite, material, sers, pose, primarily, technological, control, sub, craighead
Topic 5: develop, electrically, 3, transparency, woods, concentrations, raman, downer, jeremiah, kelsey
