In [1]:
import streamlit as st
import pandas as pd

# 1. App Heading
st.title("Topic Modelling Dashboard")

# 2. File Upload Option
uploaded_file = st.file_uploader("Upload text file", type=["txt"])
if uploaded_file is not None:
    article_text = uploaded_file.read().decode("utf-8")
else:
    article_text = "No file uploaded yet."

# 3. Keyword Extraction Subheading
st.subheader("Keyword Extraction")

# 4. Top N Entry Box
top_n_keywords = st.number_input("Top N", min_value=1, max_value=100, value=10, step=1)

# 5. Model Dropdown for Keyword Extraction
keyword_model = st.selectbox("Model", ["TF-IDF", "YAKE", "Dummy Model 1"])

# 6. Scrollable Text Box for Keyword Extraction Results
st.text_area("Keywords Extracted", value="Results will be displayed here...", height=150, max_chars=None, key=None)

# 7. Topic Prediction Subheading
st.subheader("Topic Prediction")

# 8. Top N Entry Box for Topic Prediction
top_n_topics = st.number_input("Top N Topics", min_value=1, max_value=100, value=10, step=1)

# 9. Model Dropdown for Topic Prediction
topic_model = st.selectbox("Model", ["LDA", "NMF", "Dummy Model 2"])

# 10. Scrollable Text Box for Topic Prediction Results
st.text_area("Topics Predicted", value="Results will be displayed here...", height=150, max_chars=None, key=None)

# 11. Domain Selection
st.subheader("Select Domain")
domain_selected = st.selectbox("Domain", ["forbes.com", "bbc.com", "cnn.com"])

# 12. Bar Graph of Topics
st.subheader("Bar Graph of Topics")
st.bar_chart(pd.DataFrame({
    'Topics': [5, 7, 8, 10],
    'Count': [3, 1, 2, 1]
}))

# 13. Bar Graph of Sentiments
st.subheader("Bar Graph of Sentiments")
st.bar_chart(pd.DataFrame({
    'Sentiments': ['Positive', 'Negative', 'Neutral'],
    'Count': [3, 1, 1]
}))


2024-08-31 12:37:53.024 
  command:

    streamlit run c:\Users\otto\anaconda3\envs\myenv\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2024-08-31 12:37:53.059 Session state does not function when running a script without `streamlit run`


DeltaGenerator()

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from Preprocess import LoadModels
from Preprocess.Preprocess import  preprocess_text_lemmatize
from sklearn.feature_extraction.text import TfidfVectorizer


In [6]:
text = "This is an example text from which we want to extract important keywords using the TF-IDF vectorizer."
m_path = '../Models/keyword_extractor_tfidf_vectorizer.joblib'

In [19]:
extractor = LoadModels.TfidfExtractor(m_path)

In [20]:
cleaned_text = preprocess_text_lemmatize(text)

In [None]:
keyword_model = 'TF_IDF'

In [None]:
if keyword_model == "YAKE":
    m_path = '../Models/yake_params.json'
    extractor = LoadModels.YAKEExtractor(m_path)
    pass
else:
    m_path = '../Models/keyword_extractor_tfidf_vectorizer.joblib'
    extractor = LoadModels.TfidfExtractor()



In [24]:
keywords = extractor.extract_keywords(cleaned_text)

In [25]:
ktx = "\n".join([f"{i+1}. {v.title()}" for i,v in enumerate(keywords)])

In [38]:
nmf_vector_path = '../Models/nmf_tfidf_vectorizer.joblib'
nmf_model_path = '../Models/nmf_model.joblib'

# paths
lsi_model_path = '../Models/lsi_model.gensim'
lsi_vector_path = '../Models/lsi_dictionary.gensim'

lda_model_path = '../Models/lda_model.joblib'
lda_vector_path =  '../Models/lda_tfidf_vectorizer.joblib'

chosen_model = 'NMF'

if chosen_model == 'LDA':
    topic_model_path = lda_model_path
    topic_vector_path = lda_vector_path
    topic_model_name = 'lda'

elif chosen_model == 'LSI':
    topic_model_path = lsi_model_path
    topic_vector_path = lda_vector_path
    topic_model_name = 'lsi'

elif chosen_model == 'NMF':
    topic_model_path = nmf_model_path
    topic_vector_path = nmf_vector_path
    topic_model_name = 'nmf'

In [32]:
from sklearn.decomposition import LatentDirichletAllocation

In [42]:
import joblib

In [43]:
lda_model = joblib.load(lda_model_path)

In [44]:
if isinstance(lda_model, LatentDirichletAllocation):
    print("Model loaded successfully!")
else:
    print("Error: Loaded model is not an LDA model.")
    

Model loaded successfully!


In [45]:
lda_model.components_

AttributeError: 'LatentDirichletAllocation' object has no attribute 'components_'

In [39]:
tp_model = LoadModels.TopicModelPredictor(topic_model_path, topic_vector_path, topic_model_name, no_top_words=10)

In [40]:
tp_model.predict(cleaned_text)

'like, one, go, make, say, time, work, even, look, get'

In [52]:
import os
import pandas as pd
import psycopg2
from dotenv import load_dotenv
import streamlit as st
import plotly.express as px

# Load environment variables from .env file
load_dotenv()

# Get PostgreSQL credentials from environment variables
user = os.getenv("POSTGRES_USER")
password = os.getenv("POSTGRES_PASSWORD")
host = os.getenv("POSTGRES_HOST")
database = os.getenv("POSTGRES_DB")

# Connect to PostgreSQL Database and retrieve all data
def get_data_from_db(query):
    conn = psycopg2.connect(
        host=host,
        database=database,
        user=user,
        password=password
    )
    df = pd.read_sql_query(query, conn)
    conn.close()
    return df

# Query to get all data from the table
query_all_data = "SELECT * FROM ml_features;"
df = get_data_from_db(query_all_data)

# Step 1: Create select box for domains
unique_domains = df['domain'].unique()
selected_domain = st.selectbox("Select domain", unique_domains)

# Step 2: Filter data based on the selected domain
filtered_df = df[df['domain'] == selected_domain]

# Step 3: Create and display the bar chart of tags distribution
# st.header(f"Tags Distribution for Domain: {selected_domain}")

tag_counts = filtered_df['tags'].str.split(',', expand=True).stack().value_counts()
fig_tags = px.bar(tag_counts, x=tag_counts.index, y=tag_counts.values, labels={'x': 'Tags', 'y': 'Count'})
# st.plotly_chart(fig_tags)

# Step 4: Create and display the bar chart of LDA topics number distribution
# st.header(f"LDA Topics Number Distribution for Domain: {selected_domain}")

lda_counts = filtered_df['lda_topics_n'].value_counts()
fig_lda = px.bar(lda_counts, x=lda_counts.index, y=lda_counts.values, labels={'x': 'LDA Topics Number', 'y': 'Count'})
# st.plotly_chart(fig_lda)

# Step 5: Display unique LDA topics numbers
# st.header(f"Unique LDA Topics Numbers for Domain: {selected_domain}")

ndf = pd.read_csv('../data/lda_pair.csv')
# unique_lda_topics = filtered_df[['lda_topics' ,'lda_topics_n']].unique()
# st.write(f"Unique  Topics Numbers: {unique_lda_topics}")



pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.



In [53]:
ndf

Unnamed: 0,lda_topics,lda_topics_n
0,"film, cup, game, world, team, match, play, win...",19
1,"climat, studi, said, research, water, use, sci...",16
2,"gaza, israel, isra, palestinian, hama, said, a...",4
3,"said, polic, court, arrest, investig, case, go...",9
4,"ai, technolog, compani, innov, develop, custom...",11
5,"nigeria, nigerian, africa, african, lago, stat...",13
6,"ukrain, russia, russian, china, said, countri,...",3
7,"googl, appl, use, featur, deal, amazon, user, ...",5
8,"said, peopl, say, work, like, one, go, time, w...",0
9,"stock, ratio, averag, trade, share, compani, r...",6
