### Load libraries

- This notebook can be run in two modes, depending on whether the topic model has already been trained or still needs to be trained.
- Enter `Y` at the prompt if the model is already trained and saved, or `N` if it still needs to be trained.

In [21]:
from collections import Counter
from bertopic import BERTopic
from nltk.corpus import stopwords
import plotly.express as px
from skimage import io
from tqdm import tqdm
import altair as alt
import numpy as np
import pandas as pd
import torch
import nltk
import re
import os
from IPython.display import display, HTML
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
# English stop-word list from NLTK's `stopwords` corpus.
# NOTE(review): this name is not referenced again in the visible cells — the
# preprocessing cell passes `text_preprocessing.stop_words` instead; confirm it is needed.
stop_words = stopwords.words('english')

import warnings
# warnings.filterwarnings('ignore')
# Suppress only the warnings whose message matches the "'nopython' keyword" pattern,
# rather than silencing every warning (see the disabled blanket filter above).
warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*")
# Seed handed to `text_preprocessing.split_data` when the train/test split is made.
seed=699

# Load user-defined utilities: make ../../src/utils importable, then import the
# project's helper modules from it.
import sys
sys.path.append('../../src/utils')
import topic_exploration, text_preprocessing

#### Define Paths to be Used

In [5]:
# Central registry of the folders this notebook reads from / writes to, plus the
# stock tickers under study.
# TOPIC_IDS were identified by manually examining the topics related to the listed tickers.
# NOTE(review): TICKERS and TOPIC_IDS look like parallel lists (one topic id per ticker) — confirm.
# NOTE(review): RAW_DATA is the only path without the '../../' prefix — confirm this is intended.
data_paths = dict(
    RAW_DATA='datasets/rawdata/market_data/',
    ARTICLES_PATH='../../datasets/articles/',
    SENTIMENT_PATH='../../datasets/sentiment_scores/',
    TOPIC_PATH='../../datasets/topic_labels/',
    TRAINED_MODEL_PATH='../../trained_models/',
    VISUALIZATION_PATH='../../visualizations/',
    TICKERS=['EIHOTEL.BO', 'ELGIEQUIP.BO', 'IPCALAB.BO', 'PGHL.BO', 'TV18BRDCST.BO'],
    TOPIC_IDS=[33, 921, 495, 495, 385],
)

### Choose Run Type - Model Training/Use Saved Model

In [1]:
# Enter 'Y' if the model is already trained/saved at a path, otherwise enter 'N'.
# The answer is normalised (surrounding whitespace stripped, upper-cased) because the
# later cells compare it against the exact string 'Y'; without this, typing 'y' or
# 'Y ' would silently be treated as "model needs training".
model_trained = input("Is the topic model already trained and saved? (Y/N): ").strip().upper()

Y


## Load Data

In [2]:
# The article corpus is huge (~1.5 mn news articles), so training the topic model on
# all of it is not feasible. Per the original author's note, the model is trained on 10%
# of the articles, with 10% taken from every month of data to keep the training set diverse.
# This training set is what the BERTopic model is fitted on; topic inference is done from it.
#
# The split is loaded in BOTH run modes: later cells (clean-text preview, `texts`,
# topic-term matrix) read `train_df` even when a saved model is used, so guarding this
# load behind `model_trained != 'Y'` left `train_df` undefined on a fresh kernel.
# NOTE(review): assumes `split_data` with the same seed reproduces the split the saved
# model was trained on — confirm.
train_df, test_df = text_preprocessing.split_data(data_paths['ARTICLES_PATH'], 10, seed)

### Pre-processing Text

In [22]:
# %%time
# Add the `clean_text` column by running the project's text preprocessor over every
# article (flags request lemmatisation, no stemming, and the project's stop-word list).
# The column is built whenever it is missing rather than only in training mode:
# later cells read `train_df['clean_text']` regardless of the run type, and checking for
# the column also makes the cell safe to re-run without repeating the work.
if 'clean_text' not in train_df.columns:
    train_df['clean_text'] = train_df['article'].apply(
        lambda text: text_preprocessing.preprocess_text(
            text,
            flg_stemm=False,
            flg_lemm=True,
            lst_stopwords=text_preprocessing.stop_words,
        )
    )

train_df.head()

Unnamed: 0,id,date,article,month,clean_text
0,851.0,2008-01-27,The central bank’s tight monetary policy may h...,1.0,central tight monetary policy may helped reini...
1,101.0,2008-01-01,Goldman Sachs Group is cutting more than 30 ba...,1.0,goldman sachs group cutting banking job two so...
2,1499.0,2008-01-30,NEW DELHI: India’s global hunt for energy coul...,1.0,new global hunt energy could take new turn aus...
3,2162.0,2008-01-31,MUMBAI: A petition was filed in the Bombay Hig...,1.0,petition filed bombay high court today seeking...
4,18.0,2008-01-07,Goldman Sachs Group is cutting more than 30 ba...,1.0,goldman sachs group cutting banking job two so...


### Train model for 10% articles

### Create Model

In [23]:
# Collect the cleaned article texts as a plain Python list for BERTopic.
# (No subsetting happens here: every row of train_df is used.)
clean_text_column = train_df['clean_text']
texts = list(clean_text_column.values)

In [23]:
%%time
if model_trained != 'Y':
    topic_model = BERTopic(language='english', calculate_probabilities=False,nr_topics=100,
                               verbose=True)
    topics, probs = topic_model.fit_transform(texts)

Batches:   0%|          | 0/1250 [00:00<?, ?it/s]

2023-07-11 07:10:08,855 - BERTopic - Transformed documents to Embeddings
2023-07-11 07:10:42,378 - BERTopic - Reduced dimensionality
2023-07-11 07:10:45,221 - BERTopic - Clustered reduced embeddings
2023-07-11 07:11:00,677 - BERTopic - Reduced number of topics from 391 to 100


CPU times: user 53min 56s, sys: 16min 53s, total: 1h 10min 49s
Wall time: 9min 28s


### Load Trained Model for Future Runs

In [4]:
# Location of the persisted model: <TRAINED_MODEL_PATH><modelname>
modelname = 'bertopic_model_10pc'
modelpath = f"{data_paths['TRAINED_MODEL_PATH']}{modelname}"

# When a saved model is being reused, load it and take the topic assignments and
# probabilities stored on the model object.
if model_trained == 'Y':
    topic_model = BERTopic.load(modelpath)
    topics, probs = topic_model.topics_, topic_model.probabilities_

In [64]:
# Search the fitted model for the 10 topics closest to a free-text query
search_term = 'news channel'
similar_topics, similarity = topic_model.find_topics(search_term, top_n=10)
print(similar_topics)

# Show the terms of the first topic returned by the search
first_match = similar_topics[0]
topic_model.get_topic(first_match)

[385, 1110, 639, 814, 1067, 294, 1004, 696, 621, 3]


[('channel', 0.04548184672710256),
 ('broadcasting', 0.03970476662396199),
 ('radio', 0.03651455621207887),
 ('broadcaster', 0.02683434158919634),
 ('broadcast', 0.0189560507953621),
 ('doordarshan', 0.01693926097025456),
 ('cable', 0.016867933980789743),
 ('trai', 0.015378753980581012),
 ('advertisement', 0.014428615068312375),
 ('news', 0.012570857188706805)]

In [14]:
# Get each topic id together with its corresponding top topic terms
# (built by the project helper `topic_exploration.topic_top_term`).
top_term_df = topic_exploration.topic_top_term(topic_model)
top_term_df

Unnamed: 0,topic_id,top_terms
0,-1,-1_bjp congress minister party government bank...
1,0,0_gas oil petrol diesel litre ongc petroleum l...
2,1,1_drug generic pharmaceutical pharma patent ra...
3,2,2_earnings going valuation probably look kind ...
4,3,3_chat ashwani mitesh deepak short kind stock ...
...,...,...
1187,1186,1186_transmission grid power tallest pgcil dis...
1188,1187,1187_poddar nooyi ambani family siyaram aspen ...
1189,1188,1188_rane chavan adarsh vilasrao congress nara...
1190,1189,1189_qualcomm startup design vakrangee chunaut...


### Topic-Term Matrix

In [16]:
# Row count of the topic-info table, minus one.
# NOTE(review): presumably the "- 1" excludes the outlier topic (id -1) — confirm.
num_topics = topic_model.get_topic_info().shape[0] - 1
# Consecutive topic ids 0 .. num_topics - 1
topic_list = [*range(num_topics)]

In [24]:
# Build the per-topic table (topic id, top terms, representative docs) with the project
# helper and keep a CSV copy of it in the current working directory.
dom_topic_doc_df = topic_exploration.get_topic_term_matrix(topic_model, train_df)
dom_topic_doc_df.to_csv('dom_topic_doc_df_whole.csv')
# The stray `dom_topic_doc_df.iloc[11, :].rep_docs` expression was removed: it was not the
# last expression of the cell, so its value was never displayed, and it could only
# raise IndexError on tables with fewer than 12 rows.
# display(HTML(dom_topic_doc_df.to_html()))

dom_topic_doc_df.head()

Unnamed: 0,topic_id,top_terms,rep_docs
0,0,"[gas, oil, petrol, diesel, litre, ongc, petrol...",petrol diesel price hiked sixth consecutive da...
1,1,"[drug, generic, pharmaceutical, pharma, patent...",winning tender last month supply olanzapine pu...
2,2,"[earnings, going, valuation, probably, look, k...",interview krishna eastspring say bullish gone ...
3,3,"[chat, ashwani, mitesh, deepak, short, kind, s...",chat mitesh technical share view view next mit...
4,4,"[insurance, insurer, premium, life, cover, ird...",new sale group insurance policy grew first two...


### Topic Distribution over documents

In [25]:
# Count how many documents were assigned to each topic id, ordered by topic id
topic_counts = Counter(topics)
topic_dist_df = (
    pd.DataFrame(topic_counts.items(), columns=['topic_id', 'Count'])
    .sort_values(by='topic_id')
)

# Attach each topic's top-term label; topics missing from top_term_df are dropped (inner join)
freq = (
    topic_dist_df
    .merge(top_term_df, on='topic_id', how='inner')
    .sort_values(by='topic_id')
)
freq.head()

Unnamed: 0,topic_id,Count,top_terms
0,-1,57913,-1_bjp congress minister party government bank...
1,0,1560,0_gas oil petrol diesel litre ongc petroleum l...
2,1,1229,1_drug generic pharmaceutical pharma patent ra...
3,2,1046,2_earnings going valuation probably look kind ...
4,3,925,3_chat ashwani mitesh deepak short kind stock ...


In [29]:
import plotly.express as px

In [33]:
# Bar chart of article counts per topic for rows 1-49 of `freq`
# (row 0, the first row after sorting by topic id, is skipped).
x_axis = alt.X('top_terms:N', sort='-y', axis=alt.Axis(title='Topic ID'))
y_axis = alt.Y('Count:Q', axis=alt.Axis(title='Number of Articles'))

base_chart = alt.Chart(freq.iloc[1:50])
bar = base_chart.mark_bar().encode(x=x_axis, y=y_axis)

# Size/title the chart, then apply the title and axis styling; the final expression
# is what the cell renders.
sized_chart = bar.properties(
    width=800,
    height=300,
    title='Topics vs Number of Article Distribution',
)
titled_chart = sized_chart.configure_title(fontSize=25)
titled_chart.configure_axis(
    grid=False,
    domain=False,
    labelFontSize=15,
    titleFontSize=5,
)

## Compute Hierarchical Topics

In [34]:
# Compute the topic hierarchy from the fitted model and the training texts.
# NOTE(review): in the visible cells the result is only referenced by the
# commented-out `get_topic_tree` call — confirm it is still needed.
hierarchical_topics = topic_model.hierarchical_topics(texts)

100%|██████████| 1190/1190 [00:22<00:00, 52.36it/s]


## Visualize Topic Distribution

In [35]:
# Build the topic visualization restricted to the top 30 topics, then render it
topic_map_fig = topic_model.visualize_topics(top_n_topics=30)
topic_map_fig.show()

### Visualize docs

In [37]:
# `visualize_distribution` expects the topic-probability vector of ONE document
# (1-D, one entry per topic). The previous `probs[:50]` sliced over documents instead:
#   * if `probs` is 2-D (calculate_probabilities=True) the slice is still 2-D, which
#     is not a valid input for this plot;
#   * if `probs` is 1-D (calculate_probabilities=False, as in the training cell) it
#     holds one assigned-topic probability per document, so the slice mislabels the
#     first 50 documents' values as the probabilities of topics 0-49.
doc_index = 0  # which training document to inspect
if probs is not None and np.ndim(probs) == 2:
    # Full per-topic probabilities are available: take this document's row.
    doc_topic_probs = np.asarray(probs)[doc_index]
else:
    # No full probability matrix: approximate this document's topic distribution.
    topic_distr, _ = topic_model.approximate_distribution([texts[doc_index]])
    doc_topic_probs = topic_distr[0]

topic_model.visualize_distribution(doc_topic_probs, min_probability=0.05)

In [70]:
# tree = topic_model.get_topic_tree(hierarchical_topics)
# print(tree)

In [27]:
# topic_model.visualize_hierarchy(top_n_topics=30)
# Show the `nr_topics` attribute of the model currently in memory.
topic_model.nr_topics

None


In [63]:
# How the PNG shown below was originally produced (kept for reference):
# fig = topic_model.visualize_barchart(top_n_topics=30)
# fig.show()
# fig.write_image(data_paths['VISUALIZATION_PATH'] + 'topn_30_topics.png')

# Read the exported bar-chart image from disk and display it as a figure
top30_image_path = data_paths['VISUALIZATION_PATH'] + 'topn_30_topics.png'
ticker_topics = io.imread(top30_image_path)
fig = px.imshow(ticker_topics)
fig.show()

In [60]:
# How the ticker-topic chart was originally generated (kept for reference):
# fig = topic_model.visualize_barchart(set(data_paths['TOPIC_IDS']))
# fig.show()
# fig.write_html(data_paths['VISUALIZATION_PATH'] + 'ticker_topics.html')
# NOTE(review): the reference code above exports an .html file while the cell below
# reads a .png — confirm how 'ticker_topics.png' is produced.

# The interactive plotly chart does not render in a static notebook, so the exported
# image of the ticker topics is read back from disk and displayed instead.
ticker_image_path = data_paths['VISUALIZATION_PATH'] + 'ticker_topics.png'
ticker_topics = io.imread(ticker_image_path)
fig = px.imshow(ticker_topics)
fig.show()

### Save the topic model for future use

In [17]:
# Persist the model only when it was trained in this run. When the model was loaded
# from `modelpath` (model_trained == 'Y'), saving again would just overwrite the stored
# model with itself, so the guard hinted at by the earlier commented-out code is restored.
# (The old commented `train_df.to_csv(processed_data_path)` referred to a path that is
# not defined in the visible cells, so it is not reinstated here.)
if model_trained != 'Y':
    topic_model.save(modelpath)

### Topic Search by Keywords

In [12]:
# Topic search by keyword: used to manually find the topics related to our stock tickers
search_term = 'pharma industry'
similar_topics, similarity = topic_model.find_topics(search_term, top_n=10)
print(similar_topics)

# Inspect the terms of the third topic returned by the search
third_match = similar_topics[2]
topic_model.get_topic(third_match)

[495, 1, 671, 312, 270, 1100, 1061, 582, 742, 67]


[('aurobindo', 0.04994131431380195),
 ('pharma', 0.031768088735685894),
 ('traded', 0.02127429959068986),
 ('ratio', 0.016901913096939433),
 ('stood', 0.01685636980925184),
 ('scrip', 0.013223115007591428),
 ('counter', 0.012103425762206061),
 ('pharmaceutical', 0.011616561217943593),
 ('financials', 0.01136822257169585),
 ('marksans', 0.010961196744251635)]