# Topic Modelling

In [2]:
import gdown
import pandas as pd

# Google Drive file ID
file_id = '11Ssx7kSIuxXlAWPrWRZsrED-5SYbJVX7'
# Output file path
destination = '/content/raw_analyst_ratings.csv'

# Download the file using gdown
gdown.download(f'https://drive.google.com/uc?id={file_id}', destination, quiet=False)

# Load the CSV file into a DataFrame
df = pd.read_csv(destination)
df.head()


Downloading...
From (original): https://drive.google.com/uc?id=11Ssx7kSIuxXlAWPrWRZsrED-5SYbJVX7
From (redirected): https://drive.google.com/uc?id=11Ssx7kSIuxXlAWPrWRZsrED-5SYbJVX7&confirm=t&uuid=850210d7-f247-47d2-872e-f72daa97beee
To: /content/raw_analyst_ratings.csv
100%|██████████| 328M/328M [00:04<00:00, 68.3MB/s]


Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54-04:00,A
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20-04:00,A
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07-04:00,A
3,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06-04:00,A
4,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59-04:00,A


In [3]:

from gensim import corpora, models
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# import pyLDAvis
headline_df=df.copy()
# Preprocessing
stop_words = set(stopwords.words('english'))

def preprocess(text):
    tokens = [word.lower() for word in text.split() if word.isalpha()]
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

# Tokenize headlines
headline_df['tokens'] = headline_df['headline'].apply(preprocess)

# Create dictionary and corpus
dictionary = corpora.Dictionary(headline_df['tokens'])
corpus = [dictionary.doc2bow(text) for text in headline_df['tokens']]

# Apply LDA
lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

for idx, topic in lda_model.print_topics(-1):
    words = [word.split("*")[1].replace('"', '').strip() for word in topic.split(" + ")]
    print(f'Topic {idx}:')
    print(", ".join(words))




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Topic 0:
stocks, top, benzinga, financials, watch, new, morgan, downgrades, western, hit
Topic 1:
market, stocks, industrials, session, moving, stock, biggest, morning, movers, zynga
Topic 2:
shares, sector, retail, trading, pro, staples, higher, us, tech, companies
Topic 3:
vs, eps, price, sales, earnings, reports, pt, etf, raises, maintains
Topic 4:
energy, utilities, check, announces, option, says, reports, industry, ceo, calls
