In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from gensim import corpora, models
from gensim.utils import simple_preprocess
import spacy
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
data=pd.read_csv("/content/drive/MyDrive/all note books/descriptive-statistics.csv")

In [13]:
data=data.drop(columns=["Unnamed: 0","year",	"month",	"day",	"day_of_week",	"time"])

In [14]:
data

Unnamed: 0,headline,url,publisher,date,stock,headline-length
0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54,A,7
1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20,A,7
2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07,A,5
3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06,A,7
4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59,A,14
...,...,...,...,...,...,...
1407323,Top Narrow Based Indexes For August 29,https://www.benzinga.com/news/11/08/1888782/to...,Monica Gerson,2011-08-29 00:00:00,ZX,7
1407324,Recap: Wednesday's Top Percentage Gainers and ...,https://www.benzinga.com/news/earnings/11/06/1...,Benjamin Lee,2011-06-22 00:00:00,ZX,7
1407325,UPDATE: Oppenheimer Color on China Zenix Auto ...,https://www.benzinga.com/analyst-ratings/analy...,BenzingaStaffL,2011-06-21 00:00:00,ZX,8
1407326,Oppenheimer Initiates China Zenix At Outperfor...,https://www.benzinga.com/analyst-ratings/price...,Joe Young,2011-06-21 00:00:00,ZX,8


In [15]:
def get_sentiment(text):
    analysis = TextBlob(text)
    # Polarity is between -1 (negative) and 1 (positive)
    return analysis.sentiment.polarity

In [16]:
data['sentiment_polarity'] = data['headline'].apply(get_sentiment)

In [17]:
def classify_sentiment(polarity):
    if polarity > 0:
        return 'Positive'
    elif polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'

In [18]:
data['sentiment'] = data['sentiment_polarity'].apply(classify_sentiment)

In [19]:
data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
Neutral,934914
Positive,341178
Negative,131236


In [20]:
data

Unnamed: 0,headline,url,publisher,date,stock,headline-length,sentiment_polarity,sentiment
0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54,A,7,0.00,Neutral
1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20,A,7,0.00,Neutral
2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07,A,5,0.00,Neutral
3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06,A,7,0.00,Neutral
4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59,A,14,0.00,Neutral
...,...,...,...,...,...,...,...,...
1407323,Top Narrow Based Indexes For August 29,https://www.benzinga.com/news/11/08/1888782/to...,Monica Gerson,2011-08-29 00:00:00,ZX,7,0.15,Positive
1407324,Recap: Wednesday's Top Percentage Gainers and ...,https://www.benzinga.com/news/earnings/11/06/1...,Benjamin Lee,2011-06-22 00:00:00,ZX,7,0.15,Positive
1407325,UPDATE: Oppenheimer Color on China Zenix Auto ...,https://www.benzinga.com/analyst-ratings/analy...,BenzingaStaffL,2011-06-21 00:00:00,ZX,8,0.00,Neutral
1407326,Oppenheimer Initiates China Zenix At Outperfor...,https://www.benzinga.com/analyst-ratings/price...,Joe Young,2011-06-21 00:00:00,ZX,8,0.00,Neutral


In [23]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [24]:
def preprocess(texts):
    return [[token.lemma_ for token in doc if token.lemma_ not in spacy.lang.en.stop_words.STOP_WORDS and token.lemma_.isalpha()] for doc in nlp.pipe(texts, batch_size=500, n_process=-1)]

In [27]:
# Batch processing of data
batch_size = 1000 # Adjust batch size depending on memory
processed_headlines = []


In [28]:
for i in range(0, len(data), batch_size):
    batch = data['headline'][i:i + batch_size]
    processed_batch = preprocess(batch)
    processed_headlines.extend(processed_batch)

In [30]:
data['processed_headline'] = processed_headlines

In [31]:
dictionary = corpora.Dictionary(data['processed_headline'])


In [32]:
# Create a dictionary from the processed headlines
dictionary = corpora.Dictionary(data['processed_headline'])


In [33]:
# Optionally filter out extremes to reduce the size of the dictionary
dictionary.filter_extremes(no_below=10, no_above=0.5)

In [34]:
# Create a bag-of-words corpus
corpus = [dictionary.doc2bow(text) for text in data['processed_headline']]

In [35]:
# Convert processed headlines back to text
processed_texts = [' '.join(text) for text in data['processed_headline']]


In [36]:
# Vectorize the text
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X = vectorizer.fit_transform(processed_texts)


In [37]:
# Initialize LDA model
lda_model = LatentDirichletAllocation(n_components=5, max_iter=10, learning_method='batch', random_state=0, n_jobs=-1)
lda_model.fit(X)

In [46]:
n_top_words = 10

In [47]:
feature_names = vectorizer.get_feature_names_out()

In [48]:
topics = []

In [49]:
for topic_idx, topic in enumerate(lda_model.components_):
    # Extract the top words for this topic
    top_words = ",".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])

    # Print the topic index and top words
    print(f"Topic {topic_idx + 1}: {top_words}")

    # Append the topic and top words to the list
    topics.append(f"Topic {topic_idx + 1}: {top_words}")

Topic 1: share,announce,new,announces,deal,reports,update,report,offering,ceo
Topic 2: stock,market,week,share,trade,earning,session,mid,day,company
Topic 3: pt,buy,raises,target,price,initiates,neutral,coverage,maintains,lowers
Topic 4: est,vs,eps,reports,sales,sale,ep,revenue,adj,analyst
Topic 5: benzinga,market,morgan,estimate,stock,upgrades,beat,morning,downgrades,update


In [50]:
df_topics = pd.DataFrame({'Topic': [f'Topic {i+1}' for i in range(len(topics))],
                          'Top Words': [topic.split(': ')[1] for topic in topics]})


In [51]:
df_topics

Unnamed: 0,Topic,Top Words
0,Topic 1,"share,announce,new,announces,deal,reports,upda..."
1,Topic 2,"stock,market,week,share,trade,earning,session,..."
2,Topic 3,"pt,buy,raises,target,price,initiates,neutral,c..."
3,Topic 4,"est,vs,eps,reports,sales,sale,ep,revenue,adj,a..."
4,Topic 5,"benzinga,market,morgan,estimate,stock,upgrades..."


In [52]:
data

Unnamed: 0,headline,url,publisher,date,stock,headline-length,sentiment_polarity,sentiment,processed_headline
0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54,A,7,0.00,Neutral,"[stock, hit, week, Highs, Friday]"
1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20,A,7,0.00,Neutral,"[stock, hit, week, Highs, Wednesday]"
2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07,A,5,0.00,Neutral,"[big, Movers, Friday]"
3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06,A,7,0.00,Neutral,"[stock, Friday, Mid, Day, session]"
4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59,A,14,0.00,Neutral,"[b, Securities, Maintains, Neutral, Agilent, T..."
...,...,...,...,...,...,...,...,...,...
1407323,Top Narrow Based Indexes For August 29,https://www.benzinga.com/news/11/08/1888782/to...,Monica Gerson,2011-08-29 00:00:00,ZX,7,0.15,Positive,"[Top, Narrow, Based, Indexes, August]"
1407324,Recap: Wednesday's Top Percentage Gainers and ...,https://www.benzinga.com/news/earnings/11/06/1...,Benjamin Lee,2011-06-22 00:00:00,ZX,7,0.15,Positive,"[recap, Wednesday, Top, Percentage, Gainers, L..."
1407325,UPDATE: Oppenheimer Color on China Zenix Auto ...,https://www.benzinga.com/analyst-ratings/analy...,BenzingaStaffL,2011-06-21 00:00:00,ZX,8,0.00,Neutral,"[update, Oppenheimer, Color, China, Zenix, Aut..."
1407326,Oppenheimer Initiates China Zenix At Outperfor...,https://www.benzinga.com/analyst-ratings/price...,Joe Young,2011-06-21 00:00:00,ZX,8,0.00,Neutral,"[Oppenheimer, Initiates, China, Zenix, Outperf..."


In [53]:
data.to_csv("topic-model.csv")