# Text Analysis (Topic Modeling)

In [None]:
import pandas as pd
import re
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK resources used by word_tokenize and stopwords are present
# NOTE(review): recent NLTK releases may also require 'punkt_tab' for
# word_tokenize -- confirm against the installed version.
nltk.download(['punkt', 'stopwords'])

# Stop words: NLTK's English list combined with the extra words in
# financial_stopwords
financial_stopwords = {'said', 'will', 'new', 'company', 'firm', 'year', 'market'}
stop_words = set(stopwords.words('english')) | financial_stopwords

# Read the ratings CSV, parsing the 'date' column as datetimes
df = pd.read_csv('C:/raw_analyst_ratings/raw_analyst_ratings.csv', parse_dates=['date'])

def preprocess(text):
    """Turn one headline into lowercase, alphabetic, non-stop-word tokens.

    Args:
        text: Headline string. Any non-string value (for example ``None`` or
            the float NaN that stands in for a missing cell) yields an
            empty list instead of raising.

    Returns:
        list[str]: Tokens produced by ``word_tokenize`` on the lowercased,
        punctuation-stripped text, keeping only those that are purely
        alphabetic and not in the module-level ``stop_words`` set.
    """
    # re.sub raises TypeError on non-strings, so a missing headline is
    # treated as contributing no tokens.
    if not isinstance(text, str):
        return []
    # Strip punctuation other than "$" and "%". The isalpha() filter below
    # still discards every token containing a digit or symbol, so amounts
    # such as "$500M" do NOT survive into the result.
    text = re.sub(r'[^\w\s$%]', '', text)
    tokens = word_tokenize(text.lower())
    return [word for word in tokens if word.isalpha() and word not in stop_words]

# Keyword Extraction
# Missing headlines are skipped: NaN is not a string and cannot be tokenized.
all_words = [
    word
    for headline in df['headline'].dropna()
    for word in preprocess(headline)
]
word_freq = Counter(all_words)

# Financial Phrase Detection
# Each value is a regular expression searched case-insensitively in every
# headline. NOTE: the patterns are unanchored, so they also match inside
# longer words (e.g. 'eps' matches within 'steps').
financial_phrases = {
    'price_target': r'price target',
    'upgrade': r'upgraded|raise target',
    'downgrade': r'downgraded|cut target',
    'merger': r'merger|acquisition|m&a',
    'earnings': r'earnings|eps|results',
    'dividend': r'dividend|payout',
    'fda': r'fda|approval|rejection',
    'analyst': r'analyst|rating|initiates',
    'guidance': r'guidance|forecast|outlook',
    'short': r'short seller|short interest'
}

# Number of headlines matching each pattern; na=False makes a missing
# headline count explicitly as "no match".
phrase_counts = {}
for phrase_name, pattern in financial_phrases.items():
    phrase_counts[phrase_name] = (
        df['headline'].str.contains(pattern, case=False, na=False).sum()
    )

# Generate Analysis Table
# The keyword list and the phrase list are unrelated and need not have the
# same length, so each gets its own frame. They are then placed side by side;
# pd.concat pads the shorter one with NaN instead of raising ValueError.
top_keywords = word_freq.most_common(10)
keyword_table = pd.DataFrame(top_keywords, columns=['Top 10 Keywords', 'Frequency'])
phrase_table = pd.DataFrame({
    'Financial Phrases': list(financial_phrases.keys()),
    'Phrase Count': list(phrase_counts.values())
})
keyword_stats = pd.concat([keyword_table, phrase_table], axis=1)


def _style_table(table):
    """Return a Styler for *table*: left-aligned cells, coloured bold headers."""
    return (
        table.style
        .set_properties(**{'text-align': 'left'})
        .set_table_styles([{
            'selector': 'th',
            'props': [('background-color', "#584141"),
                      ('font-weight', 'bold')]
        }])
    )


# Display formatted tables
print("TOP KEYWORDS AND FINANCIAL PHRASES ANALYSIS")
print("------------------------------------------")

# Show the keyword frame itself so NaN padding rows never appear here.
display(_style_table(keyword_table.head(10)))

print("\nFINANCIAL PHRASE FREQUENCY")
print("-------------------------")

# Phrase names become the index; rows are ordered from most to least frequent.
phrase_frequency = pd.DataFrame.from_dict(
    phrase_counts, orient='index', columns=['Count']
).sort_values('Count', ascending=False)
display(_style_table(phrase_frequency))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


TOP KEYWORDS AND FINANCIAL PHRASES ANALYSIS
------------------------------------------


Unnamed: 0,Top 10 Keywords,Frequency
0,vs,162093
1,stocks,161868
2,est,140588
3,eps,128919
4,shares,114183
5,reports,108706
6,update,91680
7,earnings,87185
8,sales,79529
9,top,78649



FINANCIAL PHRASE FREQUENCY
-------------------------


Unnamed: 0,Count
earnings,231560
analyst,75613
price_target,47634
guidance,35430
merger,22396
dividend,21623
fda,15403
downgrade,3273
upgrade,3100
short,2958
