# Import Libraries

In [None]:
import sys
import os
from pathlib import Path

# Make the project root the working directory so that the relative data paths
# used in this notebook (e.g. "ai_population/data/...") resolve from it.
# NOTE(review): the root is a hard-coded absolute path on one developer's
# machine; it must be edited before the notebook is run anywhere else.
project_root = Path('/Users/raymondlow/Documents/talking-to-machines/ai-population').resolve()
os.chdir(project_root)
# Put the project root at the front of sys.path, unless it is already listed.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Set __package__ so that relative imports work.
__package__ = "ai_population.analysis"

import pandas as pd
import string
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Download the NLTK 'stopwords' and 'punkt_tab' resources
for nltk_resource in ('stopwords', 'punkt_tab'):
    nltk.download(nltk_resource)

# Lift the pandas display limits so every row and column is shown
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Path components that select the dataset folder under ai_population/data
EXECUTION_DATE = "ground-truth"
PROJECT_NAME = "market-signals-x"

# Import and Preprocess Data

In [None]:
def preprocess(text: str) -> str:
    """Return ``text`` lowercased with every ASCII punctuation character removed."""
    # Mapping a code point to None makes str.translate delete that character.
    punctuation_table = dict.fromkeys(map(ord, string.punctuation))
    return text.lower().translate(punctuation_table)


def combine_metadata_text(row: pd.Series) -> str:
    """Merge a post's text and hashtags into a single preprocessed string.

    The "text" field is used whole and the "hashtags" field is split on
    ", ". A field that is None or null is skipped. The collected pieces are
    joined with newlines and passed through ``preprocess``.

    Returns:
        The preprocessed combined text, or "" when both fields are missing.
    """
    def is_missing(value) -> bool:
        return value is None or pd.isnull(value)

    tweet_text, hashtags = row["text"], row["hashtags"]

    parts = []
    if not is_missing(tweet_text):
        parts.append(tweet_text)
    if not is_missing(hashtags):
        parts.extend(hashtags.split(", "))

    if not parts:
        return ""
    return preprocess("\n".join(parts))

In [None]:
# Split the ground-truth accounts into financial and non-financial influencers
data_dir = os.path.join("ai_population/data", PROJECT_NAME, EXECUTION_DATE)
ground_truth_profiles = pd.read_csv(os.path.join(data_dir, "ground_truth_profile_list.csv"))

is_finfluencer = ground_truth_profiles["finfluencer"] == "Yes"
ground_truth_finfluencers = ground_truth_profiles.loc[is_finfluencer, "account_id"].tolist()
print(f"Number of Financial Influencers: {len(ground_truth_finfluencers)}")

is_nonfinfluencer = ground_truth_profiles["finfluencer"] == "No"
ground_truth_nonfinfluencers = ground_truth_profiles.loc[is_nonfinfluencer, "account_id"].tolist()
print(f"Number of Non-Financial Influencers: {len(ground_truth_nonfinfluencers)}")

# Load all posts, then keep the posts written by each group of accounts
post_data = pd.read_csv(os.path.join(data_dir, "ground_truth_profile_posts.csv"))
posted_by_finfluencer = post_data["account_id"].isin(ground_truth_finfluencers)
posted_by_nonfinfluencer = post_data["account_id"].isin(ground_truth_nonfinfluencers)
finfluencer_post_data = post_data[posted_by_finfluencer].reset_index(drop=True)
nonfinfluencer_post_data = post_data[posted_by_nonfinfluencer].reset_index(drop=True)

print(finfluencer_post_data.shape)
print(nonfinfluencer_post_data.shape)
finfluencer_post_data.head()

In [None]:
# Add a 'combined_text' column, built row by row with combine_metadata_text,
# to both post tables
for post_frame in (finfluencer_post_data, nonfinfluencer_post_data):
    post_frame['combined_text'] = post_frame.apply(combine_metadata_text, axis=1)
finfluencer_post_data.head()

# TF-IDF Analysis with N-grams

In [None]:
def perform_tfidf_analysis(video_data: pd.DataFrame, top_n: int = 50) -> tuple:
    """Rank the uni-, bi- and trigrams of a corpus by aggregated TF-IDF score.

    Args:
        video_data: DataFrame whose 'combined_text' column holds one document
            per row.
        top_n: Maximum number of n-grams to return for each n-gram length.

    Returns:
        A 3-tuple ``(top_unigrams, top_bigrams, top_trigrams)``. Each element
        is a list of ``(ngram, score)`` pairs sorted by descending score,
        where the score is the n-gram's TF-IDF weight summed over all
        documents.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # English stop words are excluded from the vocabulary
    stop_words = stopwords.words('english')

    # Vectorize with n-grams of length 1 to 3
    vectorizer = TfidfVectorizer(stop_words=stop_words, ngram_range=(1, 3))
    tfidf_matrix = vectorizer.fit_transform(video_data['combined_text'])
    feature_names = vectorizer.get_feature_names_out()

    # Sum each n-gram's score across documents on the sparse matrix itself;
    # converting the documents x n-grams matrix to a dense array first can
    # exhaust memory on a large corpus.
    tfidf_sum = np.asarray(tfidf_matrix.sum(axis=0)).ravel()

    # Bucket the (ngram, score) pairs by n-gram length in a single pass
    ranked_by_length = {1: [], 2: [], 3: []}
    for keyword, score in zip(feature_names, tfidf_sum):
        bucket = ranked_by_length.get(len(keyword.split()))
        if bucket is not None:
            bucket.append((keyword, score))

    # Sort each bucket by descending aggregated score
    for bucket in ranked_by_length.values():
        bucket.sort(key=lambda pair: pair[1], reverse=True)

    top_unigrams = ranked_by_length[1][:top_n]
    top_bigrams = ranked_by_length[2][:top_n]
    top_trigrams = ranked_by_length[3][:top_n]

    return top_unigrams, top_bigrams, top_trigrams


def remove_overlapping_keywords(finfluencer_keywords: list, nonfinfluencer_keywords: list) -> list:
    """Drop finfluencer keywords that also occur among the non-finfluencer keywords.

    Args:
        finfluencer_keywords: ``(keyword, score)`` pairs for financial
            influencers.
        nonfinfluencer_keywords: ``(keyword, score)`` pairs for non-financial
            influencers.

    Returns:
        The finfluencer ``(keyword, score)`` pairs, in their original order,
        whose keyword does not appear in ``nonfinfluencer_keywords``.

    Side effects:
        Prints the set of overlapping keywords.
    """
    finfluencer_keywords_set = {kw for kw, _ in finfluencer_keywords}
    nonfinfluencer_keywords_set = {kw for kw, _ in nonfinfluencer_keywords}

    # Keywords present in both groups
    overlapping_keywords = finfluencer_keywords_set & nonfinfluencer_keywords_set
    print(f"Overlapping Keywords: {overlapping_keywords}")

    # Keep only the keywords unique to the finfluencer list
    return [(kw, score) for kw, score in finfluencer_keywords if kw not in overlapping_keywords]

In [None]:
# Rank the n-grams separately for each group of accounts
(top_finfluencer_unigrams,
 top_finfluencer_bigrams,
 top_finfluencer_trigrams) = perform_tfidf_analysis(finfluencer_post_data)
(top_nonfinfluencer_unigrams,
 top_nonfinfluencer_bigrams,
 top_nonfinfluencer_trigrams) = perform_tfidf_analysis(nonfinfluencer_post_data)

# Keep only the finfluencer n-grams that are absent from the matching
# non-finfluencer list
processed_unigrams = remove_overlapping_keywords(top_finfluencer_unigrams, top_nonfinfluencer_unigrams)
processed_bigrams = remove_overlapping_keywords(top_finfluencer_bigrams, top_nonfinfluencer_bigrams)
processed_trigrams = remove_overlapping_keywords(top_finfluencer_trigrams, top_nonfinfluencer_trigrams)

# Print each remaining list under its heading. Overlapping entries have been
# removed, so a section may contain fewer rows than its heading suggests.
report_sections = (
    ("Top 50 Unigrams (TF-IDF):", processed_unigrams),
    ("\nTop 50 Bigrams (TF-IDF):", processed_bigrams),
    ("\nTop 50 Trigrams (TF-IDF):", processed_trigrams),
)
for heading, ranked_keywords in report_sections:
    print(heading)
    for kw, score in ranked_keywords:
        print(f"{kw}: {score:.4f}")

# Key Word Analysis of Financial Influencers vs Non-Financial Influencers

In [None]:
# English stop words, held in a set for fast membership tests
stop_words = set(stopwords.words("english"))


def remove_stopwords(text: str) -> str:
    """Return ``text`` without English stop words.

    The text is split on whitespace, every word whose lowercase form is in
    ``stop_words`` is dropped, and the remaining words are joined with single
    spaces.
    """
    kept_words = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept_words.append(token)
    return " ".join(kept_words)


def generate_word_cloud(combined_text: str, title: str) -> None:
    """Render and display a word cloud for ``combined_text``.

    Args:
        combined_text: Text from which the word cloud is generated.
        title: Title shown above the plot.
    """
    # Build the 800x400 cloud on a white background
    cloud_renderer = WordCloud(width=800, height=400, background_color="white")
    cloud_image = cloud_renderer.generate(combined_text)

    # Draw it on a new figure with the axes hidden
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud_image, interpolation="bilinear")
    plt.title(title)
    plt.axis("off")
    plt.show()

# Word cloud for financial influencers: join their documents with newlines
finfluencer_documents = finfluencer_post_data["combined_text"].dropna()
finfluencer_text = "\n".join(finfluencer_documents)
generate_word_cloud(combined_text=finfluencer_text, title="Word Cloud for Financial Influencers")

# Word cloud for non-financial influencers, built the same way
nonfinfluencer_documents = nonfinfluencer_post_data["combined_text"].dropna()
nonfinfluencer_text = "\n".join(nonfinfluencer_documents)
generate_word_cloud(combined_text=nonfinfluencer_text, title="Word Cloud for Non-Financial Influencers")
