# Import Libraries

In [None]:
import ast
import json
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Download the NLTK 'stopwords' and 'punkt_tab' resources
for nltk_resource in ('stopwords', 'punkt_tab'):
    nltk.download(nltk_resource)

# Set display options to show all rows and columns
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Import and Preprocess Data

In [None]:
def load_text_file(file_path) -> list:
    """
    Load search terms for market signals or profile list from text file.

    The file is looked up under ``../config`` (relative to the current
    working directory) and decoded as UTF-8; a leading byte-order mark, if
    present, is dropped.

    Args:
        file_path (str): Name of the text file inside ``../config`` containing search terms/profiles, one per line.

    Returns:
        list: The non-empty lines of the file as strings, stripped of surrounding whitespace, in file order.
    """
    full_file_path = f"../config/{file_path}"
    # An explicit encoding keeps the result independent of the platform
    # default; "utf-8-sig" also swallows a BOM that would otherwise be glued
    # to the first entry.
    with open(full_file_path, "r", encoding="utf-8-sig") as file:
        stripped_lines = (line.strip() for line in file)
        # Blank lines (e.g. a trailing newline block) would otherwise turn
        # into "" entries and inflate counts / membership tests.
        return [entry for entry in stripped_lines if entry]
    

def preprocess(text: str) -> str:
    """Return *text* lowercased with every ``string.punctuation`` character removed."""
    # Mapping a character to None in a translation table deletes it.
    strip_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    return text.lower().translate(strip_punctuation)


def _parse_hashtags(raw_hashtags: str) -> list:
    """Parse a serialized list of hashtag dicts, accepting Python-repr or JSON text."""
    # Python-literal syntax (single-quoted keys, apostrophes inside
    # double-quoted values) is parsed safely without evaluating code.
    try:
        return ast.literal_eval(raw_hashtags)
    except (ValueError, SyntaxError):
        pass
    # Genuine JSON (e.g. containing null/true/false) is not a Python literal.
    try:
        return json.loads(raw_hashtags)
    except json.JSONDecodeError:
        # Last resort: the historical quote-swapping heuristic, so every
        # input that used to parse still parses; it raises JSONDecodeError
        # exactly as before when the text is not parseable at all.
        return json.loads(raw_hashtags.replace("'", '"'))


def combine_metadata_text(row: pd.Series) -> str:
    """
    Combine a video's text, transcript and hashtag names into one string.

    Args:
        row (pd.Series): A record providing the keys "text",
            "video_transcript" and "hashtags". Missing values (None/NaN) are
            skipped. "hashtags", when present, is a serialized list of dicts
            that each have a "name" key.

    Returns:
        str: The available pieces joined with newlines, in the order text,
        transcript, hashtag names; "" when nothing is available.

    Raises:
        json.JSONDecodeError: If the hashtags value cannot be parsed.
    """
    combined_text_list = []

    # Append video text, then video transcript (pd.notna also covers None)
    for column in ("text", "video_transcript"):
        value = row[column]
        if pd.notna(value):
            combined_text_list.append(value)

    # Append video hashtags
    raw_hashtags = row["hashtags"]
    if pd.notna(raw_hashtags):
        combined_text_list.extend(tag["name"] for tag in _parse_hashtags(raw_hashtags))

    # Joining an empty list already yields "", so no special case is needed.
    return "\n".join(combined_text_list)

In [None]:
combined_video_metadata = pd.read_csv("../data/market-signals-finfluencer/profilesearch_video_metadata_identification.csv")

# Identify financial influencers
groundtruth_finfluencers = load_text_file("market_signals_finfluencer_profiles_finfluencers.txt")
print(f"Number of Financial Influencers: {len(groundtruth_finfluencers)}")

# Identify financial influencers whose primary focus is on stocks trading and equities, bonds and fixed income, and options trading and derivatives
identification_results = pd.read_csv("../data/market-signals-finfluencer/profile_metadata_post_identification.csv")
# NOTE(review): the column name below contains what looks like a mis-decoded
# apostrophe; it is kept byte-for-byte because it must match the CSV header
# as pandas reads it - confirm against the file before "fixing" it.
filtered_identification_results = identification_results[
    identification_results["Which of these areas of finance are the primary focus of the influencerâ€™s posts? - symbol"].str.contains('B1|B2|B3', na=False)
]
# Build the lookup set once instead of rebuilding and scanning a list for
# every ground-truth profile; order and duplicates of the ground-truth list
# are preserved.
relevant_profile_set = set(filtered_identification_results["profile"])
relevant_finfluencers = [profile for profile in groundtruth_finfluencers if profile in relevant_profile_set]
print(f"Number of Financial Influencers in B1, B2, and B3: {len(relevant_finfluencers)}")

# Keep only the videos posted by the relevant financial influencers
finfluencer_video_metadata = combined_video_metadata[combined_video_metadata["profile"].isin(relevant_finfluencers)].reset_index(drop=True)
print(finfluencer_video_metadata.shape)
finfluencer_video_metadata.head()

In [None]:
# Build one text blob per video from 'text', 'video_transcript' and 'hashtags'
finfluencer_video_metadata['combined_text'] = finfluencer_video_metadata.apply(
    combine_metadata_text,
    axis=1,
)

# Normalise the blob: lowercase and remove punctuation
finfluencer_video_metadata['combined_text'] = (
    finfluencer_video_metadata['combined_text'].map(preprocess)
)

finfluencer_video_metadata.head()

# TF-IDF Analysis with N-grams

In [None]:
# Prepare stop words list.
# 'combined_text' has already had punctuation removed, so contractions appear
# as e.g. "dont"; add punctuation-free variants of each stop word so those
# tokens are filtered too (dict.fromkeys de-duplicates while keeping order).
base_stop_words = stopwords.words('english')
punctuation_table = str.maketrans('', '', string.punctuation)
stop_words = list(dict.fromkeys(
    base_stop_words + [word.translate(punctuation_table) for word in base_stop_words]
))

# Initialize TfidfVectorizer with n-gram range 1 to 3
vectorizer = TfidfVectorizer(stop_words=stop_words, ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(finfluencer_video_metadata['combined_text'])
feature_names = vectorizer.get_feature_names_out()

# Sum the TF-IDF scores for each n-gram across all documents.
# Summing the sparse matrix directly avoids materialising a dense
# (documents x n-grams) array; asarray + ravel flattens the 1 x N result.
tfidf_sum = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
keywords_scores = list(zip(feature_names, tfidf_sum))

# Separate the keywords by their n-gram length in a single pass
keywords_by_length = {1: [], 2: [], 3: []}
for kw, score in keywords_scores:
    bucket = keywords_by_length.get(len(kw.split()))
    if bucket is not None:
        bucket.append((kw, score))

# Sort each list in descending order based on the aggregated score
unigrams = sorted(keywords_by_length[1], key=lambda x: x[1], reverse=True)
bigrams = sorted(keywords_by_length[2], key=lambda x: x[1], reverse=True)
trigrams = sorted(keywords_by_length[3], key=lambda x: x[1], reverse=True)

# Define how many top results to show
top_n = 50
top_unigrams = unigrams[:top_n]
top_bigrams = bigrams[:top_n]
top_trigrams = trigrams[:top_n]

# Print one section per n-gram size; the headings follow top_n and every
# section after the first is preceded by a blank line.
ngram_report = [
    ("Unigrams", top_unigrams),
    ("Bigrams", top_bigrams),
    ("Trigrams", top_trigrams),
]
for position, (label, top_keywords) in enumerate(ngram_report):
    separator = "\n" if position else ""
    print(f"{separator}Top {top_n} {label} (TF-IDF):")
    for kw, score in top_keywords:
        print(f"{kw}: {score:.4f}")

# Key Word Analysis of Financial Influencers vs Non-Financial Influencers

In [None]:
def load_text_file(file_path) -> list:
    """
    Load search terms for market signals or profile list from text file.

    The file is looked up under ``../config`` (relative to the current
    working directory) and decoded as UTF-8; a leading byte-order mark, if
    present, is dropped.

    Args:
        file_path (str): Name of the text file inside ``../config`` containing search terms/profiles, one per line.

    Returns:
        list: The non-empty lines of the file as strings, stripped of surrounding whitespace, in file order.
    """
    full_file_path = f"../config/{file_path}"
    # An explicit encoding keeps the result independent of the platform
    # default; "utf-8-sig" also swallows a BOM that would otherwise be glued
    # to the first entry.
    with open(full_file_path, "r", encoding="utf-8-sig") as file:
        stripped_lines = (line.strip() for line in file)
        # Blank lines (e.g. a trailing newline block) would otherwise turn
        # into "" entries and inflate counts / membership tests.
        return [entry for entry in stripped_lines if entry]

In [None]:
video_metadata = pd.read_csv("../data/market-signals-finfluencer/profilesearch_video_metadata_identification.csv")

finfluencer_list = load_text_file("market_signals_finfluencer_profiles_finfluencers.txt")
nonfinfluencer_list = load_text_file("market_signals_finfluencer_profiles_nonfinfluencers.txt")

# Flag each video with 1 when its profile is a known financial influencer,
# else 0. isin() does the membership test in one vectorized call instead of
# scanning the list once per row from a Python lambda.
video_metadata["Finfluencer"] = video_metadata["profile"].isin(finfluencer_list).astype(int)

# Split the videos by the flag; every profile not in finfluencer_list lands
# in the non-financial group.
finfluencer_video_metadata = video_metadata[video_metadata["Finfluencer"] == 1].reset_index(drop=True)
nonfinfluencer_video_metadata = video_metadata[video_metadata["Finfluencer"] != 1].reset_index(drop=True)

print(video_metadata.shape)
video_metadata.head()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stop-word corpus is available (download if needed)
nltk.download("stopwords")

# English stop words, held in a set for membership tests
english_stop_word_list = stopwords.words("english")
stop_words = set(english_stop_word_list)

# Function to remove stop words from text
def remove_stopwords(text: str) -> str:
    """Drop whitespace-separated words whose lowercase form is in ``stop_words``.

    The surviving words keep their original casing and order and are joined
    with single spaces.
    """
    return " ".join(
        token for token in text.split() if token.lower() not in stop_words
    )


def generate_word_cloud(combined_text: str, title: str) -> None:
    """Render *combined_text* as a word cloud and show it with *title*.

    Args:
        combined_text (str): Text from which the word cloud is generated.
        title (str): Title drawn above the plot.
    """
    # Build an 800x400 cloud on a white background
    cloud_builder = WordCloud(width=800, height=400, background_color="white")
    cloud_image = cloud_builder.generate(combined_text)

    # Show the cloud in a 10x5 figure without axes
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud_image, interpolation="bilinear")
    plt.axis("off")
    plt.title(title)
    plt.show()


# Word cloud for financial influencers: video text plus transcripts, stop words removed
finfluencer_text_column = finfluencer_video_metadata["text"]
finfluencer_video_text = " ".join(finfluencer_text_column[finfluencer_text_column.notna()])
finfluencer_transcript_column = finfluencer_video_metadata["video_transcript"]
finfluencer_transcripts = " ".join(finfluencer_transcript_column[finfluencer_transcript_column.notna()])
combined_finfluencer_text = remove_stopwords(f"{finfluencer_video_text} {finfluencer_transcripts}")
generate_word_cloud(combined_finfluencer_text, "Word Cloud for Financial Influencers")

# Word cloud for non-financial influencers, built the same way
nonfinfluencer_text_column = nonfinfluencer_video_metadata["text"]
nonfinfluencer_video_text = " ".join(nonfinfluencer_text_column[nonfinfluencer_text_column.notna()])
nonfinfluencer_transcript_column = nonfinfluencer_video_metadata["video_transcript"]
nonfinfluencer_transcripts = " ".join(nonfinfluencer_transcript_column[nonfinfluencer_transcript_column.notna()])
combined_nonfinfluencer_text = remove_stopwords(f"{nonfinfluencer_video_text} {nonfinfluencer_transcripts}")
generate_word_cloud(combined_nonfinfluencer_text, "Word Cloud for Non-Financial Influencers")
