# Import Libraries

In [1]:
import pandas as pd
import string
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import json
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt_tab')

# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/raymondlow/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/raymondlow/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Import and Preprocess Data

In [2]:
def load_text_file(file_path) -> list:
    """
    Load search terms for market signals or profile list from text file.

    Args:
        file_path (str): The path to the text file containing search terms/profiles, one per line.

    Returns:
        list: A list of search terms/profiles as strings.
    """
    full_file_path = f"../config/{file_path}"
    with open(full_file_path, "r") as file:
        return [line.strip() for line in file]
    

def preprocess(text: str) -> str:
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text


def combine_metadata_text(row: pd.Series) -> str:
    combined_text_list = []

    # Append video text
    if row["text"] is not None and not pd.isnull(row["text"]):
        combined_text_list.append(row["text"])

    # Append video transcript
    if row["video_transcript"] is not None and not pd.isnull(row["video_transcript"]):
        combined_text_list.append(row["video_transcript"])

    # Append video hashtags
    if row["hashtags"] is not None and not pd.isnull(row["hashtags"]):
        hashtag_str = row["hashtags"].replace("'", '"')
        hashtag_list = json.loads(hashtag_str)
        hashtag_value_list = [d['name'] for d in hashtag_list]
        combined_text_list += hashtag_value_list 
        
    if combined_text_list == []:
        return ""
    else:
        return "\n".join(combined_text_list)

In [None]:
combined_video_metadata = pd.read_csv("../data/market-signals-finfluencer/profilesearch_video_metadata_identification.csv")

# Identify financial influencers
groundtruth_finfluencers = load_text_file("market_signals_finfluencer_profiles_finfluencers.txt")
print(f"Number of Financial Influencers: {len(groundtruth_finfluencers)}")

# Identify financial influencers whose primary focus is on stocks trading and equities, bonds and fixed income, and options trading and derivatives
identification_results = pd.read_csv("../data/market-signals-finfluencer/profile_metadata_post_identification.csv")
filtered_identification_results = identification_results[identification_results["Which of these areas of finance are the primary focus of the influencerâ€™s posts? - symbol"].str.contains('B1|B2|B3', na=False)]
relevant_finfluencers = [profile for profile in groundtruth_finfluencers if profile in filtered_identification_results["profile"].tolist()]
print(f"Number of Financial Influencers in B1, B2, and B3: {len(relevant_finfluencers)}")

finfluencer_video_metadata = combined_video_metadata[combined_video_metadata["profile"].isin(relevant_finfluencers)].reset_index(drop=True)
print(finfluencer_video_metadata.shape)
finfluencer_video_metadata.head()

Number of Financial Influencers: 200
Number of Financial Influencers in B1, B2, and B3: 81
(1990, 35)


Unnamed: 0,id,text,textLanguage,createTime,createTimeISO,authorMeta,musicMeta,webVideoUrl,mediaUrls,videoMeta,diggCount,shareCount,playCount,collectCount,commentCount,mentions,detailedMentions,hashtags,effectStickers,isSlideshow,isPinned,isSponsored,profile,fromProfileSection,isMuted,isAd,locationMeta,slideshowImageLinks,extractionTime,video_filename,video_transcript,profile_id,url,searchQuery,error
0,7.464681e+18,What to know about the market selloff today: \...,en,1738006000.0,2025-01-27T19:34:17.000Z,"{'id': '6670082773049147397', 'name': 'madymil...","{'musicName': 'original sound', 'musicAuthor':...",https://www.tiktok.com/@madymills/video/746468...,[],"{'height': 1024, 'width': 576, 'duration': 154...",240.0,23.0,7022.0,41.0,30.0,[],[],"[{'name': 'nvidia'}, {'name': 'stocks'}, {'nam...",[],False,False,False,madymills,videos,,,"{'address': 'New York, NY, United States', 'ci...",,2025-02-18 19:39:57.290608+00:00,7464680539543456768.mp4,NVIDIA is having one of its biggest sell-offs ...,,,,
1,7.447214e+18,Replying to @samsam06501 how I invested $130k ...,en,1733940000.0,2024-12-11T17:52:57.000Z,"{'id': '7110061630651073542', 'name': 'mattsho...","{'musicName': 'original sound', 'musicAuthor':...",https://www.tiktok.com/@mattshoss/video/744721...,[],"{'height': 1024, 'width': 576, 'duration': 48,...",501.0,79.0,15400.0,81.0,35.0,['@samsam06501'],"[{'id': '6970716142583809029', 'name': 'samsam...","[{'name': ''}, {'name': 'investing'}, {'name':...",[],False,False,False,mattshoss,videos,,,,,2025-02-18 19:39:57.290608+00:00,7447213753411750912.mp4,"As a 21 year old student, I've been able to bu...",,,,
2,7.435318e+18,How Iâ€™m changing my investments after the resu...,en,1731170000.0,2024-11-09T16:33:08.000Z,"{'id': '6942589474652341254', 'name': 'stocksa...","{'musicName': 'original sound', 'musicAuthor':...",https://www.tiktok.com/@stocksandsavings/video...,[],"{'height': 1024, 'width': 576, 'duration': 15,...",31.0,4.0,1405.0,0.0,1.0,[],[],[],[],False,False,False,stocksandsavings,videos,,,"{'address': 'Europe', 'city': '', 'cityCode': ...",,2025-02-18 19:39:57.290608+00:00,7435318446168050688.mp4,Make avocado toast with me while I tell you ho...,,,,
3,7.457633e+18,3 stocks Iâ€™m buying every month in 2025!ðŸ“ˆðŸ’°\n\n...,en,1736366000.0,2025-01-08T19:46:03.000Z,"{'id': '7110061630651073542', 'name': 'mattsho...",{'musicName': 'original sound - Michael Prince...,https://www.tiktok.com/@mattshoss/video/745763...,[],"{'height': 1024, 'width': 576, 'duration': 42,...",7945.0,1277.0,230900.0,4966.0,259.0,[],[],"[{'name': 'investing'}, {'name': 'finance'}, {...",[],False,False,False,mattshoss,videos,,,,,2025-02-18 19:39:57.290608+00:00,7457633279009623040.mp4,These are the three stocks that I'm buying eve...,,,,
4,7.24721e+18,HOW TO FIND THE FAIR VALUE GAP IF YOUâ€™RE LOOKI...,en,1687373000.0,2023-06-21T18:36:18.000Z,"{'id': '6953206871176381446', 'name': 'officia...","{'musicName': 'original sound', 'musicAuthor':...",https://www.tiktok.com/@officialangiebassett/v...,[],"{'height': 854, 'width': 480, 'duration': 30, ...",6.0,1.0,1625.0,0.0,0.0,[],[],[],[],False,False,False,officialangiebassett,videos,,,,,2025-02-18 19:39:57.290608+00:00,7247210020742384640.mp4,how you find the fair value gap if you're look...,,,,


In [4]:
# Combine the text columns: 'text', 'hashtags', and 'video_transcript'
finfluencer_video_metadata['combined_text'] = finfluencer_video_metadata.apply(combine_metadata_text, axis=1)

# Preprocess the text: lowercase and remove punctuation
finfluencer_video_metadata['combined_text'] = finfluencer_video_metadata['combined_text'].apply(preprocess)

finfluencer_video_metadata.head()

Unnamed: 0,id,text,textLanguage,createTime,createTimeISO,authorMeta,musicMeta,webVideoUrl,mediaUrls,videoMeta,diggCount,shareCount,playCount,collectCount,commentCount,mentions,detailedMentions,hashtags,effectStickers,isSlideshow,isPinned,isSponsored,profile,fromProfileSection,isMuted,isAd,locationMeta,slideshowImageLinks,extractionTime,video_filename,video_transcript,profile_id,url,searchQuery,error,combined_text
0,7.464681e+18,What to know about the market selloff today: \...,en,1738006000.0,2025-01-27T19:34:17.000Z,"{'id': '6670082773049147397', 'name': 'madymil...","{'musicName': 'original sound', 'musicAuthor':...",https://www.tiktok.com/@madymills/video/746468...,[],"{'height': 1024, 'width': 576, 'duration': 154...",240.0,23.0,7022.0,41.0,30.0,[],[],"[{'name': 'nvidia'}, {'name': 'stocks'}, {'nam...",[],False,False,False,madymills,videos,,,"{'address': 'New York, NY, United States', 'ci...",,2025-02-18 19:39:57.290608+00:00,7464680539543456768.mp4,NVIDIA is having one of its biggest sell-offs ...,,,,,what to know about the market selloff today \n...
1,7.447214e+18,Replying to @samsam06501 how I invested $130k ...,en,1733940000.0,2024-12-11T17:52:57.000Z,"{'id': '7110061630651073542', 'name': 'mattsho...","{'musicName': 'original sound', 'musicAuthor':...",https://www.tiktok.com/@mattshoss/video/744721...,[],"{'height': 1024, 'width': 576, 'duration': 48,...",501.0,79.0,15400.0,81.0,35.0,['@samsam06501'],"[{'id': '6970716142583809029', 'name': 'samsam...","[{'name': ''}, {'name': 'investing'}, {'name':...",[],False,False,False,mattshoss,videos,,,,,2025-02-18 19:39:57.290608+00:00,7447213753411750912.mp4,"As a 21 year old student, I've been able to bu...",,,,,replying to samsam06501 how i invested 130k 2...
2,7.435318e+18,How Iâ€™m changing my investments after the resu...,en,1731170000.0,2024-11-09T16:33:08.000Z,"{'id': '6942589474652341254', 'name': 'stocksa...","{'musicName': 'original sound', 'musicAuthor':...",https://www.tiktok.com/@stocksandsavings/video...,[],"{'height': 1024, 'width': 576, 'duration': 15,...",31.0,4.0,1405.0,0.0,1.0,[],[],[],[],False,False,False,stocksandsavings,videos,,,"{'address': 'Europe', 'city': '', 'cityCode': ...",,2025-02-18 19:39:57.290608+00:00,7435318446168050688.mp4,Make avocado toast with me while I tell you ho...,,,,,how iâ€™m changing my investments after the resu...
3,7.457633e+18,3 stocks Iâ€™m buying every month in 2025!ðŸ“ˆðŸ’°\n\n...,en,1736366000.0,2025-01-08T19:46:03.000Z,"{'id': '7110061630651073542', 'name': 'mattsho...",{'musicName': 'original sound - Michael Prince...,https://www.tiktok.com/@mattshoss/video/745763...,[],"{'height': 1024, 'width': 576, 'duration': 42,...",7945.0,1277.0,230900.0,4966.0,259.0,[],[],"[{'name': 'investing'}, {'name': 'finance'}, {...",[],False,False,False,mattshoss,videos,,,,,2025-02-18 19:39:57.290608+00:00,7457633279009623040.mp4,These are the three stocks that I'm buying eve...,,,,,3 stocks iâ€™m buying every month in 2025ðŸ“ˆðŸ’°\n\ni...
4,7.24721e+18,HOW TO FIND THE FAIR VALUE GAP IF YOUâ€™RE LOOKI...,en,1687373000.0,2023-06-21T18:36:18.000Z,"{'id': '6953206871176381446', 'name': 'officia...","{'musicName': 'original sound', 'musicAuthor':...",https://www.tiktok.com/@officialangiebassett/v...,[],"{'height': 854, 'width': 480, 'duration': 30, ...",6.0,1.0,1625.0,0.0,0.0,[],[],[],[],False,False,False,officialangiebassett,videos,,,,,2025-02-18 19:39:57.290608+00:00,7247210020742384640.mp4,how you find the fair value gap if you're look...,,,,,how to find the fair value gap if youâ€™re looki...


# TF-IDF Analysis with N-grams

In [5]:
# Prepare stop words list
stop_words = stopwords.words('english')

# Initialize TfidfVectorizer with n-gram range 1 to 3
vectorizer = TfidfVectorizer(stop_words=stop_words, ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(finfluencer_video_metadata['combined_text'])
feature_names = vectorizer.get_feature_names_out()

# Sum the TF-IDF scores for each n-gram across all documents
tfidf_sum = np.sum(tfidf_matrix.toarray(), axis=0)
keywords_scores = list(zip(feature_names, tfidf_sum))

# Separate the keywords by their n-gram length
unigrams = [(kw, score) for kw, score in keywords_scores if len(kw.split()) == 1]
bigrams  = [(kw, score) for kw, score in keywords_scores if len(kw.split()) == 2]
trigrams = [(kw, score) for kw, score in keywords_scores if len(kw.split()) == 3]

# Sort each list in descending order based on the aggregated score
unigrams.sort(key=lambda x: x[1], reverse=True)
bigrams.sort(key=lambda x: x[1], reverse=True)
trigrams.sort(key=lambda x: x[1], reverse=True)

# Define how many top results to show
top_n = 50
top_unigrams = unigrams[:top_n]
top_bigrams  = bigrams[:top_n]
top_trigrams = trigrams[:top_n]

print("Top 50 Unigrams (TF-IDF):")
for kw, score in top_unigrams:
    print(f"{kw}: {score:.4f}")

print("\nTop 50 Bigrams (TF-IDF):")
for kw, score in top_bigrams:
    print(f"{kw}: {score:.4f}")

print("\nTop 50 Trigrams (TF-IDF):")
for kw, score in top_trigrams:
    print(f"{kw}: {score:.4f}")

Top 50 Unigrams (TF-IDF):
trading: 38.9322
forex: 34.2332
fyp: 26.3057
que: 18.9036
money: 18.8624
forextrading: 18.2575
de: 17.9669
stocks: 17.6291
like: 16.9370
im: 16.7295
trade: 16.3196
crypto: 15.2794
trader: 15.1505
motivation: 14.8907
market: 14.7559
going: 14.4417
one: 14.3944
youre: 14.2578
know: 13.5979
price: 13.3303
forextrader: 12.9381
dont: 12.6244
daytrading: 12.6237
time: 12.5106
investing: 12.5069
see: 12.1263
make: 11.9769
get: 11.9655
want: 11.8349
go: 11.5385
strategy: 10.7893
la: 10.7414
foryou: 10.6458
day: 10.6039
right: 10.5477
viral: 10.0770
stock: 9.8930
got: 9.8008
thats: 9.7393
en: 9.6397
traders: 9.6179
buy: 9.4651
take: 9.2281
daytrader: 9.1450
bitcoin: 8.7148
people: 8.6081
forexlifestyle: 8.4440
year: 8.3891
tradingforex: 8.1535
account: 8.1464

Top 50 Bigrams (TF-IDF):
trading forex: 10.1497
forex trading: 8.2755
fyp motivation: 8.1184
motivation forex: 7.5150
link bio: 5.4071
forex forextrading: 4.9776
forex crypto: 4.7830
stock market: 4.7485
thanks w