In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


### Download and load the datasets

In [None]:
import pandas as pd
from huggingface_hub import snapshot_download

# Rows per chunk when streaming the comments CSV.
COMMENT_CHUNK_ROWS = 100000
# Optional cap on the number of chunks to read (use a value >= 1).
# None loads the whole file. The saved output below (1,100,000 comments)
# corresponds to 11 chunks of 100,000 rows.
MAX_COMMENT_CHUNKS = None

# Mirror both Hugging Face datasets into local folders.
snapshot_download(
    repo_id="notmooodoo9/TrumpsTruthSocialPosts",
    repo_type="dataset",
    local_dir="datasets/trump_posts_and_comments",
    local_dir_use_symlinks=False)

snapshot_download(
    repo_id="FamilyLinks/btc-price-1m-2017-2025",
    repo_type="dataset",
    local_dir="datasets/btc_prices",
    local_dir_use_symlinks=False)

btc_df = pd.read_parquet("datasets/btc_prices/BTC_Raw_Micro_Macro_1m.parquet")
posts_df = pd.read_csv("datasets/trump_posts_and_comments/truthsocial.posts[Trump-FROM-10-8-25].csv")

# The comments file is very large (author's note: ~31.8 million comments on over
# 18,000 posts, all by Trump), so it is streamed chunk by chunk. The `with` block
# closes the underlying file handle even when we stop before the end of the file.
chunks = []
with pd.read_csv(
    "datasets/trump_posts_and_comments/truthsocial.comments[Trump-FROM-10-8-25].csv",
    chunksize=COMMENT_CHUNK_ROWS,
) as reader:
    for chunk in reader:
        chunks.append(chunk)
        # Stop as soon as the cap is reached so no extra chunk is parsed.
        if MAX_COMMENT_CHUNKS is not None and len(chunks) >= MAX_COMMENT_CHUNKS:
            break

comments_df = pd.concat(chunks, ignore_index=True)
print(f"Loaded {len(comments_df)} relevant comments.")
print(f"Loaded {len(posts_df)} relevant posts.")

  from .autonotebook import tqdm as notebook_tqdm
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 10650.85it/s]
Fetching 3 files: 100%|██████████| 3/3 [00:00<00:00, 2478.90it/s]


Loaded 1100000 relevant comments.
Loaded 18476 relevant posts.


### Normalize the datasets to use the UTC timezone

In [3]:
# Number of low bits to drop from an id to obtain its millisecond timestamp.
ID_TIMESTAMP_SHIFT = 16


def _ids_to_utc(ids):
    """Decode a Series of integer ids into tz-aware UTC timestamps.

    The ids are shifted right by ``ID_TIMESTAMP_SHIFT`` bits and the result is
    read as milliseconds since the Unix epoch. Floor division by
    ``1 << ID_TIMESTAMP_SHIFT`` is the vectorised equivalent of ``x >> 16`` for
    integers, and avoids one Python call per row.
    """
    return pd.to_datetime(ids // (1 << ID_TIMESTAMP_SHIFT), unit='ms', utc=True)


# BTC bar times are already UTC wall-clock values. utc=True labels naive values
# as UTC and converts tz-aware ones, so this does not raise if the parquet column
# already carries a timezone (tz_localize would).
# NOTE(review): later cells call searchsorted on this column, which presumes it
# is sorted ascending - confirm against the dataset.
btc_df['timestamp_utc'] = pd.to_datetime(btc_df['date'], utc=True)

# Convert Truth Social post ids to UTC timestamps and order posts chronologically
posts_df['timestamp_utc'] = _ids_to_utc(posts_df['_id'])
posts_df = posts_df.sort_values('timestamp_utc')

# Convert Truth Social comment ids to UTC timestamps and order comments chronologically
comments_df['timestamp_utc'] = _ids_to_utc(comments_df['_id'])
comments_df = comments_df.sort_values('timestamp_utc')

### Connect multiple levels of comments to original post

In [4]:
from IPython.display import display

# Function to find the root Trump post for a given comment
# Because a comment can reply to another comment, we need to traverse up the chain
# Lookup tables used when walking reply chains:
#   post_ids    - ids of the Trump posts themselves (chain end points)
#   comment_map - comment id -> id of whatever that comment replied to
post_ids = set(posts_df['_id'].tolist())
comment_map = dict(zip(comments_df['_id'].tolist(), comments_df['reply_to'].tolist()))

def get_root_post(reply_id, max_depth=20, known_posts=None, parent_of=None):
    """Walk a reply chain upwards until it reaches an original post.

    Parameters
    ----------
    reply_id : the id a comment replies to (either a post id or a comment id).
    max_depth : maximum number of ids inspected along the chain. This also
        bounds the walk if the reply data contains a cycle.
    known_posts : container of post ids; defaults to the notebook-level
        ``post_ids`` set.
    parent_of : mapping of comment id -> id it replies to; defaults to the
        notebook-level ``comment_map`` dict.

    Returns
    -------
    The id of the root post, or ``None`` when the chain leaves the known data
    or no post is found within ``max_depth`` steps.
    """
    known_posts = post_ids if known_posts is None else known_posts
    parent_of = comment_map if parent_of is None else parent_of

    current_id = reply_id
    for _ in range(max_depth):
        if current_id in known_posts:
            return current_id  # Found the original post
        if current_id not in parent_of:
            # Dead end: the parent is neither a known post nor a loaded comment,
            # so further iterations could never succeed.
            return None
        current_id = parent_of[current_id]
    return None

# Resolve every comment to the Trump post at the top of its reply chain.
# NOTE(review): the saved output shows this column as float (1.151819e+17),
# presumably because unresolved comments yield None - confirm that id precision
# is sufficient for the later joins.
root_post_ids = comments_df['reply_to'].apply(get_root_post)
comments_df['trump_post_id'] = root_post_ids

# Quick look at the first rows of each dataset.
previews = (
    ("BTC Data Head:", btc_df, ['timestamp_utc', 'open']),
    ("\nPosts Data Head:", posts_df, ['timestamp_utc', 'text']),
    ("\nComments Data Head:", comments_df, ['timestamp_utc', 'text', 'trump_post_id']),
)
for heading, frame, columns in previews:
    print(heading)
    display(frame[columns].head())

BTC Data Head:


Unnamed: 0,timestamp_utc,open
0,2017-08-17 04:00:00+00:00,4261.48
1,2017-08-17 04:01:00+00:00,4261.48
2,2017-08-17 04:02:00+00:00,4280.56
3,2017-08-17 04:03:00+00:00,4261.48
4,2017-08-17 04:04:00+00:00,4261.48



Posts Data Head:


Unnamed: 0,timestamp_utc,text
18475,2022-02-14 15:54:32.523000+00:00,Get Ready! Your favorite President will see yo...
18474,2022-04-28 21:29:28.112000+00:00,I’M BACK! #COVFEFE
18473,2022-04-29 22:45:26.489000+00:00,Thank you to all of the GREAT and BEAUTIFUL Am...
18472,2022-04-30 12:37:05.097000+00:00,RT @catturd2Joe Biden is going to fly gas guzz...
18471,2022-04-30 12:37:13.338000+00:00,@melmul



Comments Data Head:


Unnamed: 0,timestamp_utc,text,trump_post_id
1095658,2025-09-10 20:41:05.246000+00:00,I cannot believe we lost Charlie Kirk today. M...,1.151819e+17
1095664,2025-09-10 20:41:06.051000+00:00,We keep standing up louder and bolder than eve...,1.151819e+17
1095667,2025-09-10 20:41:07.554000+00:00,💔💔💔💔💔💔,1.151819e+17
1095669,2025-09-10 20:41:14.157000+00:00,Crying 😭,1.151819e+17
1095687,2025-09-10 20:41:14.709000+00:00,Rest in peace I'm so sad,1.151819e+17


### Analyse BTC price movement after every Trump post

In [5]:
# Look-ahead horizons for the BTC return calculation:
# result column name -> minutes after the post.
price_windows = dict(
    return_30m=30,
    return_1h=60,
    return_2h=120,
    return_4h=240,
)
def get_multi_window_returns(post_time, prices=None, windows=None):
    """Compute the relative BTC price change after ``post_time`` for each window.

    Parameters
    ----------
    post_time : tz-aware timestamp of the post.
    prices : DataFrame with ``timestamp_utc`` and ``open`` columns; defaults to
        the notebook-level ``btc_df``. ``searchsorted`` is used on
        ``timestamp_utc``, so that column is assumed to be sorted ascending.
    windows : mapping of result name -> look-ahead in minutes; defaults to the
        notebook-level ``price_windows``.

    Returns
    -------
    pd.Series (float64) indexed by window name. Each value is
    ``(end_open - start_open) / start_open``, where the start and end bars are
    the first bars at or after ``post_time`` and ``post_time + window``. A
    window is NaN when the needed bar lies beyond the end of the price data.
    """
    prices = btc_df if prices is None else prices
    windows = price_windows if windows is None else windows

    timestamps = prices['timestamp_utc']
    opens = prices['open']
    n_bars = len(prices)

    # Start from NaN everywhere so missing windows stay numeric (not None/object).
    results = {name: float('nan') for name in windows}

    # First bar at or after the post
    start_idx = timestamps.searchsorted(post_time)
    if start_idx >= n_bars:
        return pd.Series(results, dtype='float64')

    start_price = opens.iloc[start_idx]

    for name, minutes in windows.items():
        end_idx = timestamps.searchsorted(post_time + pd.Timedelta(minutes=minutes))
        if end_idx < n_bars:
            results[name] = (opens.iloc[end_idx] - start_price) / start_price

    return pd.Series(results, dtype='float64')

# Evaluate every look-ahead window for each post and store the results as one
# new column per window on the Trump posts dataframe.
return_columns = list(price_windows)
window_returns = posts_df['timestamp_utc'].apply(get_multi_window_returns)
posts_df[return_columns] = window_returns

print("BTC price returns windows from posts head:")
display(posts_df.head())

BTC price returns windows from posts head:


Unnamed: 0,_id,owner,text,timestamp_utc,return_30m,return_1h,return_2h,return_4h
18475,107797156496908384,107780257626128496,Get Ready! Your favorite President will see yo...,2022-02-14 15:54:32.523000+00:00,0.003583,0.001959,-0.000141,-0.011884
18474,108211822140637680,107780257626128496,I’M BACK! #COVFEFE,2022-04-28 21:29:28.112000+00:00,-0.000361,-0.004347,-0.002254,0.000982
18473,108217783188791696,107780257626128496,Thank you to all of the GREAT and BEAUTIFUL Am...,2022-04-29 22:45:26.489000+00:00,0.002005,0.002299,-0.000149,0.003185
18472,108221053343991936,107780257626128496,RT @catturd2Joe Biden is going to fly gas guzz...,2022-04-30 12:37:05.097000+00:00,0.000315,-0.000258,-0.00532,0.000634
18471,108221053884053056,107780257626128496,@melmul,2022-04-30 12:37:13.338000+00:00,0.000315,-0.000258,-0.00532,0.000634


### Sentiment analysis

In [6]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Make sure the VADER lexicon is available locally before building the analyzer
# (the saved output shows nltk reporting it as already up-to-date when cached).
nltk.download('vader_lexicon')
# One shared analyzer instance; get_vader_sentiment reuses it for every text.
sid = SentimentIntensityAnalyzer()

def get_vader_sentiment(text):
    """Return the VADER compound score for ``text``.

    The score ranges from -1 (most negative) to +1 (most positive).
    Missing values and empty strings are scored as neutral (0.0).
    """
    is_blank = pd.isna(text) or text == ""
    return 0.0 if is_blank else sid.polarity_scores(str(text))['compound']

# Apply sentiment analysis to posts and comments
posts_df['sentiment_vader'] = posts_df['text'].apply(get_vader_sentiment)
comments_df['sentiment_vader'] = comments_df['text'].apply(get_vader_sentiment)

# Attach each comment's root-post timestamp so the delay can be computed.
# This is an inner join, so comments whose root post could not be resolved drop out.
comments_with_post_time = comments_df.merge(
    posts_df[['_id', 'timestamp_utc']],
    left_on='trump_post_id',
    right_on='_id',
    suffixes=('', '_post')  # This creates 'timestamp_utc_post'
)

# Time difference in minutes between a comment and its Trump post
comments_with_post_time['minutes_after_post'] = (
    comments_with_post_time['timestamp_utc'] - comments_with_post_time['timestamp_utc_post']
).dt.total_seconds() / 60

# Average comment sentiment per Trump post for different time windows
minutes_window = [30, 60, 120, 240]
for minutes in minutes_window:
    column = f'avg_comment_sentiment_{minutes}m'

    # Comments posted between 0 and `minutes` minutes (inclusive) after the post
    window_mask = (comments_with_post_time['minutes_after_post'] >= 0) & \
                  (comments_with_post_time['minutes_after_post'] <= minutes)
    window_comments = comments_with_post_time[window_mask]

    avg_sentiment = window_comments.groupby('trump_post_id')['sentiment_vader'].mean()
    avg_sentiment.name = column

    # Drop any copy left over from a previous run of this cell first; otherwise
    # the merge would create duplicated '<name>_x' / '<name>_y' columns.
    posts_df = posts_df.drop(columns=[column], errors='ignore').merge(
        avg_sentiment, left_on='_id', right_index=True, how='left'
    )

print("Post & Comment sentiments head:")
display(posts_df.head())

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/valdemar/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Post & Comment sentiments head:


Unnamed: 0,_id,owner,text,timestamp_utc,return_30m,return_1h,return_2h,return_4h,sentiment_vader,avg_comment_sentiment_30m,avg_comment_sentiment_60m,avg_comment_sentiment_120m,avg_comment_sentiment_240m
18475,107797156496908384,107780257626128496,Get Ready! Your favorite President will see yo...,2022-02-14 15:54:32.523000+00:00,0.003583,0.001959,-0.000141,-0.011884,0.7256,,,,
18474,108211822140637680,107780257626128496,I’M BACK! #COVFEFE,2022-04-28 21:29:28.112000+00:00,-0.000361,-0.004347,-0.002254,0.000982,0.0,,,,
18473,108217783188791696,107780257626128496,Thank you to all of the GREAT and BEAUTIFUL Am...,2022-04-29 22:45:26.489000+00:00,0.002005,0.002299,-0.000149,0.003185,0.96,,,,
18472,108221053343991936,107780257626128496,RT @catturd2Joe Biden is going to fly gas guzz...,2022-04-30 12:37:05.097000+00:00,0.000315,-0.000258,-0.00532,0.000634,-0.6428,,,,
18471,108221053884053056,107780257626128496,@melmul,2022-04-30 12:37:13.338000+00:00,0.000315,-0.000258,-0.00532,0.000634,0.0,,,,


### Keywords analysis

In [None]:
keywords = ['btc', "bitcoin", 'crypto', 'tariff', 'dollar', 'usa', "china", "market", "economy", "inflation",
            "coin", "currency", "blockchain", "invest", "fed", "interest rate", "bull", "bear",
            "stock", "trade", "mining", "wallet", "exchange", "decentralized", "defi", "nft", "token",
            "halving", "fiat", "payment", "volatility", "wealth", "financial"]
results = []

return_columns = ['return_30m', 'return_1h', 'return_2h', 'return_4h']

# Baseline: mean BTC return (in percent) after all posts, one value per window
baseline_30m, baseline_1h, baseline_2h, baseline_4h = (
    posts_df[column].mean() * 100 for column in return_columns
)

# Trump Baseline Sentiment
baseline_sentiment = posts_df['sentiment_vader'].mean()

print(f"Baseline Returns (All Posts): 30m={baseline_30m:.4f}%, 1h={baseline_1h:.4f}%, 2h={baseline_2h:.4f}%, 4h={baseline_4h:.4f}%")
print(f"Baseline Sentiment (All Posts): {baseline_sentiment:.4f}")

# Lower-case the post text once instead of once per keyword.
posts_text_lower = posts_df['text'].str.lower()

for kw in keywords:
    # Posts whose text contains the keyword as a literal, case-insensitive
    # substring. Note this also matches inside longer words, e.g. "coin" in
    # "bitcoin". regex=False keeps keywords from being read as regex patterns.
    subset = posts_df[posts_text_lower.str.contains(kw, na=False, regex=False)]
    if subset.empty:
        continue

    # Average BTC returns (in percent) after posts with the keyword
    avg_30m, avg_1h, avg_2h, avg_4h = (
        subset[column].mean() * 100 for column in return_columns
    )

    # The average sentiment for the posts with the keyword
    avg_post_sentiment = subset['sentiment_vader'].mean()

    # How many comments does a post with this keyword usually get?
    # Summing the boolean mask counts matches without copying the comments frame.
    kw_comments_count = comments_df['trump_post_id'].isin(subset['_id']).sum()
    avg_comments_per_post = kw_comments_count / len(subset)

    # Average comment sentiment for posts containing the keyword
    all_com_senti_30m = subset['avg_comment_sentiment_30m'].mean()
    all_com_senti_1h = subset['avg_comment_sentiment_60m'].mean()
    all_com_senti_2h = subset['avg_comment_sentiment_120m'].mean()

    results.append({
        'keyword': kw,
        'post_count': len(subset),
        'avg_comments': round(avg_comments_per_post, 1),

        # Return Metrics
        # NOTE(review): '30m' lacks the '_%' suffix its siblings carry; left
        # unchanged in case other cells reference this column name.
        '30m': avg_30m,
        '1h_%': avg_1h,
        '2h_%': avg_2h,
        '4h_%': avg_4h,

        # Comparison vs Baseline
        '30m_vs_base': avg_30m - baseline_30m,
        '1h_vs_base': avg_1h - baseline_1h,
        '2h_vs_base': avg_2h - baseline_2h,
        '4h_vs_base': avg_4h - baseline_4h,

        # Post Sentiment Metrics
        'post_senti': avg_post_sentiment,
        'senti_vs_base': avg_post_sentiment - baseline_sentiment,  # Is this topic happier than usual?

        # Average Comment Sentiment
        'all_com_30m': all_com_senti_30m,
        'all_com_1h': all_com_senti_1h,
        'all_com_2h': all_com_senti_2h
    })

# Display results sorted by the 1h BTC return. If no keyword matched anything,
# the frame is empty and has no '1h_%' column to sort on.
results_df = pd.DataFrame(results)
if not results_df.empty:
    results_df = results_df.sort_values('1h_%', ascending=False)
display(results_df.round(4))

# Example posts for the keyword bitcoin and comments on the first of those posts
example_kw = 'bitcoin'
print(f"\n--- Example Posts with '{example_kw}' ---")
ex_posts = posts_df[posts_text_lower.str.contains(example_kw, na=False, regex=False)].head(3)
if ex_posts.empty:
    # No matching post: show an empty comments frame instead of raising IndexError
    ex_comments = comments_df.iloc[0:0]
else:
    ex_comments = comments_df[comments_df['trump_post_id'] == ex_posts['_id'].iloc[0]].head(3)
display(ex_posts)
display(ex_comments)

'Baseline Returns (All Posts): 30m=0.0065%, 1h=0.0143%, 2h=0.0216%, 4h=0.0420%'

'Baseline Sentiment (All Posts): 0.1096'

Unnamed: 0,keyword,post_count,avg_comments,1h_%,1h_vs_base,post_senti,senti_vs_base,all_com_1h,kw_com_1h,all_com_2h,kw_com_2h
29,volatility,18,3727.3,0.2207,0.2064,0.4456,0.336,0.0472,0.0073,0.0351,-0.0262
0,btc,22,5332.1,0.1393,0.125,0.2502,0.1406,0.0229,0.0614,0.016,0.0761
23,decentralized,17,6844.2,0.1367,0.1224,-0.207,-0.3166,-0.0338,-0.5318,-0.037,-0.5115
11,currency,112,3891.0,0.1344,0.1201,0.3212,0.2116,0.0149,-0.1458,0.0076,-0.2453
26,token,58,4547.4,0.1093,0.095,0.2863,0.1766,0.0171,-0.2883,0.01,-0.3311
27,fiat,34,4569.4,0.1086,0.0943,0.395,0.2854,0.0132,-0.012,0.0049,-0.1668
2,crypto,132,3671.7,0.1075,0.0932,0.3297,0.2201,0.0164,-0.1436,0.0088,-0.1311
21,wallet,87,4397.0,0.1042,0.0899,0.1719,0.0622,-0.004,-0.0685,-0.0071,-0.1228
7,market,254,2337.3,0.0766,0.0623,0.281,0.1714,0.0113,-0.0576,0.0061,-0.0387
22,exchange,149,3672.8,0.075,0.0607,0.193,0.0834,0.0103,-0.0644,0.0037,-0.0903



--- Example Posts with 'bitcoin' ---


Unnamed: 0,_id,owner,text,timestamp_utc,return_30m,return_1h,return_2h,return_4h,sentiment_vader,avg_comment_sentiment_30m,avg_comment_sentiment_60m,avg_comment_sentiment_120m,avg_comment_sentiment_240m
6186,112601639679885936,107780257626128496,VOTE FOR TRUMP! Bitcoin mining may be our last...,2024-06-12 03:57:47.967000+00:00,0.002005,0.00027,-0.002919,0.000254,0.7097,,,,
5415,112859397412472624,107780257626128496,On my way to The Bitcoin Conference in Nashvil...,2024-07-27 16:28:58.837000+00:00,-0.00262,-0.005385,0.000229,-0.026497,0.296,,,,
5414,112860519107160432,107780257626128496,RT @realamericasvoicePresident @realDonaldTrum...,2024-07-27 21:14:14.540000+00:00,0.005692,0.001693,-0.004139,-0.006117,0.4404,,,,


Unnamed: 0,_id,owner,reply_to,text,timestamp_utc,trump_post_id,sentiment_vader
