In [3]:
import pandas as pd
import re

# ---------- 1. Read files ----------
df1 = pd.read_csv("saylor_2025.csv")
df2 = pd.read_csv("musk_2025.csv")
df3 = pd.read_csv("btcnews_2025.csv")
df4 = pd.read_csv("documenting_bitcoin_2025.csv")
df5 = pd.read_csv("anthony_pompliano_2025.csv")

# ---------- 2. Define keywords ----------
keywords = re.compile(r"\b(bitcoin|crypto|cryptocurrency|btc|blockchain)\b", re.I)

# ---------- 3. Filter ONLY df1, df2 and df5 ----------
df1 = df1[df1["rawContent"].str.contains(keywords, na=False)]
df2 = df2[df2["rawContent"].str.contains(keywords, na=False)]
df5 = df5[df5["rawContent"].str.contains(keywords, na=False)]

# ---------- 4. Combine the five dataframes ----------
combined = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)

# ---------- 5. Filter by date after 2025-06-04 ----------
combined["date"] = pd.to_datetime(combined["date"], utc=True, errors="coerce")
combined = combined[combined["date"] > "2025-06-04"]

# ---------- 6. Keep only date and text ----------
final_df = combined[["date", "rawContent"]].copy()

# ---------- 7. Save ----------
final_df.to_csv("dataset_test_2025.csv", index=False)
print(f"Archivo generado con {len(final_df)} tuits: dataset_test_2025.csv")


Archivo generado con 2076 tuits: dataset_test_2025.csv


  df1 = df1[df1["rawContent"].str.contains(keywords, na=False)]
  df2 = df2[df2["rawContent"].str.contains(keywords, na=False)]
  df5 = df5[df5["rawContent"].str.contains(keywords, na=False)]


In [4]:
# cantidad de tweets finales

print(len(final_df))


2076


In [2]:
import pandas as pd
import re

df6 = pd.read_csv("bitcoin_news_jun_jul_2025.csv")

# solo guardar texto y fecha
df6 = df6[["date", "rawContent"]]

keywords = re.compile(r"\b(bitcoin|crypto|cryptocurrency|btc|blockchain)\b", re.I)

df6 = df6[df6["rawContent"].str.contains(keywords, na=False)]

# guardar en csv
df6.to_csv("bitcoin_news_jun_jul_2025_clean.csv", index=False)

  df6 = df6[df6["rawContent"].str.contains(keywords, na=False)]


In [3]:
import pandas as pd
import requests, time
from datetime import timedelta


def get_bitcoin_price(timestamp):
    """
    Returns the opening price (BTC/USDT) of the 1-hour candle whose
    start is the first ≥ timestamp. If it fails, returns None.
    """
    ms = int(timestamp.timestamp() * 1000)        # Binance uses milliseconds
    url = "https://api.binance.com/api/v3/klines"
    params = {
        "symbol": "BTCUSDT",
        "interval": "1h",
        "startTime": ms,
        "endTime": ms + 3_600_000,                # +1 h
        "limit": 1
    }
    r = requests.get(url, params=params, timeout=10)
    if r.status_code == 200:
        data = r.json()
        if data:
            return float(data[0][1])              # precio de apertura
    return None


def compute_price_factor():
    in_file  = "bitcoin_news_jun_jul_2025_clean.csv"
    out_file = "dataset_test_factor2_2025.csv"
    
    print(f"Leyendo {in_file} …")
    df = pd.read_csv(in_file)
    
    # Normalizar nombres de columnas
    df.rename(columns={"date": "Fecha", "rawContent": "Texto"}, inplace=True)
    df["Fecha"] = pd.to_datetime(df["Fecha"], utc=True, errors="coerce")
    
    print(f"Procesando {len(df)} tuits …")
    rows = []
    
    for i, row in df.iterrows():
        ts = row["Fecha"]
        texto = row["Texto"]
        
        # Timestamps
        t_before = ts - timedelta(hours=1)
        t_after  = ts + timedelta(hours=2)
        
        price_before = get_bitcoin_price(t_before)
        price_after  = get_bitcoin_price(t_after)
        factor = (price_after / price_before) if price_before and price_after else None
        
        rows.append({
            "Publication_Date": ts,
            "Text": texto,
            "Multiplication_Factor": factor
        })
        
        time.sleep(0.1)   # evitar rate-limit
    
    result = pd.DataFrame(rows)
    result.to_csv(out_file, index=False, encoding="utf-8")
    
    print(f"\nSaved: {out_file}  —  {len(result)} rows")
    print(f"Tweets with calculated factor: {result['Factor_Multiplicacion'].notna().sum()}")
    
    return result

compute_price_factor()


Leyendo bitcoin_news_jun_jul_2025_clean.csv …
Procesando 529 tuits …

Guardado: dataset_test_factor2_2025.csv  —  529 filas
Tuits con factor calculado: 529


Unnamed: 0,Fecha_Publicacion,Texto,Factor_Multiplicacion
0,2025-07-02 16:34:16+00:00,“They’ll keep printing until the system implod...,1.007368
1,2025-07-03 21:10:18+00:00,NEW: 🇺🇸 Leaders from the House of Representati...,0.996705
2,2025-07-03 20:37:20+00:00,An unknown wallet recently sent $20K in BTC to...,0.999051
3,2025-07-03 19:30:30+00:00,RT @_Rob_Wallace: What Trump would look like t...,1.000996
4,2025-07-03 19:30:05+00:00,What Trump would look like trying to strong-ar...,1.000996
...,...,...,...
524,2025-06-02 20:36:35+00:00,JUST IN: 🚨 Strategy plans to launch an IPO for...,1.012160
525,2025-06-02 20:33:42+00:00,Why Bitcoin?\n\nThis is why. https://t.co/ULBl...,1.012160
526,2025-06-02 19:48:55+00:00,Gold has outperformed Bitcoin since the start ...,1.005562
527,2025-06-02 19:03:50+00:00,"JUST IN: Tether Group moves 14,000 BTC as part...",1.005562
