<a href="https://colab.research.google.com/github/wiktorialasek/Thesis/blob/etap1/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Attach Google Drive to this Colab runtime so its files are reachable
# under the mount point below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import ast
import pandas as pd
import yfinance as yf
from datetime import timedelta

In [23]:
# Path to the input CSV file (on the mounted Google Drive).
INPUT_PATH = '/content/drive/MyDrive/Colab Notebooks/filtered_elonmusk_tweets_with_tickers (2).csv'

# How many rows of the CSV we want to process.
NROWS = 15

# Tickers that the author marked as fully supported by yfinance;
# tickers outside this set are ignored by the rest of the notebook.
AVAILABLE = {
    "TSLA", "AAPL", "MSFT", "AMZN", "GOOGL", "META",
    "BTC-USD", "ETH-USD", "DOGE-USD",
}

# === Cell 3: Load & initial preprocessing ===
def _parse_found_tickers(cell):
    """Turn one raw `Found_Tickers` cell into a list of tickers.

    Strings are evaluated as Python literals, lists are passed through
    unchanged, and anything else (e.g. a missing value) becomes [].
    """
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    if isinstance(cell, list):
        return cell
    return []


def _mentions_available(tickers):
    """Return True when at least one ticker of the tweet is in AVAILABLE."""
    for symbol in tickers:
        if symbol in AVAILABLE:
            return True
    return False


# Read only the first NROWS rows of the input file.
df = pd.read_csv(INPUT_PATH, nrows=NROWS)

# Parse the ticker column into real Python lists.
df['Found_Tickers'] = df['Found_Tickers'].apply(_parse_found_tickers)

# Parse the timestamp and keep its date part in a separate column.
df['createdAt'] = pd.to_datetime(df['createdAt'])
df['date'] = df['createdAt'].dt.date

# Keep only the tweets that mention at least one ticker from AVAILABLE.
has_available_ticker = df['Found_Tickers'].apply(_mentions_available)
df = df[has_available_ticker].reset_index(drop=True)

# === Cell 4: Download price history for every unique ticker ===
# Each supported ticker mentioned in the filtered tweets is downloaded once,
# for the whole date range, and cached in `histories` (ticker -> DataFrame).
tickers_to_fetch = sorted(
    {t for lst in df['Found_Tickers'] for t in lst if t in AVAILABLE}
)

histories = {}
if df.empty:
    # No tweet survived the filtering: min()/max() of an empty column is NaN
    # and `NaN + timedelta` raises, so skip the download step entirely.
    min_date = max_date = None
else:
    # Date range covered by the tweets. One day is added past the last tweet
    # date so that day is still inside the requested range (yfinance documents
    # `end` as exclusive).
    min_date = df['date'].min()
    max_date = df['date'].max() + timedelta(days=1)

    for ticker in tickers_to_fetch:
        histories[ticker] = yf.download(
            ticker,
            start=str(min_date),
            end=str(max_date),
            interval='1d',
            # NOTE(review): passed explicitly instead of relying on the library
            # default. The warning printed for this call is presumably
            # yfinance's notice that the default became True; confirm against
            # the installed yfinance version.
            auto_adjust=True,
            progress=False,
        )

# === Cell 5: Helper that pulls OHLCV values out of the downloaded data ===
def get_daily_ohlc(ticker, date):
    """Look up one day's Low/High/Open/Close/Volume for `ticker`.

    The data comes from the module-level `histories` dict; nothing is
    downloaded here.

    Parameters
    ----------
    ticker : key into `histories` (may be None).
    date   : the day to look up; its `str()` form is used as the row label.

    Returns
    -------
    A 5-tuple in the order (low, high, open, close, volume). All five items
    are None when the ticker has no (or empty) history or when the date is
    not present in it.

    NOTE(review): the items are whatever `row[...]` yields. The notebook
    output suggests they can be one-element Series labelled by ticker rather
    than plain floats; confirm against the column layout yfinance returns.
    """
    no_data = (None,) * 5

    hist = histories.get(ticker)
    if hist is None or hist.empty:
        return no_data

    try:
        day = hist.loc[str(date)]
        return tuple(day[field] for field in ('Low', 'High', 'Open', 'Close', 'Volume'))
    except KeyError:
        # The date (or one of the expected fields) is missing.
        return no_data

# === Cell 6: Add the daily_* columns ===
# Column order matches the tuple returned by get_daily_ohlc:
# (low, high, open, close, volume).
daily_cols = ['daily_low','daily_high','daily_open','daily_close','daily_volume']
for col_name in daily_cols:
    df[col_name] = None  # initialise, so every row has a placeholder value

for row_label, tweet in df.iterrows():
    # For simplicity only the first ticker that is in AVAILABLE is used.
    first_supported = None
    for candidate in tweet['Found_Tickers']:
        if candidate in AVAILABLE:
            first_supported = candidate
            break

    day_values = get_daily_ohlc(first_supported, tweet['date'])

    # Write the five values cell by cell into the matching columns.
    for col_name, value in zip(daily_cols, day_values):
        df.at[row_label, col_name] = value

# === Cell 7: Save & preview ===
OUTPUT_PATH = '/content/filtered_tweets_with_daily_ohlc.csv'

# Columns shown in the preview: tweet fields followed by the daily_* columns.
cols_out = ['id', 'fullText', 'createdAt', 'Found_Tickers', 'date', *daily_cols]

# The whole frame (not only cols_out) is written, without the index column.
df.to_csv(OUTPUT_PATH, index=False)
print(f"Zapisano wzbogacony plik: {OUTPUT_PATH}")

# Last expression of the cell: display the first rows of the selected columns.
df.loc[:, cols_out].head()


Zapisano wzbogacony plik: /content/filtered_tweets_with_daily_ohlc.csv


  histories[ticker] = yf.download(


Unnamed: 0,id,fullText,createdAt,Found_Tickers,date,daily_low,daily_high,daily_open,daily_close,daily_volume
0,1602885009647366144,RT @Tesla: Holiday Update rolling out now 🎅,2022-12-14 04:35:41+00:00,[TSLA],2022-12-14,Ticker TSLA 155.309998 Name: 2022-12-14 00:...,Ticker TSLA 161.619995 Name: 2022-12-14 00:...,Ticker TSLA 159.25 Name: 2022-12-14 00:00:0...,Ticker TSLA 156.800003 Name: 2022-12-14 00:...,Ticker TSLA 140682300.0 Name: 2022-12-14 00...
1,1444361244996259843,♥️♥️ Congrats Tesla team! ♥️♥️,2021-10-02 17:58:50+00:00,[TSLA],2021-10-02,,,,,
2,918516707735437312,"@Barnacules To be more inclusive, how about a ...",2017-10-12 16:40:22+00:00,[TSLA],2017-10-12,Ticker TSLA 23.509333 Name: 2017-10-12 00:0...,Ticker TSLA 23.985332 Name: 2017-10-12 00:0...,Ticker TSLA 23.530001 Name: 2017-10-12 00:0...,Ticker TSLA 23.712 Name: 2017-10-12 00:00:0...,Ticker TSLA 61305000.0 Name: 2017-10-12 00:...
3,916395155120205824,Tesla Semi unveil now Nov 16. Diverting resour...,2017-10-06 20:10:05+00:00,[TSLA],2017-10-06,Ticker TSLA 23.483334 Name: 2017-10-06 00:0...,Ticker TSLA 24.006666 Name: 2017-10-06 00:0...,Ticker TSLA 23.540001 Name: 2017-10-06 00:0...,Ticker TSLA 23.792 Name: 2017-10-06 00:00:0...,Ticker TSLA 64462500.0 Name: 2017-10-06 00:...
4,916376799466024832,@MacTechGenius @Jon4Lakers @solarcity @Tesla N...,2017-10-06 18:57:09+00:00,[TSLA],2017-10-06,Ticker TSLA 23.483334 Name: 2017-10-06 00:0...,Ticker TSLA 24.006666 Name: 2017-10-06 00:0...,Ticker TSLA 23.540001 Name: 2017-10-06 00:0...,Ticker TSLA 23.792 Name: 2017-10-06 00:00:0...,Ticker TSLA 64462500.0 Name: 2017-10-06 00:...


In [24]:
import re

# Assumes the DataFrame built by the previous cells is available as `df`.
cols = ['daily_low','daily_high','daily_open','daily_close','daily_volume']

# Fallback for cells that only hold the *text* of a Series repr
# ("... <number>\nName: ..."). It accepts an optional sign and scientific
# notation, which a plain digits-and-dot pattern would miss.
_REPR_NUMBER = re.compile(r'([-+]?\d+(?:\.\d*)?(?:[eE][-+]?\d+)?)\s+Name:')


def _cell_to_number(value):
    """Reduce one daily_* cell to something `pd.to_numeric` can convert.

    * a pandas Series (as stored by the fill loop above) -> its first
      element, or None when the Series is empty;
    * a string containing a Series repr -> the number printed before "Name:";
    * anything else (float, None, NaN, plain numeric string) -> unchanged.
    """
    if isinstance(value, pd.Series):
        value = value.iloc[0] if len(value) else None
    if isinstance(value, str):
        found = _REPR_NUMBER.search(value)
        if found:
            value = found.group(1)
    return value


# Convert every daily_* column to float. Reading the value out of the stored
# object (instead of parsing its printed form) keeps full precision, and the
# cell can be re-run on already-numeric columns without turning them into NaN.
# Values that cannot be interpreted as numbers become NaN.
for col in cols:
    df[col] = pd.to_numeric(df[col].map(_cell_to_number), errors='coerce').astype(float)

print(df[cols].head(10))
cols_out = ['id','fullText','createdAt','Found_Tickers','date'] + cols
# preview
# podgląd


    daily_low  daily_high  daily_open  daily_close  daily_volume
0  155.309998  161.619995  159.250000   156.800003   140682300.0
1         NaN         NaN         NaN          NaN           NaN
2   23.509333   23.985332   23.530001    23.712000    61305000.0
3   23.483334   24.006666   23.540001    23.792000    64462500.0
4   23.483334   24.006666   23.540001    23.792000    64462500.0
5   23.483334   24.006666   23.540001    23.792000    64462500.0
6   23.972668   24.538000   24.254667    24.415333    62778000.0
7         NaN         NaN         NaN          NaN           NaN
8   21.752666   23.166668   23.066668    22.297333   124536000.0
9   23.653334   25.124666   24.961332    23.821333   258921000.0


In [25]:
# Show the first 20 rows, restricted to the columns listed in cols_out.
df.loc[:, cols_out].head(20)



Unnamed: 0,id,fullText,createdAt,Found_Tickers,date,daily_low,daily_high,daily_open,daily_close,daily_volume
0,1602885009647366144,RT @Tesla: Holiday Update rolling out now 🎅,2022-12-14 04:35:41+00:00,[TSLA],2022-12-14,155.309998,161.619995,159.25,156.800003,140682300.0
1,1444361244996259843,♥️♥️ Congrats Tesla team! ♥️♥️,2021-10-02 17:58:50+00:00,[TSLA],2021-10-02,,,,,
2,918516707735437312,"@Barnacules To be more inclusive, how about a ...",2017-10-12 16:40:22+00:00,[TSLA],2017-10-12,23.509333,23.985332,23.530001,23.712,61305000.0
3,916395155120205824,Tesla Semi unveil now Nov 16. Diverting resour...,2017-10-06 20:10:05+00:00,[TSLA],2017-10-06,23.483334,24.006666,23.540001,23.792,64462500.0
4,916376799466024832,@MacTechGenius @Jon4Lakers @solarcity @Tesla N...,2017-10-06 18:57:09+00:00,[TSLA],2017-10-06,23.483334,24.006666,23.540001,23.792,64462500.0
5,916336207935635456,@ismailnathij @stapf @Tesla The internal Tesla...,2017-10-06 16:15:51+00:00,[TSLA],2017-10-06,23.483334,24.006666,23.540001,23.792,64462500.0
6,908108029777686400,Tesla Semi truck unveil &amp; test ride tentat...,2017-09-13 23:20:00+00:00,[TSLA],2017-09-13,23.972668,24.538,24.254667,24.415333,62778000.0
7,901238166237069312,"@Techmeme Tesla does not really have ""TTunes""....",2017-08-26 00:21:37+00:00,[TSLA],2017-08-26,,,,,
8,890676198874824704,@mathetes76 Tesla team will verify,2017-07-27 20:52:08+00:00,[TSLA],2017-07-27,21.752666,23.166668,23.066668,22.297333,124536000.0
9,873310575073255424,@samabuelsamid @TeslaMotors I've written two f...,2017-06-09 22:47:21+00:00,[TSLA],2017-06-09,23.653334,25.124666,24.961332,23.821333,258921000.0


In [26]:
# Original prompt: compute the percentage change between open and close and
# store it in a new column, as a bare number without any extra text.

# Percentage change from open to close, rounded to two decimals.
open_price = df['daily_open']
close_price = df['daily_close']
df['daily_pct_change'] = (
    close_price.sub(open_price)
    .div(open_price)
    .mul(100)
    .round(2)
)

# === Save & preview (same output path as the earlier save cell) ===
OUTPUT_PATH = '/content/filtered_tweets_with_daily_ohlc.csv'
cols_out = [
    'id', 'fullText', 'createdAt', 'Found_Tickers', 'date',
    *daily_cols,
    'daily_pct_change',
]

# The whole frame is written, without the index column.
df.to_csv(OUTPUT_PATH, index=False)
print(f"Zapisano wzbogacony plik: {OUTPUT_PATH}")
df.loc[:, cols_out].head()

Zapisano wzbogacony plik: /content/filtered_tweets_with_daily_ohlc.csv


Unnamed: 0,id,fullText,createdAt,Found_Tickers,date,daily_low,daily_high,daily_open,daily_close,daily_volume,daily_pct_change
0,1602885009647366144,RT @Tesla: Holiday Update rolling out now 🎅,2022-12-14 04:35:41+00:00,[TSLA],2022-12-14,155.309998,161.619995,159.25,156.800003,140682300.0,-1.54
1,1444361244996259843,♥️♥️ Congrats Tesla team! ♥️♥️,2021-10-02 17:58:50+00:00,[TSLA],2021-10-02,,,,,,
2,918516707735437312,"@Barnacules To be more inclusive, how about a ...",2017-10-12 16:40:22+00:00,[TSLA],2017-10-12,23.509333,23.985332,23.530001,23.712,61305000.0,0.77
3,916395155120205824,Tesla Semi unveil now Nov 16. Diverting resour...,2017-10-06 20:10:05+00:00,[TSLA],2017-10-06,23.483334,24.006666,23.540001,23.792,64462500.0,1.07
4,916376799466024832,@MacTechGenius @Jon4Lakers @solarcity @Tesla N...,2017-10-06 18:57:09+00:00,[TSLA],2017-10-06,23.483334,24.006666,23.540001,23.792,64462500.0,1.07
