In [21]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np

# Ticker symbol that selects which pair of CSV files is loaded.
ticker = "GOOGL"

# Both inputs live in the sibling Data directory and are keyed by the ticker.
tweets_csv = f"../Data/tweets_sentiment_{ticker}.csv"
prices_csv = f"../Data/{ticker}.csv"

# Tweet-level sentiment records (later cells read `post_date`, `sentiment`
# and `sentiment_score` from this frame).
tweets = pd.read_csv(tweets_csv)
# Price/indicator table (later cells read its `Date` column).
financial_data = pd.read_csv(prices_csv)

## Preprocesado de Tweets

In [22]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np

# Make sure the tweet timestamps are real datetimes before filtering on them.
tweets["post_date"] = pd.to_datetime(tweets["post_date"])

# Keep only tweets posted on or after 16 October 2015.
filtered_tweets = tweets[tweets["post_date"] >= "2015-10-16"].copy()

# Truncate timestamps to midnight so the aggregation below is per calendar day
# even if `post_date` carries a time of day (a no-op for date-only values).
filtered_tweets["post_date"] = filtered_tweets["post_date"].dt.normalize()

# Map sentiment labels to numeric values; labels missing from the mapping
# become NaN and are skipped by the mean/sum aggregations.
sentiment_mapping = {"Neutral": 0, "Positive": 1, "Negative": -1}
filtered_tweets["sentiment_value"] = filtered_tweets["sentiment"].map(sentiment_mapping)

# Per-day aggregates. The confidence-weighted numerator is computed as a
# column up front so the group-by uses built-in reducers instead of a Python
# lambda that looks rows up in the outer frame for every group.
daily_totals = (
    filtered_tweets
    .assign(
        weighted_value=filtered_tweets["sentiment_value"] * filtered_tweets["sentiment_score"]
    )
    .groupby("post_date")
    .agg(
        avg_sentiment=("sentiment_value", "mean"),
        weighted_sum=("weighted_value", "sum"),
        score_sum=("sentiment_score", "sum"),
        # Number of tweets per day
        tweet_count=("sentiment_value", "size"),
    )
)

daily_sentiment = pd.DataFrame(
    {
        "avg_sentiment": daily_totals["avg_sentiment"],
        # Sentiment averaged with `sentiment_score` as the weight; the small
        # epsilon avoids a division by zero when the scores sum to 0.
        "weighted_sentiment": daily_totals["weighted_sum"] / (daily_totals["score_sum"] + 1e-9),
        "tweet_count": daily_totals["tweet_count"],
    }
).reset_index()

# Moving averages of the weighted sentiment over 7 and 14 rows.
# NOTE: an integer window counts rows, i.e. days that have at least one tweet,
# not calendar days; days without tweets are simply absent from this frame.
daily_sentiment["sentiment_sma_7"] = daily_sentiment["weighted_sentiment"].rolling(window=7).mean()
daily_sentiment["sentiment_sma_14"] = daily_sentiment["weighted_sentiment"].rolling(window=14).mean()

# Replace any infinite values with 0, then fill the NaNs left by the
# moving-average warm-up (and by days with no mappable sentiment) with 0.
daily_sentiment = daily_sentiment.replace([np.inf, -np.inf], 0).fillna(0)

# Standardise the sentiment features (zero mean, unit variance).
sentiment_features = [
    'avg_sentiment', 'weighted_sentiment', 'sentiment_sma_7', 'sentiment_sma_14', 'tweet_count'
]

# NOTE(review): the scaler is fit on the full history; if a train/test split
# happens downstream, confirm this does not leak test-period statistics.
scaler_standard = StandardScaler()
daily_sentiment[sentiment_features] = scaler_standard.fit_transform(daily_sentiment[sentiment_features])

# Display the resulting frame to verify it.
daily_sentiment


Unnamed: 0,post_date,avg_sentiment,weighted_sentiment,tweet_count,sentiment_sma_7,sentiment_sma_14
0,2015-10-19,-0.072735,-0.057283,0.471818,-0.116174,-0.137780
1,2015-10-20,1.505291,1.505101,-0.603545,-0.116174,-0.137780
2,2015-10-21,1.505291,1.505101,-0.603545,-0.116174,-0.137780
3,2015-10-22,1.505291,1.505101,-0.245091,-0.116174,-0.137780
4,2015-10-23,1.505291,1.505101,0.113363,-0.116174,-0.137780
...,...,...,...,...,...,...
1010,2019-12-27,1.189686,1.123136,0.830272,1.250177,1.971825
1011,2019-12-28,1.505291,1.505101,-0.603545,1.624119,2.110364
1012,2019-12-29,0.979282,0.833694,0.113363,1.406798,2.222966
1013,2019-12-30,-0.072735,-0.137553,0.113363,0.875104,1.881654


## Unión de Datos

In [23]:
# Financial dataset: make sure the merge key is a real datetime.
financial_data["Date"] = pd.to_datetime(financial_data["Date"])

# This cell rebinds `financial_data`, so a second run would otherwise merge the
# sentiment columns again (creating `_x`/`_y` duplicates and breaking the
# later `drop(columns=["post_date"])`). Dropping whatever a previous run left
# behind keeps the cell idempotent; on a fresh kernel nothing is dropped.
stale_columns = [col for col in daily_sentiment.columns if col in financial_data.columns]

# Left-join the daily sentiment onto the price rows by date. Price rows with
# no matching sentiment day get NaN in the sentiment columns.
# `validate` makes pandas raise if `post_date` is not unique on the right,
# which would silently duplicate price rows.
financial_data = financial_data.drop(columns=stale_columns).merge(
    daily_sentiment,
    left_on="Date", right_on="post_date",
    how="left",
    validate="many_to_one",
)


In [24]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Drop the redundant right-hand merge key, then discard every row that still
# has a missing value in any column.
data = financial_data.drop(columns=["post_date"]).copy()
data_filt = data.dropna().copy()

# Ensure 'Date' is datetime and use it as the index. Plain column assignment
# is used because `.loc[:, col] = ...` writes into the existing column and may
# keep its old dtype under pandas >= 2.
data_filt["Date"] = pd.to_datetime(data_filt["Date"])
data_filt = data_filt.set_index("Date")

# Financial feature columns to standardise.
financial_features = ['Open', 'High', 'Low', 'Close', 'Volume', 'SMA_50', 'SMA_200', 'RSI', 'MACD', 'MACD_signal']

# Standardise the financial features (zero mean, unit variance). The sentiment
# columns are left untouched here.
# NOTE(review): the scaler is fit on all remaining rows; if a train/test split
# happens downstream, confirm this does not leak test-period statistics.
scaler = StandardScaler()
data_filt[financial_features] = scaler.fit_transform(data_filt[financial_features])

data_filt


Unnamed: 0_level_0,Open,High,Low,Close,Volume,SMA_50,SMA_200,RSI,MACD,MACD_signal,avg_sentiment,weighted_sentiment,tweet_count,sentiment_sma_7,sentiment_sma_14
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2016-10-18,-2.045102,-2.050080,-2.045353,-2.105981,0.580916,-2.139009,-2.098221,0.382844,-0.110549,-0.109188,-0.072735,-0.057258,-0.603545,-1.222610,-1.261490
2016-10-19,-2.004292,-2.042568,-1.971962,-2.039654,-0.347319,-2.136098,-2.095983,0.886762,-0.042613,-0.095214,1.505291,1.505101,-0.603545,-0.716906,-1.261490
2016-10-20,-2.044034,-2.052660,-1.989868,-2.003852,-0.472228,-2.132852,-2.093045,1.119248,0.033641,-0.067537,0.716278,0.606170,-0.245091,-0.324632,-1.123642
2016-10-24,-1.936402,-1.980800,-1.924623,-1.984497,-0.409068,-2.127387,-2.087114,0.929473,0.108871,-0.009208,1.189686,1.160207,0.830272,1.335268,0.103211
2016-10-25,-1.991247,-1.976551,-1.953365,-1.920159,0.112094,-2.122434,-2.083057,1.274355,0.200267,0.037320,-0.072735,-0.057258,-0.603545,1.504042,0.192034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-20,1.995747,2.011033,2.084505,2.093099,0.834029,1.771276,1.331527,0.997964,0.993137,0.984304,-0.861747,-0.932837,-0.245091,0.524351,1.874969
2019-12-26,2.081563,2.004964,2.044390,1.966489,-0.720165,1.833964,1.351142,0.209919,0.786306,0.964794,0.979282,1.004766,0.113363,1.373811,1.876757
2019-12-27,2.021836,2.034178,2.101950,2.099984,-0.746629,1.853304,1.357423,0.739912,0.794207,0.945028,1.189686,1.123136,0.830272,1.250177,1.971825
2019-12-30,1.907947,1.957916,1.985447,2.044980,-0.935875,1.869988,1.363468,0.409208,0.746764,0.918950,-0.072735,-0.137553,0.113363,0.875104,1.881654


In [25]:
# Persist the merged, scaled dataset next to the inputs; the `Date` index is
# written as the first column.
global_data_csv = f"../Data/Global_Data_{ticker}.csv"
data_filt.to_csv(global_data_csv)