# Construcción del Dataset Global

## Carga de Datos

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np

In [2]:
# Ticker symbol used to build the names of both input files.
ticker = "AAPL"

# Resolve the two CSV locations first, then read them.
tweets_path = f"../Data/tweets_sentiment_{ticker}.csv"
financial_path = f"../Data/{ticker}.csv"

tweets = pd.read_csv(tweets_path)
financial_data = pd.read_csv(financial_path)

## Preprocesado de Tweets

In [4]:
# Build the per-date sentiment feature table from the raw tweets.
# Reads `tweets`; defines `filtered_tweets`, `sentiment_mapping`,
# `daily_sentiment`, `sentiment_features` and `scaler_standard`.

# Convert the post dates to datetime so they can be compared and grouped.
tweets["post_date"] = pd.to_datetime(tweets["post_date"])

# Keep only tweets posted on or after 2015-10-16.
filtered_tweets = tweets[tweets["post_date"] >= "2015-10-16"].copy()

# Map sentiment labels to numeric values.
# Any label that is not a key of the mapping becomes NaN.
sentiment_mapping = {"Neutral": 0, "Positive": 1, "Negative": -1}
filtered_tweets["sentiment_value"] = filtered_tweets["sentiment"].map(sentiment_mapping)

# Per-tweet contribution to the confidence-weighted sentiment. Computing it
# once here lets the aggregation below use built-in reducers instead of a
# Python lambda that looks rows up in the outer frame for every group.
scored_tweets = filtered_tweets.assign(
    weighted_value=filtered_tweets["sentiment_value"] * filtered_tweets["sentiment_score"]
)

# Aggregate per distinct `post_date` value.
# NOTE(review): this yields one row per calendar day only if `post_date`
# carries no time-of-day component — confirm against the input file.
daily_totals = scored_tweets.groupby("post_date").agg(
    avg_sentiment=("sentiment_value", "mean"),   # mean sentiment of the group
    weighted_sum=("weighted_value", "sum"),      # sum of sentiment * score
    score_sum=("sentiment_score", "sum"),        # sum of scores (the weights)
    tweet_count=("sentiment_value", "size"),     # number of tweets in the group
)

# Weighted mean = sum(sentiment * score) / sum(score); the small epsilon
# avoids a division by zero when the scores of a group sum to zero.
daily_totals["weighted_sentiment"] = (
    daily_totals["weighted_sum"] / (daily_totals["score_sum"] + 1e-9)
)

# Keep the published column order and turn `post_date` back into a column.
daily_sentiment = daily_totals[
    ["avg_sentiment", "weighted_sentiment", "tweet_count"]
].reset_index()

# Moving averages of the weighted sentiment over the last 7 and 14 ROWS of
# `daily_sentiment`. The window counts rows (dates that have tweets), not
# calendar days, so it spans more than 7/14 days whenever dates are missing.
# The first window-1 rows of each average are NaN.
daily_sentiment["sentiment_sma_7"] = daily_sentiment["weighted_sentiment"].rolling(window=7).mean()
daily_sentiment["sentiment_sma_14"] = daily_sentiment["weighted_sentiment"].rolling(window=14).mean()

# Replace infinities and NaN (including the rolling warm-up rows) with 0.
# This happens before scaling, so those zeros take part in the mean/std below.
daily_sentiment = daily_sentiment.replace([np.inf, -np.inf], 0).fillna(0)

# Standardise the sentiment features; the scaler is fitted on every row.
sentiment_features = ['avg_sentiment', 'weighted_sentiment', 'sentiment_sma_7', 'sentiment_sma_14', 'tweet_count']
scaler_standard = StandardScaler()
daily_sentiment[sentiment_features] = scaler_standard.fit_transform(daily_sentiment[sentiment_features])

daily_sentiment.head()


Unnamed: 0,post_date,avg_sentiment,weighted_sentiment,tweet_count,sentiment_sma_7,sentiment_sma_14
0,2015-10-17,1.497059,1.500741,-0.456950,0.039209,0.060053
1,2015-10-20,-0.752563,-0.725875,-0.190542,0.039209,0.060053
2,2015-10-23,-1.502437,-1.467483,-0.456950,0.039209,0.060053
3,2015-10-25,-0.002689,-0.242086,0.075867,0.039209,0.060053
4,2015-10-27,0.497227,0.545833,0.875092,0.039209,0.060053
...,...,...,...,...,...,...
908,2019-12-26,-0.302639,-0.439017,0.608684,-0.016292,0.222626
909,2019-12-27,-1.502437,-1.467483,-0.456950,-0.882649,-0.048287
910,2019-12-28,-0.752563,-0.640745,-0.190542,-0.580844,-0.363886
911,2019-12-30,0.497227,0.551162,0.075867,-0.828624,-0.612379


## Unión de Datos

In [None]:
# Attach the sentiment features to the financial rows by date.
# Reads `daily_sentiment`; replaces `financial_data` with the merged frame.

# Convert the financial dates to datetime so `Date` can be matched against `post_date`.
financial_data["Date"] = pd.to_datetime(financial_data["Date"])

# The merged result is assigned back to `financial_data`, so when this cell is
# re-run the frame already contains `post_date` and the sentiment columns.
# Merging again would then create `_x`/`_y` suffixed duplicates and break the
# later `drop(columns=["post_date"])`. Removing any column that also exists in
# `daily_sentiment` first makes the cell safe to run more than once.
# On a first run the list is empty unless the financial file itself already has
# a column with one of those names, in which case that column is dropped too.
stale_columns = [
    column for column in daily_sentiment.columns if column in financial_data.columns
]

# Left join: every financial row is kept; dates without sentiment data get NaN.
financial_data = financial_data.drop(columns=stale_columns).merge(
    daily_sentiment,
    left_on="Date", right_on="post_date",
    how="left",
)

## Procesamiento final y normalización

In [None]:
# Remove the join key that duplicates `Date`; `data` still contains rows with missing values.
data = financial_data.drop(columns="post_date")

# Keep only rows without any missing value and use the date as the index.
data_filt = (
    data.dropna()
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
    .set_index("Date")
)

# Financial columns to standardise.
financial_features = [
    'Open', 'High', 'Low', 'Close', 'Volume',
    'SMA_50', 'SMA_200', 'RSI', 'MACD', 'MACD_signal',
]

# Fit the scaler on all remaining rows and overwrite the columns with the scaled values.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data_filt[financial_features])
data_filt[financial_features] = scaled_values

data_filt


Unnamed: 0_level_0,Open,High,Low,Close,Volume,SMA_50,SMA_200,RSI,MACD,MACD_signal,avg_sentiment,weighted_sentiment,tweet_count,sentiment_sma_7,sentiment_sma_14
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2016-10-19,-1.923882,-1.940571,-1.987976,-1.923895,-0.843739,-2.227854,-2.383641,0.439562,0.181474,0.174251,-1.502437,-1.467483,-0.456950,-0.260961,0.630972
2016-10-20,-1.925562,-1.951164,-1.916395,-1.934864,-0.541287,-2.222643,-2.380764,0.292853,0.155432,0.174354,-1.502437,-1.467483,-0.456950,-0.802744,-0.145774
2016-10-24,-1.909044,-1.941128,-1.897439,-1.928114,-0.584689,-2.211383,-2.374290,0.349188,0.107621,0.159935,1.497059,1.500741,-0.190542,0.039209,0.621624
2016-10-25,-1.892245,-1.923845,-1.888668,-1.904206,1.233208,-2.205110,-2.370941,0.564752,0.103712,0.152062,0.997143,1.065735,0.075867,-0.119592,0.507788
2016-10-26,-1.966719,-1.997995,-2.001839,-2.006585,2.564286,-2.202117,-2.368512,-0.690204,0.020824,0.128393,-1.502437,-1.467483,0.075867,-0.119592,-0.268958
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-24,2.755953,2.718324,2.796886,2.785545,-1.428867,2.529751,1.649004,1.309762,1.359274,1.235763,-0.502605,-0.894454,0.075867,0.889875,0.699442
2019-12-26,2.913861,2.860212,2.847247,2.789201,-0.603792,2.560719,1.667106,1.317796,1.399266,1.284244,-0.302639,-0.439017,0.608684,-0.016292,0.222626
2019-12-27,2.910781,2.971436,2.944008,2.966396,0.378422,2.597645,1.686029,1.659107,1.540049,1.352533,-1.502437,-1.467483,-0.456950,-0.882649,-0.048287
2019-12-30,2.958937,2.935755,2.861959,2.919706,0.338657,2.632411,1.704492,1.367024,1.594799,1.418639,0.497227,0.551162,0.075867,-0.828624,-0.612379


In [22]:
# Write the combined dataset (with its index) to a CSV named after the ticker.
output_path = f"../Data/Global_Data_{ticker}.csv"
data_filt.to_csv(output_path)