In [34]:
import pandas as pd

tweets = pd.read_csv("../Data/tweets_sentiment.csv")
financial_data = pd.read_csv("../Data/AAPL_2015_2020.csv")

## Preprocesado de Tweets

In [35]:
# Asegúrate de que las fechas están en formato datetime
tweets["post_date"] = pd.to_datetime(tweets["post_date"])

# Filtrar las fechas a partir del 16 de octubre de 2015
filtered_tweets = tweets[tweets["post_date"] >= "2015-10-19"]

# Mapear etiquetas de sentimiento a valores numéricos
sentiment_mapping = {"Neutral": 0, "Positive": 1, "Negative": -1}
filtered_tweets["sentiment_value"] = filtered_tweets["sentiment"].map(sentiment_mapping)

# Agrupar por fecha y calcular métricas
daily_sentiment = filtered_tweets.groupby("post_date").agg(
    average_sentiment=("sentiment_value", "mean"),  # Promedio del sentimiento diario
    weighted_sentiment=("sentiment_value", lambda x: (x * filtered_tweets.loc[x.index, "sentiment_score"]).sum() / filtered_tweets.loc[x.index, "sentiment_score"].sum() if filtered_tweets.loc[x.index, "sentiment_score"].sum() > 0 else 0),  # Promedio ponderado
    dominant_sentiment=("sentiment", lambda x: x.mode()[0] if not x.empty else "Neutral"),  # Sentimiento dominante
    average_score=("sentiment_score", "mean"),  # Promedio del puntaje de predicción
    tweet_count=("sentiment", "count")  # Total de tweets por día
).reset_index()

# Mostrar resultados
daily_sentiment

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_tweets["sentiment_value"] = filtered_tweets["sentiment"].map(sentiment_mapping)


Unnamed: 0,post_date,average_sentiment,weighted_sentiment,dominant_sentiment,average_score,tweet_count
0,2015-10-19,-0.500000,-0.506856,Negative,0.980007,2
1,2015-10-20,0.000000,-0.129423,Negative,0.878878,3
2,2015-10-23,0.333333,0.228007,Positive,0.861233,3
3,2015-10-24,0.000000,0.000000,Neutral,0.646064,1
4,2015-10-25,0.000000,-0.109117,Neutral,0.796431,5
...,...,...,...,...,...,...
1375,2019-12-27,-0.166667,-0.174565,Negative,0.764248,6
1376,2019-12-28,-0.142857,-0.074912,Negative,0.864262,7
1377,2019-12-29,0.285714,0.453300,Neutral,0.795621,7
1378,2019-12-30,-0.038462,-0.048378,Negative,0.806989,26


## Unión de Datos

In [37]:
# Dataset financiero
financial_data["Date"] = pd.to_datetime(financial_data["Date"])

# Unir los datasets por fecha
financial_data = financial_data.merge(
    daily_sentiment,
    left_on="Date", right_on="post_date",
    how="left"
)

# Rellenar valores faltantes
financial_data.fillna({
    "average_sentiment": 0,
    "weighted_sentiment": 0,
    "tweet_count": 0,
    "sentiment_volatility": 0,
    "dominant_sentiment": "Neutral",
    "average_score": 0
}, inplace=True)

# Calcular el momentum del sentimiento
financial_data["sentiment_momentum"] = financial_data["average_sentiment"].diff()

In [40]:
data = financial_data.drop(columns=["post_date"])
data_filt = data.dropna()

# Convertir la columna 'Date' a tipo de dato datetime (si no lo está ya)
data_filt['Date'] = pd.to_datetime(data_filt['Date'])

# Establecer la columna 'Date' como índice
data_filt.set_index('Date', inplace=True)

# Verificar los primeros registros
data_filt

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filt['Date'] = pd.to_datetime(data_filt['Date'])


Unnamed: 0_level_0,Open,High,Low,Close,Volume,SMA_50,SMA_200,RSI,MACD,MACD_signal,average_sentiment,weighted_sentiment,dominant_sentiment,average_score,tweet_count,sentiment_momentum
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2015-10-19,0.090084,0.106323,0.099259,0.103901,0.150787,0.094038,0.164990,0.407746,0.534519,0.498572,-0.500000,-0.506856,Negative,0.980007,2.0,-0.500000
2015-10-20,0.099293,0.116463,0.111221,0.107475,0.308225,0.093352,0.165224,0.436712,0.538969,0.501572,0.000000,-0.129423,Negative,0.878878,3.0,0.500000
2015-10-21,0.099248,0.116413,0.118191,0.121973,0.249437,0.093397,0.165617,0.539981,0.554379,0.507335,0.000000,0.000000,Neutral,0.000000,0.0,0.000000
2015-10-22,0.107102,0.125062,0.117795,0.123987,0.248280,0.093934,0.165937,0.552829,0.568103,0.514942,0.000000,0.000000,Neutral,0.000000,0.0,0.000000
2015-10-23,0.123263,0.142857,0.136233,0.135213,0.393458,0.094149,0.166267,0.619516,0.587800,0.525326,0.333333,0.228007,Positive,0.861233,3.0,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-23,0.963517,0.962621,0.951953,0.960987,0.108854,0.975406,0.982067,0.759345,0.889810,0.859900,0.700000,0.683708,Positive,0.869511,10.0,0.245455
2019-12-24,0.964827,0.963963,0.955116,0.973823,0.006210,0.981204,0.986548,0.789987,0.901209,0.869707,0.200000,0.140574,Positive,0.869631,10.0,-0.500000
2019-12-26,0.992189,0.991997,0.980277,0.982784,0.097685,0.987190,0.991040,0.809856,0.913457,0.880227,0.000000,-0.032318,Negative,0.831924,18.0,-0.200000
2019-12-27,0.991655,0.991450,1.000000,1.000000,0.206581,0.993788,0.995608,0.844339,0.932905,0.892887,-0.166667,-0.174565,Negative,0.764248,6.0,-0.166667


In [39]:
data_filt.to_csv("../Data/Global_Data.csv")

In [1]:
from datasets import load_dataset

dataset = load_dataset("monash_tsf", "tourism_monthly")

README.md:   0%|          | 0.00/31.2k [00:00<?, ?B/s]

monash_tsf.py:   0%|          | 0.00/25.6k [00:00<?, ?B/s]

utils.py:   0%|          | 0.00/7.54k [00:00<?, ?B/s]

ValueError: The repository for monash_tsf contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/monash_tsf.
Please pass the argument `trust_remote_code=True` to allow custom code to be run.