In [140]:
import pandas as pd

# Leemos los CSVs necesarios
ticker = "TSLA"
tweets = pd.read_csv(f"../Data/tweets_sentiment_{ticker}.csv")
financial_data = pd.read_csv(f"../Data/{ticker}.csv")

## Preprocesado de Tweets

In [141]:
# Filtramos las fechas a partir del 16 de octubre de 2015
filtered_tweets = tweets[tweets["post_date"] >= "2015-10-19"]

# Mapeamos las etiquetas de sentimiento a valores numéricos
sentiment_mapping = {"Neutral": 0, "Positive": 1, "Negative": -1}
filtered_tweets["sentiment_value"] = filtered_tweets["sentiment"].map(sentiment_mapping)

# Agrupamos por fecha y calculamos diferentes métricas de sentimiento
daily_sentiment = filtered_tweets.groupby("post_date").agg(
    average_sentiment=("sentiment_value", "mean"),  # Promedio del sentimiento diario
    weighted_sentiment=("sentiment_value", lambda x: (x * filtered_tweets.loc[x.index, "sentiment_score"]).sum() / filtered_tweets.loc[x.index, "sentiment_score"].sum() if filtered_tweets.loc[x.index, "sentiment_score"].sum() > 0 else 0),  # Promedio ponderado
    dominant_sentiment=("sentiment", lambda x: x.mode()[0] if not x.empty else "Neutral"),  # Sentimiento dominante
    average_score=("sentiment_score", "mean"),  # Promedio del puntaje de predicción
    tweet_count=("sentiment", "count")  # Total de tweets por día
).reset_index()

# Mostramos los resultados
daily_sentiment

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_tweets["sentiment_value"] = filtered_tweets["sentiment"].map(sentiment_mapping)


Unnamed: 0,post_date,average_sentiment,weighted_sentiment,dominant_sentiment,average_score,tweet_count
0,2015-10-20,0.000000,-0.092719,Negative,0.896345,3
1,2015-10-21,-1.000000,-1.000000,Negative,0.995067,1
2,2015-10-24,0.000000,0.000000,Neutral,0.511483,1
3,2015-10-30,0.333333,0.334575,Positive,0.762805,3
4,2015-11-01,0.000000,0.000000,Neutral,0.939588,1
...,...,...,...,...,...,...
1283,2019-12-27,-0.238532,-0.291217,Negative,0.853422,109
1284,2019-12-28,0.000000,-0.076613,Negative,0.822680,47
1285,2019-12-29,-0.414634,-0.452602,Negative,0.853842,82
1286,2019-12-30,-0.510490,-0.569895,Negative,0.863197,143


## Unión de Datos

In [142]:
# Unimos los datasets por fecha
data = financial_data.merge(
    daily_sentiment,
    left_on="Date", right_on="post_date",
    how="left"
)

# Rellenamos los valores faltantes
data.fillna({
    "average_sentiment": 0,
    "weighted_sentiment": 0,
    "tweet_count": 0,
    "sentiment_volatility": 0,
    "dominant_sentiment": "Neutral",
    "average_score": 0
}, inplace=True)

# Calculamos el momentum del sentimiento
data["sentiment_momentum"] = data["average_sentiment"].diff()

In [143]:
# Establecemos la columna 'Date' como índice
data.set_index('Date', inplace=True)

# Eliminamos la columna post_date introducida por el Dataset de sentimiento
data.drop(columns='post_date', inplace=True)

# Eliminamos la primera fila para evitar el Nan en el sentiment momentum
data = data.dropna()
# Mostramos el Dataset
data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,SMA_50,SMA_200,RSI,MACD,MACD_signal,average_sentiment,weighted_sentiment,dominant_sentiment,average_score,tweet_count,sentiment_momentum
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2016-10-18,0.079890,0.079890,0.058824,0.065247,0.111045,0.102020,0.004886,0.307258,0.377428,0.368171,-1.000000,-1.000000,Negative,0.796722,2.0,-1.000000
2016-10-19,0.097591,0.097591,0.087517,0.084496,0.152703,0.098439,0.004010,0.408188,0.382467,0.365870,0.000000,-0.249294,Negative,0.684881,2.0,1.000000
2016-10-20,0.079890,0.079890,0.072911,0.080446,0.091733,0.094985,0.003254,0.391519,0.385362,0.364697,1.000000,1.000000,Positive,0.840087,1.0,1.000000
2016-10-21,0.083819,0.083819,0.067204,0.081890,0.024050,0.091729,0.002605,0.399131,0.388928,0.364583,0.000000,0.000000,Neutral,0.000000,0.0,-1.000000
2016-10-24,0.094416,0.094416,0.076702,0.093279,0.017954,0.088749,0.002185,0.458094,0.397300,0.366425,0.333333,0.328422,Neutral,0.767217,3.0,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-23,0.953486,0.953486,0.946923,0.934432,0.353841,0.842281,0.383346,0.946618,0.888926,0.802308,-0.168889,-0.210256,Negative,0.834740,225.0,-0.060556
2019-12-24,0.977418,0.977418,0.960731,0.945220,0.186505,0.863020,0.388602,0.952967,0.924306,0.838322,-0.267857,-0.331700,Negative,0.844791,112.0,-0.098968
2019-12-26,1.000000,1.000000,0.992697,1.000000,0.268480,0.884593,0.394338,0.981619,0.970343,0.877765,-0.158416,-0.213673,Negative,0.850076,101.0,0.109441
2019-12-27,0.997777,0.997777,1.000000,0.999038,0.246607,0.905786,0.399841,0.978268,1.000000,0.916169,-0.238532,-0.291217,Negative,0.853422,109.0,-0.080116


In [144]:
data.to_csv(f"../Data/Global_Data_{ticker}.csv")