In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

df = pd.read_csv("sentiment_practice_output.csv")
df['date'] = pd.to_datetime(df['date'], dayfirst=True)
df['sentiment_score'] = df['sentiment'].map({'Positive': 1, 'Neutral': 0, 'Negative': -1})
df['month'] = df['date'].dt.to_period('M')

# Monthly average sentiment
monthly_avg = df.groupby('month')['sentiment_score'].mean().reset_index()
monthly_avg['month'] = monthly_avg['month'].dt.to_timestamp()
monthly_avg['lag_1_sentiment'] = monthly_avg['sentiment_score'].shift(1)
monthly_avg.dropna(inplace=True)

# Monthly sentiment proportions
monthly_sentiment_counts = df.groupby(['month', 'sentiment']).size().unstack(fill_value=0)
monthly_sentiment_counts['total'] = monthly_sentiment_counts.sum(axis=1)
monthly_sentiment_counts['positive_ratio'] = monthly_sentiment_counts.get('Positive', 0) / monthly_sentiment_counts['total']
monthly_sentiment_counts['negative_ratio'] = monthly_sentiment_counts.get('Negative', 0) / monthly_sentiment_counts['total']
monthly_sentiment_counts = monthly_sentiment_counts[['positive_ratio', 'negative_ratio']].reset_index()
monthly_sentiment_counts['month'] = monthly_sentiment_counts['month'].dt.to_timestamp()

# Merge with main table
monthly_df = pd.merge(monthly_avg, monthly_sentiment_counts, on='month', how='inner')
monthly_df['extreme_ratio'] = monthly_df['positive_ratio'] + monthly_df['negative_ratio']

# Add sentiment standard deviation
monthly_std = df.groupby('month')['sentiment_score'].std().reset_index(name='sentiment_std')
monthly_std['month'] = monthly_std['month'].dt.to_timestamp()
monthly_df = pd.merge(monthly_df, monthly_std, on='month', how='inner')

# Final features and target
X = monthly_df[['lag_1_sentiment', 'extreme_ratio', 'sentiment_std']]
y = monthly_df['sentiment_score']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate model
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
coefficients = dict(zip(X.columns, model.coef_))

print("Model Evaluation:")
print("RMSE:", rmse)
print("R²:", r2)
print("Coefficients:", coefficients)


Model Evaluation:
RMSE: 0.00910658176934058
R²: 0.9557954757698793
Coefficients: {'lag_1_sentiment': 0.004782872722235317, 'extreme_ratio': 2.6310870155717954, 'sentiment_std': -2.807715727848879}
