In [12]:
import pandas as pd
from scipy.stats import ttest_ind
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [13]:
# Load the merged CertifiedNews dataset into `df`.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact directory layout exists.
merged_csv_path = "/Users/merterol/Desktop/UZH/CompLing:CompSci/CompSci/Sem2/ESC403/ESC403_Project/DataScienceProject/visuals/CertifiedNews_df_merged.csv"
df = pd.read_csv(merged_csv_path)

Event date ranges, with margins before and after each event

In [16]:
# Parse the date column so it can be compared against Timestamp bounds.
df["Date"] = pd.to_datetime(df["Date"])

# Inclusive (start, end) date window for each event of interest.
events = {
    "COVID-19 Pandemic": ("2020-03-01", "2021-03-01"),
    "Ukraine Conflict": ("2022-01-15", "2023-09-01"),
    "Biden Election Period": ("2020-11-01", "2022-01-01"),
    "Trump Election Period": ("2016-11-01", "2020-12-31"),
    "GTA 5 Release": ("2013-09-01", "2013-10-30"),
    "Snowden Leak": ("2013-06-01", "2013-07-01")
}

# Add one boolean column per event, named after the event, that is True for
# rows whose Date falls inside that event's window.
for event, (start, end) in events.items():
    start, end = pd.to_datetime(start), pd.to_datetime(end)
    # Series.between is vectorised and includes both endpoints by default,
    # i.e. it evaluates start <= Date <= end for every row at once.
    df[event] = df["Date"].between(start, end)

Getting Sentiment Data

In [17]:
# Map each event name to (mean Polarity inside the event window,
# mean Polarity of all remaining rows). An event whose window matches no
# rows yields NaN for the first element.
event_sentiment_summary = {}
for event in events:
    in_event = df[event]  # boolean flag column created for this event
    event_data, non_event_data = df[in_event], df[~in_event]
    event_sentiment_summary[event] = (
        event_data["Polarity"].mean(),
        non_event_data["Polarity"].mean(),
    )

# Print one line per event with both averages formatted to three decimals.
for event, averages in event_sentiment_summary.items():
    event_avg, non_event_avg = averages
    print(f"{event}: During Event Avg Polarity = {event_avg:.3f}, Outside Event Avg Polarity = {non_event_avg:.3f}")


COVID-19 Pandemic: During Event Avg Polarity = 0.209, Outside Event Avg Polarity = 0.136
Ukraine Conflict: During Event Avg Polarity = 0.164, Outside Event Avg Polarity = 0.154
Biden Election Period: During Event Avg Polarity = 0.155, Outside Event Avg Polarity = 0.157
Trump Election Period: During Event Avg Polarity = 0.188, Outside Event Avg Polarity = 0.147
GTA 5 Release: During Event Avg Polarity = nan, Outside Event Avg Polarity = 0.155
Snowden Leak: During Event Avg Polarity = nan, Outside Event Avg Polarity = 0.155


#### Visualizing Sentiment Data

In [18]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import Span, Label, Legend, LegendItem
import pandas as pd

# Configure Bokeh to render figures inline in the notebook when show() is called.
output_notebook()

In [19]:
# Event windows for plotting: name -> (start date, end date,
# colour of the start marker, colour of the end marker).
events = {
    "COVID-19 Pandemic": ("2020-03-01", "2021-03-01", "#1f77b4", "#aec7e8"),
    "Ukraine Conflict": ("2022-01-15", "2023-09-01", "#ff7f0e", "#ffbb78"),
    "Biden Election Period": ("2020-11-01", "2022-01-01", "#2ca02c", "#98df8a"),
    "Trump Election Period": ("2016-11-01", "2020-12-31", "#d62728", "#ff9896"),
    "GTA 5 Release": ("2013-09-01", "2013-10-30", "#9467bd", "#c5b0d5"),
    "Snowden Leak": ("2013-06-01", "2013-07-01", "#8c564b", "#c49c94")
}

# Replace the date strings with pandas Timestamps; colours pass through unchanged.
events = {
    name: (pd.to_datetime(window_start), pd.to_datetime(window_end), color_start, color_end)
    for name, (window_start, window_end, color_start, color_end) in events.items()
}



In [22]:
# Create a Bokeh plot
p = figure(title="Sentiment Polarity Over Time with Event Markers",
            x_axis_label="Date", y_axis_label="Polarity",
            x_axis_type="datetime", width=1400, height=600,
            tools="pan,wheel_zoom,box_zoom,reset,save")

# Add a line renderer for daily polarity
p.line(df["Date"], df["Polarity"], legend_label="Daily Polarity", line_width=2, color="blue", alpha=0.5)

# Vertical extent shared by every event marker, computed once.
# Series.min()/max() skip NaN, whereas the built-in min()/max() can return
# order-dependent results when the column contains NaN.
polarity_min = df["Polarity"].min()
polarity_max = df["Polarity"].max()

# Draw a full-height vertical line at the start and at the end of each event,
# each with its own legend entry.
for event, (start_date, end_date, start_color, end_color) in events.items():
    # Harmless if the values are already Timestamps; keeps the cell usable
    # when the dates are still strings.
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)

    p.line([start_date, start_date], [polarity_min, polarity_max],
            line_width=2, color=start_color, legend_label=f"{event} Start")
    p.line([end_date, end_date], [polarity_min, polarity_max],
            line_width=2, color=end_color, legend_label=f"{event} End")

# Show the plot
show(p)

## ARIMA

In [23]:
# Reload the merged dataset, parse the Date column and use it as the index.
# NOTE(review): this path contains a "df_csv" directory that the notebook's
# first load of the same file name does not — confirm which location is current.
df = (
    pd.read_csv("/Users/merterol/Desktop/UZH/CompLing:CompSci/CompSci/Sem2/ESC403/ESC403_Project/DataScienceProject/visuals/df_csv/CertifiedNews_df_merged.csv")
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
    .set_index("Date")
)

# Assuming "Polarity" is the column with sentiment scores
ts = df["Polarity"]