In [140]:
import pandas as pd
import numpy as np
import json
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [141]:
# Corpora that can be analysed; the chosen name is later used as the directory
# that holds that corpus's CSV files.
data_sources = [
    "papers",
    "reddit",
    "news",
]

# N-gram similarity variants, used later as the suffix of the CSV file names.
similarity_types = [f"{n}_gram" for n in (1, 2, 3)]

In [142]:
# Positions of the corpus and n-gram variant to analyse in this run
# (indices into `data_sources` and `similarity_types` respectively).
data_source_index, similarity_type_index = 1, 0

In [143]:
# Resolve the selected indices to the actual corpus name and similarity variant.
data_source, similarity_type = (
    data_sources[data_source_index],
    similarity_types[similarity_type_index],
)

In [144]:
# Both inputs live in a directory named after the selected corpus and carry the
# similarity variant as a file-name suffix.
monthly_csv_path = os.path.join(
    data_source, f"monthly_similarities_mean_{similarity_type}.csv"
)
results_csv_path = os.path.join(
    data_source, f"significant_results_{similarity_type}.csv"
)

# Monthly mean similarity values (one row per year/month).
df = pd.read_csv(monthly_csv_path)
# Terms flagged as significant, with their test statistics.
results_df = pd.read_csv(results_csv_path)

In [145]:
# Force the calendar columns to plain integers so that string concatenation
# later yields e.g. "2022-11" rather than "2022.0-11.0".
for calendar_col in ("year", "month"):
    df[calendar_col] = df[calendar_col].astype(int)

In [146]:
# Overview plot: the raw "similarity" series against time.
# The time axis is a timestamp built from the "year" and "month" columns.
year_month = df["year"].astype(str) + "-" + df["month"].astype(str)
df["date"] = pd.to_datetime(year_month)

overview_fig = px.line(
    df,
    x="date",
    y="similarity",
    title=f"{similarity_type} vs Date (Source: {data_source})",
    labels={"date": "Date", "similarity": "Similarity"},
)

# Dashed red marker at the ChatGPT release date.
overview_fig.add_vline(
    x="2022-11-01",
    line_dash="dash",
    line_color="red",
)

# Label for the marker. yref="paper" makes y a fraction of the plot height, so
# y=-0.1 places the text just below the plotting area.
overview_fig.add_annotation(
    x="2022-11-01",
    y=-0.1,
    text="ChatGPT Release",
    showarrow=False,
    xanchor="center",
    yanchor="top",
    yref="paper",
)

overview_fig.show()

In [147]:
# Keep only the rows whose term is "POST", restricted to the columns needed for
# plotting. To include the "ONSET" terms as well, widen the list passed to
# `.isin` to ["POST", "ONSET"].
is_selected_term = results_df["term"].isin(["POST"])
important_columns = results_df.loc[is_selected_term, ["PC", "term", "statistic"]]
important_columns

Unnamed: 0,PC,term,statistic
3,similarity,POST,-2.494344


In [148]:
# For every significant term in `important_columns`, plot the column named in
# its "PC" field against time with Plotly Express, and mark the ChatGPT release
# (November 2022) with a dashed vertical line.
#
# The plotted series is a 3-month rolling mean. The smoothing is applied to a
# per-plot copy so that `df` keeps its raw values: re-running this cell, or
# having several terms refer to the same column, never smooths data that was
# already smoothed.

# The time axis is identical for every plot, so build it once up front.
df["date"] = pd.to_datetime(df["year"].astype(str) + "-" + df["month"].astype(str))

for _, row in important_columns.iterrows():
    column = row["PC"]
    term = row["term"]
    statistic = row["statistic"]

    # The sign of the test statistic gives the direction of the change.
    trend = "higher" if statistic > 0 else "lower"
    # "ONSET" terms are described as a shock; every other term as lasting.
    type_of_trend = "shock" if term == "ONSET" else "lasting"

    # Smooth on a copy. The first two rows become NaN because the window is
    # not yet full, and Plotly simply leaves them out of the line.
    smoothed_df = df.copy()
    smoothed_df[column] = df[column].rolling(window=3).mean()

    fig = px.line(
        smoothed_df,
        x="date",
        y=column,
        title=f"{similarity_type} similarity vs Date showing a {type_of_trend} {trend} trend (Source: {data_source}) ",
        labels={"date": "Date", column: f"{column}"},
    )
    # Dashed red marker at the ChatGPT release date.
    fig.add_vline(
        x="2022-11-01",
        line_dash="dash",
        line_color="red",
    )
    # yref="paper" makes y a fraction of the plot height, so y=-0.1 places the
    # label just below the plotting area.
    fig.add_annotation(
        x="2022-11-01",
        y=-0.1,
        text="ChatGPT Release",
        showarrow=False,
        xanchor="center",
        yanchor="top",
        yref="paper",
    )
    fig.show()