In [6]:
import pandas as pd
import numpy as np
import json
from collections import defaultdict
import plotly.express as px
import matplotlib.pyplot as plt

In [7]:
def compute_similarities_based_on_features(features_df, year_col, month_col, day_col):
    """Aggregate per-row features into weekly variance and mean.

    Rows are bucketed by the value in ``year_col`` together with the ISO week
    number of the date assembled from ``year_col``/``month_col``/``day_col``.
    For every column whose name starts with ``voc_``, ``lex_`` or ``liwc_`` the
    variance (``Series.var``) and the mean inside each bucket are computed.

    Parameters
    ----------
    features_df : pandas.DataFrame
        Input rows. The three date columns must be castable to ``int``.
        The frame itself is not modified.
    year_col, month_col, day_col : str
        Names of the columns holding the year, month and day of each row.

    Returns
    -------
    pandas.DataFrame
        One row per (year, week) bucket with the columns ``year``, ``week`` and,
        for each feature column ``c``, ``similarity_c`` (the within-bucket
        variance) followed by ``mean_c``. When either statistic cannot be
        computed for a bucket (``TypeError``/``ValueError``, e.g. text values)
        both entries are NaN.

    Notes
    -----
    The ISO week is paired with the value of ``year_col``, not with the ISO
    year. Days at a year boundary whose ISO week belongs to the neighbouring
    year (e.g. 2018-12-31, which is ISO week 1) therefore share a bucket with
    the same week number at the other end of that calendar year.
    """
    # Assemble the dates from their integer components rather than parsing
    # "Y-M-D" strings; nothing is written back to the caller's frame.
    dates = pd.to_datetime(
        pd.DataFrame(
            {
                "year": features_df[year_col].astype(int),
                "month": features_df[month_col].astype(int),
                "day": features_df[day_col].astype(int),
            }
        )
    )
    iso_week = dates.dt.isocalendar().week.rename("week")

    feature_columns = [
        col
        for col in features_df.columns
        if col.startswith(("voc_", "lex_", "liwc_"))
    ]

    years = []
    weeks = []
    variances = defaultdict(list)
    means = defaultdict(list)

    # Both key Series carry features_df's index, so they line up row by row.
    grouped = features_df.groupby([features_df[year_col], iso_week])
    for (year, week), group in grouped:
        for col in feature_columns:
            # Compute both statistics before recording either of them, so the
            # two lists cannot get out of step when only one computation fails.
            try:
                variance = group[col].var()
                mean = group[col].mean()
            except (TypeError, ValueError):
                variance = mean = np.nan
            variances[col].append(variance)
            means[col].append(mean)
        years.append(year)
        weeks.append(week)

    # Build the result in one go; the insertion order fixes the column order.
    result = {"year": years, "week": weeks}
    for col in feature_columns:
        result[f"similarity_{col}"] = variances[col]
        result[f"mean_{col}"] = means[col]

    return pd.DataFrame(result)

In [8]:
def clean_similarities(data, min_year=2018):
    """Tidy a weekly statistics table for downstream merging.

    Steps, in order:
    1. cast ``year`` and ``week`` to plain integers (via float, so values such
       as ``2018.0`` are accepted);
    2. keep only rows with ``year >= min_year``;
    3. replace NaN in numeric columns with that column's mean, computed on the
       rows that survived step 2;
    4. sort by ``year`` then ``week`` and renumber the index from 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with at least ``year`` and ``week`` columns. It is not modified.
    min_year : int, default 2018
        Earliest year to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``year`` and ``week`` as the leading columns. Columns
        that are entirely NaN stay NaN, and non-numeric columns are left as is.
    """
    # assign() returns a new frame, so the caller's table keeps its dtypes.
    cleaned = data.assign(
        year=data["year"].astype(float).astype(int),
        week=data["week"].astype(float).astype(int),
    )

    cleaned = cleaned[cleaned["year"] >= min_year]

    # numeric_only avoids a TypeError if a non-numeric column is present.
    cleaned = cleaned.fillna(cleaned.mean(numeric_only=True))

    # Sort by year and week; the index round trip also moves both key columns
    # to the front and leaves a fresh 0..n-1 index.
    return cleaned.set_index(["year", "week"]).sort_index().reset_index()

In [9]:
# Load the feature tables for the three sources (papers, news, reddit).
# Later cells read `year`/`month` columns and voc_/lex_/liwc_ feature columns
# from these frames — presumably they are already present in the CSVs.
papers_features, news_features, reddit_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        "papers/cl_cv_papers_features.csv",
        "news/news_features.csv",
        "reddit/reddit_features.csv",
    )
)

  papers_features = pd.read_csv("papers/cl_cv_papers_features.csv")
  news_features = pd.read_csv("news/news_features.csv")


In [10]:
# Parse `final_date` and derive the day-of-month column that the later
# dropna / weekly-aggregation cells read alongside `year` and `month`.
papers_features = papers_features.assign(
    final_date=lambda frame: pd.to_datetime(frame["final_date"]),
    day=lambda frame: frame["final_date"].dt.day,
)

In [11]:
# Parse `created_utc` and derive the day-of-month column that the later
# dropna / weekly-aggregation cells read alongside `year` and `month`.
reddit_features = reddit_features.assign(
    created_utc=lambda frame: pd.to_datetime(frame["created_utc"]),
    day=lambda frame: frame["created_utc"].dt.day,
)

In [12]:
# Discard rows missing any date component: the weekly aggregation casts
# year/month/day to int, which fails on NaN.
papers_features, news_features, reddit_features = (
    frame.dropna(subset=["year", "month", "day"])
    for frame in (papers_features, news_features, reddit_features)
)

In [None]:
# Weekly variance ("similarity_*") and mean of every feature, one table per source.
papers_similarities, news_similarities, reddit_similarities = (
    compute_similarities_based_on_features(features, "year", "month", "day")
    for features in (papers_features, news_features, reddit_features)
)

In [14]:
# Run the shared clean-up on each weekly table, in papers / news / reddit order.
papers_clean_similarities, news_clean_similarities, reddit_clean_similarities = map(
    clean_similarities,
    (papers_similarities, news_similarities, reddit_similarities),
)

In [15]:
# Weekly share of papers flagged by the AI-detection score.
# A row is flagged when its `ai_written` value is BELOW the threshold.
# NOTE(review): confirm that a lower score is the "AI-written" direction for
# this detector — it cannot be verified from this notebook.
papers_threshold = 0.82
papers_ai_detection_df = (
    pd.read_csv("papers/cl_cv_papers_ai_written.csv")
    .assign(
        final_date=lambda frame: pd.to_datetime(frame["final_date"]),
        # ISO week number, keyed together with the CSV's own `year` column.
        week=lambda frame: frame["final_date"].dt.isocalendar().week,
        ai_written=lambda frame: frame["ai_written"] < papers_threshold,
    )
    .loc[lambda frame: frame["year"] >= 2018]
    # The mean of a boolean column is the fraction of flagged rows in the week.
    .groupby(["year", "week"])["ai_written"]
    .mean()
    .reset_index()
)

  papers_ai_detection_df = pd.read_csv("papers/cl_cv_papers_ai_written.csv")


In [16]:
# Weekly share of news items flagged by the AI-detection score.
# A row is flagged when its `ai_written` value is BELOW the threshold.
# NOTE(review): confirm that a lower score is the "AI-written" direction for
# this detector — it cannot be verified from this notebook.
news_threshold = 0.83
news_ai_detection_df = (
    pd.read_csv("news/news_ai_written.csv")
    .assign(
        # Build the date from the separate year / month / day columns.
        date=lambda frame: pd.to_datetime(
            frame["year"].astype(int).astype(str)
            + "-"
            + frame["month"].astype(int).astype(str)
            + "-"
            + frame["day"].astype(int).astype(str)
        ),
        # ISO week number, keyed together with the CSV's own `year` column.
        week=lambda frame: frame["date"].dt.isocalendar().week,
        ai_written=lambda frame: frame["ai_written"] < news_threshold,
    )
    .loc[lambda frame: frame["year"] >= 2018]
    # The mean of a boolean column is the fraction of flagged rows in the week.
    .groupby(["year", "week"])["ai_written"]
    .mean()
    .reset_index()
)

In [17]:
# Weekly share of reddit rows flagged by the AI-detection score.
# A row is flagged when its `ai_written` value is BELOW the threshold.
# NOTE(review): confirm that a lower score is the "AI-written" direction for
# this detector — it cannot be verified from this notebook.
reddit_threshold = 0.87
reddit_ai_detection_df = (
    pd.read_csv("reddit/reddit_ai_written.csv")
    .assign(
        created_utc=lambda frame: pd.to_datetime(frame["created_utc"]),
        # ISO week number, keyed together with the CSV's own `year` column.
        week=lambda frame: frame["created_utc"].dt.isocalendar().week,
        ai_written=lambda frame: frame["ai_written"] < reddit_threshold,
    )
    .loc[lambda frame: frame["year"] >= 2018]
    # The mean of a boolean column is the fraction of flagged rows in the week.
    .groupby(["year", "week"])["ai_written"]
    .mean()
    .reset_index()
)

In [18]:
# Inspect the cleaned weekly papers table: `year`, `week`, then a
# `similarity_*` (weekly variance) and `mean_*` column per feature.
papers_clean_similarities

Unnamed: 0,year,week,similarity_lex_avg_word_length,mean_lex_avg_word_length,similarity_lex_avg_sent_length_by_char,mean_lex_avg_sent_length_by_char,similarity_lex_avg_sent_length_by_word,mean_lex_avg_sent_length_by_word,similarity_lex_special_char_count,mean_lex_special_char_count,...,similarity_liwc_relig,mean_liwc_relig,similarity_liwc_death,mean_liwc_death,similarity_liwc_assent,mean_liwc_assent,similarity_liwc_nonfl,mean_liwc_nonfl,similarity_liwc_filler,mean_liwc_filler
0,2018,1,0.172561,7.307538,832.039789,150.960982,15.976867,21.958084,0.001641,0.154852,...,0.000012,0.002445,1.169593e-06,0.000170,7.054309e-07,0.000133,0.000004,0.000719,0.000003,0.000391
1,2018,2,0.189494,7.237418,1113.009707,155.331594,23.177587,22.753178,0.002896,0.157301,...,0.000009,0.001578,2.244631e-06,0.000245,3.844527e-07,0.000071,0.000002,0.000475,0.000001,0.000237
2,2018,3,0.197138,7.314943,1160.273049,155.830053,20.561630,22.704009,0.001454,0.154850,...,0.000010,0.001945,9.259738e-07,0.000175,3.980952e-06,0.000260,0.000007,0.000692,0.000002,0.000336
3,2018,4,0.251596,7.170824,784.735574,151.325989,14.065721,22.161425,0.001470,0.150903,...,0.000011,0.001890,3.276712e-06,0.000223,1.341764e-06,0.000193,0.000007,0.000776,0.000004,0.000375
4,2018,5,0.206723,7.183508,1376.853383,154.641491,29.922524,22.794156,0.001282,0.147586,...,0.000009,0.001637,4.502314e-07,0.000093,3.888846e-07,0.000087,0.000005,0.000817,0.000003,0.000384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355,2024,42,0.196291,7.599473,772.249002,163.650653,14.862925,22.554869,0.002086,0.177594,...,0.000025,0.003952,1.541024e-06,0.000087,8.411598e-07,0.000093,0.000003,0.000531,0.000005,0.000602
356,2024,43,0.202893,7.554698,1704.550412,165.482817,26.658731,22.913977,0.002349,0.178908,...,0.000018,0.003318,5.760000e-07,0.000073,1.348313e-06,0.000113,0.000004,0.000678,0.000004,0.000602
357,2024,44,0.194812,7.546029,1098.058386,163.304475,19.292829,22.644112,0.002267,0.176944,...,0.000017,0.003029,5.525907e-07,0.000096,2.888332e-06,0.000215,0.000005,0.000619,0.000005,0.000695
358,2024,45,0.223838,7.609850,10753.584277,172.207578,180.283435,23.625760,0.002560,0.179222,...,0.000020,0.003352,5.634324e-07,0.000078,7.167614e-06,0.000233,0.000003,0.000500,0.000004,0.000506


In [19]:
# Only the weekly variances ("similarity_*") are used from here on, so remove
# every `mean_*` column from the three tables.
papers_clean_similarities, news_clean_similarities, reddit_clean_similarities = (
    weekly.drop(
        columns=[name for name in weekly.columns if name.startswith("mean_")]
    )
    for weekly in (
        papers_clean_similarities,
        news_clean_similarities,
        reddit_clean_similarities,
    )
)

In [21]:
# Weekly-variance columns that are later standardised and averaged into the
# `complexity` score. The order is kept because later cells rely on the list.
columns_about_complexity = [
    f"similarity_{feature}"
    for feature in (
        "voc_simpson_index",
        "voc_shannon_entropy",
        "lex_avg_dependency_link_length",
        "voc_type_token_ratio",
        "voc_hapax_legomena",
    )
]

In [24]:
# Narrow each weekly table down to the merge keys (`year`, `week`) plus the
# complexity-related columns.
papers_clean_similarities, news_clean_similarities, reddit_clean_similarities = (
    weekly.loc[:, ["year", "week", *columns_about_complexity]]
    for weekly in (
        papers_clean_similarities,
        news_clean_similarities,
        reddit_clean_similarities,
    )
)

In [25]:
# Attach the weekly AI-written share to the weekly complexity table of each
# source. A left join keeps every week of the complexity table; weeks with no
# detection rows end up with NaN in `ai_written`.
papers_merged_df = papers_clean_similarities.merge(
    papers_ai_detection_df, how="left", on=["year", "week"], suffixes=("", "_ai")
)

news_merged_df = news_clean_similarities.merge(
    news_ai_detection_df, how="left", on=["year", "week"], suffixes=("", "_ai")
)

reddit_merged_df = reddit_clean_similarities.merge(
    reddit_ai_detection_df, how="left", on=["year", "week"], suffixes=("", "_ai")
)

In [28]:
from sklearn.preprocessing import StandardScaler

In [29]:
# Standardise every complexity column and `ai_written` within each source,
# then average the standardised complexity columns into a single `complexity`
# score per week. The frames are updated in place.
for df in [papers_merged_df, news_merged_df, reddit_merged_df]:
    for col in columns_about_complexity + ["ai_written"]:
        # A fresh scaler per column: each column is standardised on its own.
        scaler = StandardScaler()
        # fit_transform returns an (n, 1) array; flatten it for the column.
        df[col] = scaler.fit_transform(df[[col]]).ravel()

    # Average exactly the columns that were scaled above, so the two lists
    # cannot drift apart. np.mean propagates NaN: a week that lacks any one
    # component gets a NaN complexity.
    df["complexity"] = np.mean(
        [df[col] for col in columns_about_complexity],
        axis=0,
    )

In [34]:
# Persist the merged weekly tables next to their source data, without the index.
for merged_frame, csv_path in (
    (papers_merged_df, "papers/papers_merged_weekly_df.csv"),
    (news_merged_df, "news/news_merged_weekly_df.csv"),
    (reddit_merged_df, "reddit/reddit_merged_weekly_df.csv"),
):
    merged_frame.to_csv(csv_path, index=False)