In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the combined analysis table and add a log-transformed score;
# the +1 keeps a score of 0 defined under the logarithm.
df = pd.read_csv("../data/combined_for_analysis_sensationalism_jargon_categories_domain_labels.csv")
df["log_score"] = np.log(df["score"] + 1)

# Drop every row that has a missing value in any column.
df.dropna(inplace=True)

# Collapse the is_top_domain_* flag columns (presumably boolean -- they are used
# directly as row masks) into a single label column. Rows matching no flag stay
# "other"; when several flags are set, the last one in this tuple wins.
top_domain_flags = (
    "is_top_domain_repo",
    "is_top_domain_news",
    "is_top_domain_scientific",
    "is_top_domain_social_media",
    "is_top_domain_scam",
)
df["top_domain"] = "other"
for flag_column in top_domain_flags:
    # e.g. "is_top_domain_social_media" -> "social media"
    domain_label = flag_column.split("_", 3)[3].replace("_", " ")
    flagged_rows = df[flag_column]
    df.loc[flagged_rows, "top_domain"] = domain_label
df.info()
df

In [None]:
# Fixed orderings shared by the plots below so that colours and axis positions
# stay comparable from figure to figure.

# Hue order for the `label_voting_manual` domain labels.
domain_order_for_hue = [
    "repo",
    "scientific",
    "news",
    "social_media",
    "scam",
]

# Subset of `link_flair_text` values used for the per-flair plots.
selected_categories = [
    "Medicine",
    "Engineering",
    "Economics",
    "Social Science",
    "Computer Science",
    "Epidemiology",
]

# Order of the `top_category` values.
top_categories_order = [
    "Life Sciences",
    "Social Sciences",
    "Environmental Sciences",
    "Engineering & Technology",
    "Physical Sciences",
]

# Jargon over different domains and categories

In [None]:
# Mean jargon proportion per year, split by domain label.
plt.figure(figsize=(12, 6))

# The interval is requested explicitly so the error bars really are the 99%
# confidence interval that the legend entry below advertises (without
# `errorbar`, seaborn falls back to its default interval).
sns_plot = sns.barplot(
    data=df,
    x="year",
    y="jargon_proportion",
    hue="label_voting_manual",
    hue_order=domain_order_for_hue,
    errorbar=("ci", 99),
)
plt.title("Jargon Proportion over Different Domains over time")
plt.xlabel("Year")
plt.ylabel("Jargon Proportion")
# Extend the automatic hue legend with a proxy line that explains the error bars.
plt.legend(title="Domain Category", loc="upper right", handles = sns_plot.get_legend_handles_labels()[0] + [plt.Line2D([], [], color="black", label="Confidence Interval (99%)")])
# add line for the jargon proportion trends
#sns.lineplot(data=df, x="year", y="jargon_proportion", hue="label_voting_manual", hue_order=domain_order_for_hue, legend=False)
plt.show()
# TODO regress jargon on the domain

In [None]:
# How does jargon differ between domain labels within each top-level category?
# Bars show the mean, error bars the 99% confidence interval.
plt.figure(figsize=(12, 6))  # wide figure so the grouped bars stay readable
sns_plot = sns.barplot(
    data=df,
    x="top_category",
    y="jargon_proportion",
    hue="label_voting_manual",
    hue_order=["repo", "scientific", "news", "social_media", "scam"],
    order=top_categories_order,
    errorbar=("ci", 99),
)
plt.xticks(rotation=45)
handles = sns_plot.get_legend_handles_labels()
# Proxy line so the legend also explains the error bars (99% confidence interval).
ci_handle = plt.Line2D([], [], color="black", label="Confidence Interval (99%)")
plt.legend(title="Domain Category", loc="upper right", handles=handles[0] + [ci_handle])
plt.title("Jargon Proportion over Different Categories and Domains")
plt.xlabel("Category")
plt.ylabel("Jargon Proportion")

In [None]:

# Mean jargon proportion per selected flair, split by domain label.
plt.figure(figsize=(12, 6))
# `errorbar` is set explicitly so the bars match the "99%" legend entry below
# (otherwise seaborn draws its default interval).
sns_plot = sns.barplot(
    data=df[df["link_flair_text"].isin(selected_categories)],
    x="link_flair_text",
    y="jargon_proportion",
    hue="label_voting_manual",
    hue_order=domain_order_for_hue,
    order=selected_categories,
    errorbar=("ci", 99),
)
plt.legend(title="Domain Category", loc="upper right", handles = sns_plot.get_legend_handles_labels()[0] + [plt.Line2D([], [], color="black", label="Confidence Interval (99%)")])
plt.xticks(rotation=45)

In [None]:

# Mean score per selected flair, split by domain label.
plt.figure(figsize=(12, 6))
# `errorbar` is set explicitly so the bars match the "99%" legend entry below
# (otherwise seaborn draws its default interval).
sns_plot = sns.barplot(
    data=df[df["link_flair_text"].isin(selected_categories)],
    x="link_flair_text",
    y="score",
    hue="label_voting_manual",
    hue_order=domain_order_for_hue,
    order=selected_categories,
    errorbar=("ci", 99),
)
plt.legend(title="Domain Category", loc="upper right", handles = sns_plot.get_legend_handles_labels()[0] + [plt.Line2D([], [], color="black", label="Confidence Interval (99%)")])
plt.xticks(rotation=45)
# TODO does this also translate to the score?

In [None]:
# Jargon proportion per year, one line per flair listed in `selected_categories`.
sns_plot = sns.lineplot(
    data=df,
    x="year",
    y="jargon_proportion",
    hue="link_flair_text",
    hue_order=selected_categories,
    palette ="rocket",
)
# Anchor the legend just outside the right edge of the axes.
sns_plot.legend(loc='upper left', bbox_to_anchor=(1.05, .95), borderaxespad=0.)


# Jargon Development over the years

In [None]:
# Name of the qualitative palette; it is referenced again by later cells.
pal_id = "Set2"

# Jargon proportion per year, one line per top-level category.
sns_plot = sns.lineplot(
    data=df,
    x="year",
    y="jargon_proportion",
    hue="top_category",
)

# Anchor the legend just outside the right edge of the axes.
sns_plot.legend(loc='upper left', bbox_to_anchor=(1.05, .95), borderaxespad=0.)
plt.title("Jargon Proportion in Scientific Domains over years")
plt.xlabel("Year")
plt.ylabel("Jargon Proportion")




In [None]:
def flairs_by_mean_jargon(data):
    """Return the flairs present in `data`, ordered by descending mean jargon proportion."""
    mean_jargon = data.groupby("link_flair_text")["jargon_proportion"].mean()
    return list(mean_jargon.sort_values(ascending=False).index)


def plot_table(data, color, **kwargs):
    """Draw one facet: yearly jargon proportion with one line per flair.

    Meant to be passed to `FacetGrid.map_dataframe`, which supplies the facet's
    rows as `data`. `color` is accepted for that calling convention but unused;
    any extra keyword arguments are forwarded to `sns.lineplot`.
    """
    flair_order = flairs_by_mean_jargon(data)
    top_category = data["top_category"].iloc[0]
    palette = sns.color_palette(pal_id, len(flair_order))
    ax = sns.lineplot(data=data, x="year", y="jargon_proportion", hue="link_flair_text", **kwargs, legend=False, palette=palette, hue_order=flair_order)
    # Per-facet legend built from proxy lines, in the same order as the colours.
    ax.legend(title=top_category, loc="upper right", handles=[plt.Line2D([], [], color=flair_color, label=flair) for flair_color, flair in zip(palette, flair_order)])
    ax.set_title(top_category)
    ax.set_ylabel("Jargon Proportion")


g = sns.FacetGrid(df, col="top_category", col_wrap=2, height=4, sharey=True)
g.map_dataframe(plot_table)
unique_top_categories = df["top_category"].unique()

# Combined legend for the whole figure: one white (invisible) sub-header line
# per top-level category, followed by that category's flairs.
handles = []
for cat in unique_top_categories:
    # Same ordering as inside plot_table, so every colour shown here matches
    # the line drawn in the corresponding facet.
    flairs = flairs_by_mean_jargon(df[df["top_category"] == cat])
    palette = sns.color_palette(pal_id, len(flairs))
    handles.append(plt.Line2D([], [], color="white", label=f"\n{cat}"))
    handles.extend(
        plt.Line2D([], [], color=flair_color, label=flair)
        for flair_color, flair in zip(palette, flairs)
    )

# Attach the combined legend to the figure rather than to the current axes, so
# it does not replace the per-facet legend of the last facet. It is anchored in
# the margin freed by subplots_adjust below.
g.figure.subplots_adjust(right=0.85)
g.figure.legend(handles=handles, title="Reddit Categories (Grouped by Field)", loc="center left", bbox_to_anchor=(0.86, 0.5), ncol=2)

In [None]:
# genetics over the years


In [None]:
# Jargon proportion by month, one line per year.
sns_plot = sns.lineplot(
    data=df,
    x="month",
    y="jargon_proportion",
    hue="year",
    palette="magma",
)

In [None]:
# Jargon proportion centred on its yearly mean, so that the monthly pattern can
# be compared between years.
yearly_averages = df.groupby(by="year")["jargon_proportion"].mean()
# Look up each row's yearly mean with a vectorised map instead of a row-wise
# apply; the values are the same, computed in one pass.
df["jargon_norm_year"] = df["jargon_proportion"] - df["year"].map(yearly_averages)

In [None]:
# Year-centred jargon proportion by month, one line per year.
sns_plot = sns.lineplot(
    data=df,
    x="month",
    y="jargon_norm_year",
    hue="year",
    palette="magma",
)


In [None]:
# Score by month, one line per year.
sns_plot = sns.lineplot(
    data=df,
    x="month",
    y="score",
    hue="year",
    palette="magma",
)

In [None]:
# Score centred on its yearly mean, so that the monthly pattern can be compared
# between years.
yearly_averages = df.groupby(by="year")["score"].mean()
# Look up each row's yearly mean with a vectorised map instead of a row-wise
# apply; the values are the same, computed in one pass.
df["score_year_norm"] = df["score"] - df["year"].map(yearly_averages)

In [None]:
# Year-centred score by month, one line per year.
sns.lineplot(
    data=df,
    x="month",
    y="score_year_norm",
    hue="year",
    palette="magma",
)


In [None]:
# Score per year. No `palette` is passed: without a `hue` variable seaborn
# ignores it and only emits a warning.
sns.lineplot(data=df, x="year", y="score")


In [None]:
# First-of-month timestamp for every row, assembled directly from the numeric
# year/month columns (vectorised; no per-row string formatting and parsing).
df['year_month'] = pd.to_datetime(df[['year', 'month']].assign(day=1))

# Sort the DataFrame chronologically
df = df.sort_values('year_month')

# Linear trend plus per-month mean of the score on one explicitly sized axes.
# regplot is used instead of the figure-level lmplot, which would open its own
# figure and leave the sized one empty.
fig, ax = plt.subplots(figsize=(12, 6))
sns.regplot(data=df, x="month", y="score", scatter=False, ax=ax)
# No `palette` here: without a `hue` variable seaborn ignores it and warns.
sns.lineplot(data=df, x="month", y="score", ax=ax)

ax.set_title('Score Over Months')
ax.set_xlabel('Month')
ax.set_ylabel('Score')
plt.show()

# Jargon Proportion over different categories

In [None]:
# Mean jargon proportion per top-level category (bars, highest first) with the
# median overlaid as a dot.
average_jargon = df.groupby("top_category")["jargon_proportion"].mean().sort_values(ascending=False)
# Keep the medians in the same order as the bars. Sorting them independently
# could place a dot on the wrong bar, because a string x value is positioned by
# its order of appearance when the axis has no category mapping of its own.
median_jargon = df.groupby("top_category")["jargon_proportion"].median().reindex(average_jargon.index)
median_df = pd.DataFrame({
    "top_category": median_jargon.index,
    "median_jargon_proportion": median_jargon.values
})

plot = sns.barplot(data=df, x="top_category", y="jargon_proportion", errorbar=("ci", 99),  order=average_jargon.index, palette="rocket")
plt.xticks(rotation=45)
plt.title("Average Jargon Proportion by Reddit Category")
# Legend entry for the error bars, using an explicit proxy line: passing only
# `labels=` would attach the text to whichever artist matplotlib finds first.
plt.legend(title="Error  bars", loc="upper right", handles=[plt.Line2D([], [], color="black", label="Confidence Interval (99%)")])
plt.xlabel("Category")
plt.ylabel("Average Jargon Proportion")
# Dot marking the median of each category
sns.scatterplot(data=median_df, x="top_category", y="median_jargon_proportion", s=100, hue="top_category", palette="rocket_r", legend=False)

In [None]:
# Mean jargon proportion per flair (bars, highest first) with the median
# overlaid as a dot.
plt.figure(figsize=(12, 6))
average_jargon = df.groupby("link_flair_text")["jargon_proportion"].mean().sort_values(ascending=False)
# Keep the medians in the same order as the bars. Sorting them independently
# could place a dot on the wrong bar, because a string x value is positioned by
# its order of appearance when the axis has no category mapping of its own.
median_jargon = df.groupby("link_flair_text")["jargon_proportion"].median().reindex(average_jargon.index)
median_df = pd.DataFrame({
    "link_flair_text": median_jargon.index,
    "median_jargon_proportion": median_jargon.values
})

plot = sns.barplot(data=df, x="link_flair_text", y="jargon_proportion", errorbar=("ci", 99),  order=average_jargon.index, palette="rocket")
plt.xticks(rotation=45)
plt.title("Average Jargon Proportion by Reddit Category")
# Legend entry for the error bars, using an explicit proxy line: passing only
# `labels=` would attach the text to whichever artist matplotlib finds first.
plt.legend(title="Error  bars", loc="upper right", handles=[plt.Line2D([], [], color="black", label="Confidence Interval (99%)")])
plt.xlabel("Category")
plt.ylabel("Average Jargon Proportion")
# Dot marking the median of each flair
sns.scatterplot(data=median_df, x="link_flair_text", y="median_jargon_proportion", s=10, hue="link_flair_text", palette="rocket_r", legend=False)

In [None]:
# Mean score per flair (bars, highest first) with the median overlaid as a dot.
# NOTE: the variable and column names still say "jargon" (kept unchanged from
# the sibling cells), but in this cell they hold score statistics.
plt.figure(figsize=(12, 6))
average_jargon = df.groupby("link_flair_text")["score"].mean().sort_values(ascending=False)
# Keep the medians in the same order as the bars. Sorting them independently
# could place a dot on the wrong bar, because a string x value is positioned by
# its order of appearance when the axis has no category mapping of its own.
median_jargon = df.groupby("link_flair_text")["score"].median().reindex(average_jargon.index)
median_df = pd.DataFrame({
    "link_flair_text": median_jargon.index,
    "median_jargon_proportion": median_jargon.values
})

plot = sns.barplot(data=df, x="link_flair_text", y="score", errorbar=("ci", 99),  order=average_jargon.index, palette="rocket")
plt.xticks(rotation=45)
plt.title("Average Score by Reddit Category")
# Legend entry for the error bars. The label now states what is actually drawn
# (errorbar=("ci", 99)); it previously claimed a 50% percentile interval.
plt.legend(title="Error  bars", loc="upper right", handles=[plt.Line2D([], [], color="black", label="Confidence Interval (99%)")])
plt.xlabel("Category")
plt.ylabel("Average Score")
# Dot marking the median score of each flair
sns.scatterplot(data=median_df, x="link_flair_text", y="median_jargon_proportion", s=10, hue="link_flair_text", palette="rocket_r", legend=False)

# Jargon Distribution

In [None]:
# Number of posts per distinct score value, ordered by score.
score_distribution = df['score'].value_counts().sort_index()
score_values = score_distribution.index
post_counts = score_distribution.values
sns.scatterplot(x=score_values, y=post_counts, s=2)
# Both axes are logarithmic.
plt.yscale('log')
plt.xscale('log')
plt.ylabel('Frequency (log scale)')
plt.xlabel('Score (log scale)')
plt.title('Power-Law Distribution of Scores')

In [None]:
# Filled kernel density estimate of the jargon proportion, one curve per
# top-level category.
sns.displot(
    df,
    x = "jargon_proportion",
    hue="top_category",
    kind="kde",
    fill=True,
)
plt.title('Distribution of Jargon Proportion')

 # Jargon regression and scatters

In [None]:
# Regression of log score on jargon proportion per top-level category, drawn
# over KDE contours of the same data.
# lmplot is figure-level and always opens its own figure, so the 12x6 inch size
# is requested through height/aspect instead of a preceding plt.figure (which
# only produced an extra, empty figure).
sns_plot = sns.lmplot(data=df, x="jargon_proportion", y="log_score", hue="top_category", hue_order=top_categories_order, legend=False, palette=pal_id, scatter=False, height=6, aspect=2)
sns.kdeplot(data=df, x="jargon_proportion", y="log_score", levels=3, alpha=0.5, legend=False, hue_order=top_categories_order, hue="top_category", palette=pal_id, ax=sns_plot.ax)
# One proxy line per category; the palette is sized by the list it colours.
category_colors = sns.color_palette(pal_id, len(top_categories_order))
legend_data = {
    category: plt.Line2D([], [], color=category_color, label=category)
    for category_color, category in zip(category_colors, top_categories_order)
}
# Hand the proxies over through `legend_data`/`label_order`: add_legend already
# passes handles to the figure legend positionally, so an extra `handles=`
# keyword would mix positional and keyword handles.
sns_plot.add_legend(legend_data=legend_data, label_order=list(top_categories_order), title="Science Category")
sns_plot.set_axis_labels("Jargon Proportion", "Log-transformed Score")

In [None]:
# Same as the plot over all posts, restricted to posts with a non-zero jargon
# proportion.
posts_with_jargon = df[df["jargon_proportion"] > 0]
# lmplot is figure-level and always opens its own figure, so the 12x6 inch size
# is requested through height/aspect instead of a preceding plt.figure (which
# only produced an extra, empty figure).
sns_plot = sns.lmplot(data=posts_with_jargon, x="jargon_proportion", y="log_score", hue="top_category", hue_order=top_categories_order, legend=False, palette=pal_id, scatter=False, height=6, aspect=2)
sns.kdeplot(data=posts_with_jargon, x="jargon_proportion", y="log_score", levels=3, alpha=0.5, legend=False, hue_order=top_categories_order, hue="top_category", palette=pal_id, ax=sns_plot.ax)
# One proxy line per category; the palette is sized by the list it colours.
category_colors = sns.color_palette(pal_id, len(top_categories_order))
legend_data = {
    category: plt.Line2D([], [], color=category_color, label=category)
    for category_color, category in zip(category_colors, top_categories_order)
}
# Hand the proxies over through `legend_data`/`label_order`: add_legend already
# passes handles to the figure legend positionally, so an extra `handles=`
# keyword would mix positional and keyword handles.
sns_plot.add_legend(legend_data=legend_data, label_order=list(top_categories_order), title="Science Category")
sns_plot.set_axis_labels("Jargon Proportion", "Log-transformed Score")

In [None]:
# Regression of log score on jargon proportion per domain label, drawn over KDE
# contours of the same data.
# lmplot is figure-level and always opens its own figure, so the 12x6 inch size
# is requested through height/aspect instead of a preceding plt.figure (which
# only produced an extra, empty figure).
sns_plot = sns.lmplot(data=df, x="jargon_proportion", y="log_score", hue="label_voting_manual", hue_order=domain_order_for_hue, legend=False, palette=pal_id, scatter=False, height=6, aspect=2)
sns.kdeplot(data=df, x="jargon_proportion", y="log_score", levels=3, alpha=0.5, legend=False, hue_order=domain_order_for_hue, hue="label_voting_manual", palette=pal_id, ax=sns_plot.ax)
# One proxy line per domain label.
domain_colors = sns.color_palette(pal_id, len(domain_order_for_hue))
legend_data = {
    domain: plt.Line2D([], [], color=domain_color, label=domain)
    for domain_color, domain in zip(domain_colors, domain_order_for_hue)
}
# Hand the proxies over through `legend_data`/`label_order`: add_legend already
# passes handles to the figure legend positionally, so an extra `handles=`
# keyword would mix positional and keyword handles.
sns_plot.add_legend(legend_data=legend_data, label_order=list(domain_order_for_hue), title="Domain Category")
sns_plot.set_axis_labels("Jargon Proportion", "Log-transformed Score")


In [None]:
# Same as the per-domain plot over all posts, restricted to posts with a
# non-zero jargon proportion.
posts_with_jargon = df[df["jargon_proportion"] > 0]
# lmplot is figure-level and always opens its own figure, so the 12x6 inch size
# is requested through height/aspect instead of a preceding plt.figure (which
# only produced an extra, empty figure).
sns_plot = sns.lmplot(data=posts_with_jargon, x="jargon_proportion", y="log_score", hue="label_voting_manual", hue_order=domain_order_for_hue, legend=False, palette=pal_id, scatter=False, height=6, aspect=2)
sns.kdeplot(data=posts_with_jargon, x="jargon_proportion", y="log_score", levels=3, alpha=0.5, legend=False, hue_order=domain_order_for_hue, hue="label_voting_manual", palette=pal_id, ax=sns_plot.ax)
# One proxy line per domain label.
domain_colors = sns.color_palette(pal_id, len(domain_order_for_hue))
legend_data = {
    domain: plt.Line2D([], [], color=domain_color, label=domain)
    for domain_color, domain in zip(domain_colors, domain_order_for_hue)
}
# Hand the proxies over through `legend_data`/`label_order`: add_legend already
# passes handles to the figure legend positionally, so an extra `handles=`
# keyword would mix positional and keyword handles.
sns_plot.add_legend(legend_data=legend_data, label_order=list(domain_order_for_hue), title="Domain Category")
sns_plot.set_axis_labels("Jargon Proportion", "Log-transformed Score")


In [None]:
# Posts from the selected flairs that contain at least some jargon.
# Each condition is built separately: `&` binds tighter than `>`, so the
# unparenthesised form `a.isin(...) & b > 0` is evaluated as `(a.isin(...) & b) > 0`.
in_selected_categories = df["link_flair_text"].isin(selected_categories)
has_jargon = df["jargon_proportion"] > 0
filtered_df = df[in_selected_categories & has_jargon]

# One panel per flair, one regression line per domain label. lmplot opens its
# own figure (sized through `height`), so no plt.figure call precedes it.
sns_plot = sns.lmplot(data=filtered_df, x="jargon_proportion", y="log_score", col="link_flair_text", col_order=selected_categories, col_wrap=2, height=4, hue="label_voting_manual", hue_order=domain_order_for_hue, palette=pal_id, legend=False, scatter=False)

# KDE contours underneath the regression lines
sns_plot.map(sns.kdeplot, "jargon_proportion", "log_score", levels=3, alpha=0.5, legend=False)

# Set axis labels for clarity
sns_plot.set_axis_labels("Jargon Proportion", "Log-transformed Score")
sns_plot.set_titles("{col_name}")

# Legend built from proxy lines, one per domain label. The proxies go through
# `legend_data`/`label_order`: add_legend already passes handles to the figure
# legend positionally, so an extra `handles=` keyword would mix the two.
domain_colors = sns.color_palette(pal_id, len(domain_order_for_hue))
legend_data = {
    domain: plt.Line2D([], [], color=domain_color, label=domain)
    for domain_color, domain in zip(domain_colors, domain_order_for_hue)
}
sns_plot.add_legend(legend_data=legend_data, label_order=list(domain_order_for_hue), title="Domain Category", loc="center right")

# Leave some space above the panels for the figure title
sns_plot.figure.subplots_adjust(top=0.9)

sns_plot.figure.suptitle("Jargon vs. Log-transformed Score by Category")

In [None]:
# One panel per top-level category, one regression line per flair.
#
# Colours are assigned per category (palette index = position of the flair
# within its category) and the very same mapping is used for both the plot and
# the legend. Letting lmplot choose its own colours would spread one palette
# over all flairs at once, which does not match a legend coloured per category.
flair_colors = {}
legend_data = {}
legend_order = []
for cate in top_categories_order:
    flairs = df[df["top_category"] == cate]["link_flair_text"].unique()
    cate_palette = sns.color_palette(pal_id, len(flairs))
    # White (invisible) line acting as a sub-header for the category
    header = f"\n{cate}"
    legend_data[header] = plt.Line2D([], [], color="white", label=header)
    legend_order.append(header)
    for flair_color, flair in zip(cate_palette, flairs):
        if flair in flair_colors:
            # A flair already listed under an earlier category keeps its colour
            continue
        flair_colors[flair] = flair_color
        legend_data[flair] = plt.Line2D([], [], color=flair_color, label=flair)
        legend_order.append(flair)

# lmplot opens its own figure (sized through `height`), so no plt.figure call
# precedes it.
sns_plot = sns.lmplot(data=df, x="jargon_proportion", y="log_score", col="top_category", col_order=top_categories_order, col_wrap=2, height=4, hue="link_flair_text", hue_order=list(flair_colors), palette=flair_colors, legend=False, scatter=False)

# KDE contours underneath the regression lines
sns_plot.map(sns.kdeplot, "jargon_proportion", "log_score", levels=3, alpha=0.5, legend=False)

# Set axis labels for clarity
sns_plot.set_axis_labels("Jargon Proportion", "Log-transformed Score")
sns_plot.set_titles("{col_name}")

# The legend lists flairs grouped by field, so it is titled accordingly. The
# proxies go through `legend_data`/`label_order`: add_legend already passes
# handles to the figure legend positionally.
sns_plot.add_legend(legend_data=legend_data, label_order=legend_order, title="Reddit Categories (Grouped by Field)", loc="center right")

# Leave some space above the panels for the figure title
sns_plot.figure.subplots_adjust(top=0.9)

sns_plot.figure.suptitle("Jargon vs. Log-transformed Score by Category")

In [None]:
def plot_table(data, color, **kwargs):
    """Draw one facet: a regression line of log score on jargon per flair, over KDE contours.

    Meant to be passed to `FacetGrid.map_dataframe`, which supplies the facet's
    rows as `data` and makes the facet's axes current before each call.
    `color` and any extra keyword arguments are accepted for that calling
    convention but are not used.
    """
    ax = plt.gca()
    top_category = data["top_category"].iloc[0]
    link_flairs = list(data["link_flair_text"].unique())
    palette = sns.color_palette(pal_id, len(link_flairs))
    # The axes-level regplot is drawn once per flair. The figure-level lmplot
    # cannot be used inside a facet: it opens a new figure and returns a
    # FacetGrid, which has neither a callable `legend` nor a `set_title`.
    for flair_color, flair in zip(palette, link_flairs):
        flair_rows = data[data["link_flair_text"] == flair]
        if len(flair_rows) < 2:
            # A single point does not determine a regression line
            continue
        sns.regplot(data=flair_rows, x="jargon_proportion", y="log_score", scatter=False, color=flair_color, ax=ax)
    sns.kdeplot(data=data, x="jargon_proportion", y="log_score", levels=3, alpha=0.5, legend=False, hue="link_flair_text", hue_order=link_flairs, palette=palette, ax=ax)
    ax.legend(title=top_category, loc="upper right", handles=[plt.Line2D([], [], color=flair_color, label=flair) for flair_color, flair in zip(palette, link_flairs)])
    ax.set_title(top_category)
    ax.set_ylabel("Log-transformed Score")
    ax.set_xlabel("Jargon Proportion")


g = sns.FacetGrid(df, col="top_category", col_wrap=2, height=4, sharey=True)
g.map_dataframe(plot_table)
unique_top_categories = df["top_category"].unique()

# Combined legend for the whole figure: one white (invisible) sub-header line
# per top-level category, followed by that category's flairs. Flairs are taken
# in order of appearance, matching the colour assignment inside plot_table.
handles = []
for cat in unique_top_categories:
    flairs = df[df["top_category"] == cat]["link_flair_text"].unique()
    palette = sns.color_palette(pal_id, len(flairs))
    handles.append(plt.Line2D([], [], color="white", label=f"\n{cat}"))
    handles.extend(
        plt.Line2D([], [], color=flair_color, label=flair)
        for flair_color, flair in zip(palette, flairs)
    )

# Attach the combined legend to the figure rather than to the current axes, so
# it does not replace the per-facet legend of the last facet. It is anchored in
# the margin freed by subplots_adjust.
g.figure.subplots_adjust(right=0.85)
g.figure.legend(handles=handles, title="Reddit Categories (Grouped by Field)", loc="center left", bbox_to_anchor=(0.86, 0.5), ncol=2)

In [None]:
# Rebuild the grouped flair legend and attach it to the grid held in
# `sns_plot`.
# NOTE(review): `sns_plot` is not created in this cell; this presumably targets
# the faceted lmplot grid from an earlier cell -- confirm.
handles = []
for cate in top_categories_order:
    flairs = df[df["top_category"] == cate]["link_flair_text"].unique()
    cate_palette = sns.color_palette(pal_id, len(flairs))
    # White (invisible) line acting as a sub-header for the category
    handles.append(plt.Line2D([], [], color="white", label=f"\n{cate}"))
    handles.extend(
        plt.Line2D([], [], color=flair_color, label=flair)
        for flair_color, flair in zip(cate_palette, flairs)
    )
# Add the legend once; the second, identical add_legend call only stacked a
# duplicate legend on the figure. The proxies go through
# `legend_data`/`label_order` because add_legend already passes handles to the
# figure legend positionally.
sns_plot.add_legend(
    legend_data={handle.get_label(): handle for handle in handles},
    label_order=[handle.get_label() for handle in handles],
    title="Domain Category",
    loc="center right",
)


In [None]:
# Log score against sensationalism score for "Computer Science" posts, with a
# linear fit and small scatter markers.
computer_science_posts = df[df["link_flair_text"] == "Computer Science"]
sns.lmplot(
    data=computer_science_posts,
    x="sensationalism_score",
    y="log_score",
    scatter_kws={'s':2},
)

In [None]:
# Score shifted by one, so that a score of 0 can be shown on the log-scaled
# y axis of the histogram.
df["score_plus_one"] = df["score"] + 1
computer_science_posts = df[df["link_flair_text"] == "Computer Science"]
# Linear fit with small scatter markers ...
sns.lmplot(
    data=computer_science_posts,
    x="jargon_proportion",
    y="score_plus_one",
    scatter_kws={'s':2},
)
# ... followed by a 2D histogram of the same subset with a log-scaled y axis.
sns.histplot(
    data=computer_science_posts,
    x="jargon_proportion",
    y= "score_plus_one",
    bins=20,
    cbar=True,
    pmax=.3,
    pthresh=.05,
    log_scale=(False, True),
)

In [None]:
# Box plot of the score for each interval of jargon proportion, expressed in
# percent. Intervals are left-closed: [0, 5), [5, 20), ..., [65, 80).

bins = [0, 5, 20, 35, 50, 65, 80]
# Interval labels derived from consecutive bin edges, e.g. "[0, 5)"
labels = [f"[{lower}, {upper})" for lower, upper in zip(bins[:-1], bins[1:])]

jargon_percent = df['jargon_proportion'] * 100
df['jargon_interval'] = pd.cut(jargon_percent, bins=bins, labels=labels, right=False)

# Plot the boxplot on a logarithmic score axis
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='jargon_interval', y='score')

plt.title('Boxplot of Scores for Each Interval of Jargon Proportion')
plt.xlabel('Jargon Proportion Interval (%)')
plt.ylabel('Score')
plt.yscale("log")
plt.xticks(rotation=45)





In [None]:
# Separate regression line of log score on jargon proportion for each jargon
# interval, followed by KDE contours of the same variables.
sns.lmplot(
    data=df,
    x= "jargon_proportion",
    y = "log_score",
    hue='jargon_interval',
    scatter=False,
)
sns.kdeplot(
    data=df,
    x= "jargon_proportion",
    y = "log_score",
    hue='jargon_interval',
    legend=False,
)

In [None]:
# Dashed regression lines of the raw score on jargon proportion, one per jargon
# interval, shown on a logarithmic y axis.
line_kws = {'linestyle': '--', 'linewidth': 2}
# lmplot is figure-level and always opens its own figure, so the 12x6 inch size
# is requested through height/aspect instead of a preceding plt.figure (which
# only produced an extra, empty figure).
sns.lmplot(data=df, x= "jargon_proportion", y = "score", hue='jargon_interval', scatter=False, line_kws=line_kws, height=6, aspect=2)

plt.yscale('log')

# Separate figure: density of the jargon proportion over all posts
plt.figure(figsize=(12, 6))
sns.kdeplot(data=df, x='jargon_proportion')



In [None]:
# How did the share of posts in each top-level category change over the years?
# (The table is computed once; it was previously built twice with identical
# code, and `pd` is already imported at the top of the notebook.)

# Number of posts per (year, top_category)
category_counts = df.groupby(['year', 'top_category']).size().reset_index(name='count')

# Total number of posts per year
total_counts_per_year = df.groupby('year').size().reset_index(name='total_count')

# Attach the yearly total to every (year, top_category) row
category_proportions = pd.merge(category_counts, total_counts_per_year, on='year')

# Proportion of each category within its year
category_proportions['proportion'] = category_proportions['count'] / category_proportions['total_count']

# Display the result
print(category_proportions)

# Plot the lineplot
plt.figure(figsize=(12, 6))
sns.lineplot(data=category_proportions, x='year', y='proportion', hue='top_category', marker='o')
plt.title('Proportion of Posts for Each Category per Year')
plt.xlabel('Year')
plt.ylabel('Proportion of Posts')
plt.legend(title='Top Category', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.show()

In [None]:
# Histogram over years of the "Life Sciences" posts, split by flair and
# normalised so that every bin is filled to the same height.
life_science_posts = df[df['top_category'] == "Life Sciences"]
sns.displot(
    data=life_science_posts,
    x="year",
    hue="link_flair_text",
    palette=pal_id,
    kind="hist",
    multiple="fill",
)

In [None]:
import pandas as pd

# Keep posts with a small but non-zero jargon proportion (0 < p < 0.05) and a
# score above 1.
has_low_jargon = (df['jargon_proportion'] > 0) & (df['jargon_proportion'] < 0.05)
filtered_df = df[has_low_jargon & (df['score'] > 1)]

# Fit on the filtered rows. The filter above was previously computed but the
# regression was drawn on the unfiltered `df`, leaving `filtered_df` unused.
# NOTE(review): assumes the filtered subset is what was meant to be plotted -- confirm.
sns.regplot(data=filtered_df, x='jargon_proportion', y='score')
plt.yscale('log')


