Retrieve data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the preprocessed posts from a CSV file in the working directory.
df = pd.read_csv("redditADHD2012_preprocessed.csv")
# Call head() so the first rows are displayed; the bare attribute `df.head`
# is only the bound method, not the preview.
df.head()

Distribution of post lengths (in number of words) for each label

In [None]:
# Words per post = number of whitespace-separated tokens in `combined_text`.
# Missing text (read_csv turns empty cells into NaN) is counted as 0 words
# instead of raising on a float.
df['word_count'] = df['combined_text'].fillna('').str.split().str.len()

# Half-open bins [0, 50), [50, 100), ..., [750, 800); each bin is labelled
# with its upper edge. pd.cut leaves counts outside [0, 800) as NaN, and the
# groupby below drops those rows.
bins = list(range(0, 850, 50))
labels = [f'{i+50}' for i in bins[:-1]]

df['word_count_bin'] = pd.cut(df['word_count'], bins=bins, labels=labels, right=False)

# Posts per (label, bin). observed=False keeps empty bins so every label has
# a row for every bin.
word_counts_by_label = df.groupby(['label', 'word_count_bin'], observed=False).size()

# Share of each label's binned posts that falls in each bin. transform('sum')
# keeps the (label, bin) index unchanged, unlike groupby.apply, which on
# pandas >= 2.0 prepends the group key and makes reset_index fail on a
# duplicated 'label' level.
word_count_percentage = (
    (100 * word_counts_by_label / word_counts_by_label.groupby(level=0).transform('sum'))
    .reset_index(name='percentage')
)

# Rows: bins, columns: labels, values: percentages (0 where undefined).
word_count_pivot = word_count_percentage.pivot(index='word_count_bin', columns='label', values='percentage').fillna(0)


Plot

In [None]:
# Grouped bar chart: one group per word-count bin, one bar per label column.
colors = ["#7eb0d5", "#fd7f6f", "#b2e061"]
fig, ax = plt.subplots(figsize=(14, 7))
word_count_pivot.plot(kind='bar', ax=ax, color=colors)

ax.set_xlabel('Word Count', fontsize=14)
ax.set_ylabel('Percentage of Posts', fontsize=14)
# NOTE(review): legend names are applied positionally to the pivot's label
# columns; confirm the column order is Neutral / Self-diagnosis / Self-medication.
ax.legend(title='Label', labels=['Neutral', 'Self-diagnosis', 'Self-medication'], fontsize=12)

# Take the tick labels from the pivot that is actually plotted rather than
# from the notebook-global `labels`, which another cell reassigns; this keeps
# the ticks aligned with the bars regardless of cell execution order.
bin_labels = list(word_count_pivot.index)
ax.set_xticks(range(len(bin_labels)))
ax.set_xticklabels(bin_labels, rotation=90)
ax.set_xlim(left=-1, right=len(bin_labels))

ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()

# Show the plot
plt.show()

Maximum word count, mean word count, and the bin at which 99% of posts are covered

In [None]:
# Largest and average word count across the posts.
mean_word_count = df['word_count'].mean()
max_word_count = df['word_count'].max()

# Posts per word-count bin. sort=False skips the frequency sort; the running
# total below relies on the bins staying in ascending order.
total_count_per_bin = df['word_count_bin'].value_counts(sort=False)
cumulative_percentage = total_count_per_bin.cumsum().div(total_count_per_bin.sum()).mul(100)

# Label of the first bin whose cumulative share reaches 99%, or None if no
# bin gets there.
word_reached_99 = cumulative_percentage >= 99
if word_reached_99.any():
    cutoff_bin = cumulative_percentage[word_reached_99].index[0]
else:
    cutoff_bin = None

max_word_count, mean_word_count, cutoff_bin


Distribution of post lengths (in number of characters) for each label

In [None]:
# Characters per post. Missing text (read_csv turns empty cells into NaN) is
# counted as 0 characters instead of raising on a float.
df['char_count'] = df['combined_text'].fillna('').str.len()

# Half-open bins [0, 250), [250, 500), ..., [3750, 4000); each bin is
# labelled with its upper edge. pd.cut leaves counts outside [0, 4000) as NaN,
# and the groupby below drops those rows.
bins = list(range(0, 4250, 250))
labels = [f'{i+250}' for i in bins[:-1]]

df['char_count_bin'] = pd.cut(df['char_count'], bins=bins, labels=labels, right=False)

# Posts per (label, bin). observed=False keeps empty bins so every label has
# a row for every bin.
char_counts_by_label = df.groupby(['label', 'char_count_bin'], observed=False).size()

# Share of each label's binned posts that falls in each bin. transform('sum')
# keeps the (label, bin) index unchanged, unlike groupby.apply, which on
# pandas >= 2.0 prepends the group key and makes reset_index fail on a
# duplicated 'label' level.
char_count_percentage = (
    (100 * char_counts_by_label / char_counts_by_label.groupby(level=0).transform('sum'))
    .reset_index(name='percentage')
)

# Rows: bins, columns: labels, values: percentages (0 where undefined).
char_count_pivot = char_count_percentage.pivot(index='char_count_bin', columns='label', values='percentage').fillna(0)

Plot

In [None]:
# Grouped bar chart: one group per character-count bin, one bar per label column.
colors = ["#7eb0d5", "#fd7f6f", "#b2e061"]

fig, ax = plt.subplots(figsize=(14, 7))
char_count_pivot.plot(kind='bar', ax=ax, color=colors)

ax.set_xlabel('Character Count', fontsize=14)
ax.set_ylabel('Percentage of Posts', fontsize=14)
# NOTE(review): legend names are applied positionally to the pivot's label
# columns; confirm the column order is Neutral / Self-diagnosis / Self-medication.
ax.legend(title='Label', labels=['Neutral', 'Self-diagnosis', 'Self-medication'], fontsize=12)

# Take the tick labels from the pivot that is actually plotted rather than
# from the notebook-global `labels`, which another cell also assigns; this
# keeps the ticks aligned with the bars regardless of cell execution order.
bin_labels = list(char_count_pivot.index)
ax.set_xticks(range(len(bin_labels)))
ax.set_xticklabels(bin_labels, rotation=90)
ax.set_xlim(left=-1, right=len(bin_labels))

ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

Maximum character count, mean character count, and the bin at which 99% of posts are covered

In [None]:
# Largest and average character count across the posts.
mean_char_count = df['char_count'].mean()
max_char_count = df['char_count'].max()

# Posts per character-count bin. sort=False skips the frequency sort; the
# running total below relies on the bins staying in ascending order.
total_count_per_char_bin = df['char_count_bin'].value_counts(sort=False)
cumulative_percentage = total_count_per_char_bin.cumsum().div(total_count_per_char_bin.sum()).mul(100)

# Label of the first bin whose cumulative share reaches 99%, or None if no
# bin gets there.
char_reached_99 = cumulative_percentage >= 99
if char_reached_99.any():
    cutoff_char_bin = cumulative_percentage[char_reached_99].index[0]
else:
    cutoff_char_bin = None

max_char_count, mean_char_count, cutoff_char_bin
