# Data Characterization

In [None]:
import pandas as pd # type: ignore
# Load the dataset into a DataFrame; the path is relative to the current working directory
dd = pd.read_csv("Dataset/dataset.csv")

In [None]:
# Get most used words in the body column

from collections import Counter
import re

def get_most_used_words(df, n):
    """Return the ``n`` most frequent whitespace-separated words in ``df['body']``.

    Words are counted exactly as they appear (case-sensitive, punctuation
    kept). Rows whose body is missing (NaN/None) are skipped.

    Args:
        df: DataFrame with a ``body`` column holding the post texts.
        n: Number of entries to return.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first. The list is
        empty when there are no words to count.
    """
    # Missing bodies are not strings (pandas represents them as NaN/None), and
    # joining them would raise TypeError, so drop them before tokenizing.
    bodies = df['body'].dropna().astype(str)
    # Counting row by row gives the same result as joining everything into one
    # string first, without building that large intermediate string.
    word_counts = Counter()
    for body in bodies:
        word_counts.update(body.split())
    return word_counts.most_common(n)

# Show the 10 most frequent words in the body column as (word, count) pairs
most_used_words = get_most_used_words(dd, 10)
print(most_used_words)

In [None]:
# Get the most used words in the body column that are not stop words
import nltk
from nltk.corpus import stopwords

def get_most_used_words_without_stop_words(df, n):
    """Return the ``n`` most frequent non-stop-words in ``df['body']``.

    Words are whitespace-separated tokens counted exactly as they appear;
    a token is discarded when its lowercase form is an NLTK English stop
    word. Rows whose body is missing (NaN/None) are skipped.

    Args:
        df: DataFrame with a ``body`` column holding the post texts.
        n: Number of entries to return.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.

    Note:
        Relies on the NLTK ``stopwords`` corpus being available locally
        (presumably installed beforehand via ``nltk.download('stopwords')``).
    """
    stop_words = set(stopwords.words('english'))
    # Missing bodies are not strings and would break tokenizing, so drop them.
    word_counts = Counter()
    for body in df['body'].dropna().astype(str):
        word_counts.update(body.split())
    # NLTK's English stop-word list is lowercase, so compare on the lowercase
    # form; otherwise capitalized stop words such as "The" or "I" slip through.
    kept = [
        (word, count)
        for word, count in word_counts.most_common()
        if word.lower() not in stop_words
    ]
    return kept[:n]

# Show the 10 most frequent non-stop-words in the body column as (word, count) pairs
most_used_words_without_stop_words = get_most_used_words_without_stop_words(dd, 10)
print(most_used_words_without_stop_words)


In [None]:
# Get the months with the most posts (counts are aggregated across all years)
import calendar

def get_months_with_most_posts(df):
    """Return month names ordered from the most to the fewest posts.

    The month is read as the second '-'-separated field of each
    ``creation_date`` string, so posts are tallied per month regardless
    of the year they belong to.
    """
    month_field = df['creation_date'].str.split('-').str[1]
    tally = Counter(month_field)
    # most_common() with no argument yields every month, highest count first
    ranked_numbers = [number for number, _ in tally.most_common()]
    return [calendar.month_name[int(number)] for number in ranked_numbers]

# Show the month names ordered from the most to the fewest posts
months_with_most_posts = get_months_with_most_posts(dd)
print(months_with_most_posts)

# Plot the number of posts per month as a bar chart, in calendar order
import matplotlib.pyplot as plt

# Count posts per month; the month is the second '-'-separated field of creation_date
months = dd['creation_date'].str.split('-').str[1]
month_counts = Counter(months)
# A Counter keeps keys in order of first appearance in the data, which is not
# calendar order, so sort the month numbers numerically (January..December).
month_numbers = sorted(month_counts, key=int)
# Month names for the x axis and the matching post counts for the bar heights
months = [calendar.month_name[int(month)] for month in month_numbers]
counts = [month_counts[month] for month in month_numbers]
plt.bar(months, counts)
plt.xticks(rotation=45)
plt.xlabel('Month')
plt.ylabel('Number of Posts')
plt.title('Number of Posts per Month')
plt.show()


In [None]:
def get_years_in_order_of_most_posts(df):
    """Return the years ordered from the most to the fewest posts.

    The year is read as the first '-'-separated field of each
    ``creation_date`` string and is returned as that string.
    """
    year_field = df['creation_date'].str.split('-').str[0]
    # most_common() with no argument yields every year, highest count first
    ranking = Counter(year_field).most_common()
    return [entry[0] for entry in ranking]

# Show the years ordered from the most to the fewest posts
years_in_order_of_most_posts = get_years_in_order_of_most_posts(dd)
print(years_in_order_of_most_posts)

In [None]:
# Summary statistics for the num_comments column: max, min, mode and mean
comment_counts = dd['num_comments']
max_comments, min_comments = comment_counts.max(), comment_counts.min()
# Series.mode() returns a Series, since several values can tie for most frequent
mode_comments = comment_counts.mode()
mean_comments = comment_counts.mean()
print(max_comments, min_comments, mode_comments, mean_comments)

In [None]:
# Lowest and average upvote ratio across all rows of the dataset
upvote_ratio = dd['upvote_ratio']
min_ratio, mean_ratio = upvote_ratio.min(), upvote_ratio.mean()

print(min_ratio, mean_ratio)

In [None]:
# Number of unique values per column
unique_counts = dd.nunique()

# Missing values per column
missing_values = dd.isnull().sum()

# Value counts for categorical columns.
# The subreddit column is addressed as 'SubReddit' throughout this notebook;
# column labels are case-sensitive, so a lowercase 'subreddit' lookup would
# raise KeyError unless such a column also exists.
author_counts = dd['author'].value_counts()
subreddit_counts = dd['SubReddit'].value_counts()

# Display the results
print("\nNumber of Unique Values per Column:\n", unique_counts)
print("\nMissing Values per Column:\n", missing_values)
print("\nAuthor Counts:\n", author_counts)
print("\nSubreddit Counts:\n", subreddit_counts)

In [None]:
def get_mean_stats_per_subreddit(df):
    """Return per-subreddit means of ``num_comments``, ``upvote_ratio`` and ``score``.

    Rows are grouped by the ``SubReddit`` column; the result is a DataFrame
    indexed by subreddit with one column per averaged metric.
    """
    metric_columns = ['num_comments', 'upvote_ratio', 'score']
    by_subreddit = df.groupby('SubReddit')
    return by_subreddit[metric_columns].mean()

# Show the mean number of comments, upvote ratio, and score for each subreddit
mean_stats_per_subreddit = get_mean_stats_per_subreddit(dd)
print(mean_stats_per_subreddit)
