In [1]:
# import required packages
import json
import csv
import spacy
import numpy as np

In [2]:
# Load the spaCy "en_core_web_lg" pipeline; `nlp` is called on every comment
# in the data-processing loop to split the text into tokens.
nlp = spacy.load("en_core_web_lg")

# Default stop words defined by the loaded spaCy language model.
# Kept for inspection only: the processing loop checks `token.is_stop`
# instead of looking tokens up in this variable.
stopwords = nlp.Defaults.stop_words
# print(stopwords)  # uncomment to inspect the stop words


# Accumulators for the corpus statistics; all of them are filled in by the
# data-processing loop and read by the report at the end.

# number of comments processed
num_of_comments = 0
# lower-cased tokens that are not stop words; a set does not allow duplicates,
# so its size is the number of unique words
unique_word = set()
# token count of every comment; a list keeps the individual values so that
# corpus-level statistics (mean, max, min, median) can be computed
num_of_token_per_comment = []
# same as above, but counting only the tokens that are not stop words
num_of_token_per_comment_without_stop_words = []
# total number of tokens in the whole corpus
total_number_of_tokens = 0
# lower-cased author names; a set does not allow duplicates, so its size is
# the number of unique authors
unique_author = set()
# timestamp of every comment (e.g. to count comments per day or per week)
time_stamp_list = []
# number of comments whose "reply_to" field is not "-"
reply_count = 0
# thread ids; a set does not allow duplicates, so its size is the number of
# unique submissions
unique_submission = set()

# data processing
# Every line of "CovidVaccinated.json" holds one comment as a JSON object (the
# PRAW collection code was customized to store the data this way). The keys
# read below are "comment_text", "author_name", "timestamp", "reply_to" and
# "thread_id".
# The encoding is passed explicitly so that decoding does not depend on the
# platform's default text encoding.
with open("CovidVaccinated.json", "r", encoding="utf-8") as file:
    for line in file:  # for each data point, do the following
        if not line.strip():
            # tolerate blank lines (e.g. a trailing newline at the end of the
            # file), which json.loads would otherwise reject
            continue
        line_data = json.loads(line)
        text = line_data["comment_text"]
        doc = nlp(text)
        num_of_comments += 1

        # statistics regarding words
        # NOTE(review): len(doc) is the number of spaCy tokens, which
        # presumably includes punctuation tokens -- confirm that this is the
        # intended meaning of "words".
        num_of_tokens = len(doc)
        total_number_of_tokens += num_of_tokens
        num_of_token_per_comment.append(num_of_tokens)
        # lower-cased text of the tokens that are not stop words
        non_stop_tokens = [token.text.lower() for token in doc if not token.is_stop]
        unique_word.update(non_stop_tokens)
        token_count_without_stop_words = len(non_stop_tokens)
        num_of_token_per_comment_without_stop_words.append(token_count_without_stop_words)

        # statistics regarding authors (lower-cased, so the same name written
        # with different casing is counted once)
        author_name = line_data["author_name"]
        unique_author.add(author_name.lower())

        # statistics regarding time_stamp
        time_stamp = line_data["timestamp"]
        time_stamp_list.append(time_stamp)

        # statistics regarding replies: a comment whose "reply_to" is "-" is
        # not counted as a reply
        reply_to = line_data["reply_to"]
        if reply_to != "-":
            reply_count += 1

        # statistics regarding submissions
        thread_id = line_data["thread_id"]
        unique_submission.add(thread_id)

# statistics
# Report the corpus statistics collected by the data-processing loop.
# The per-comment counts are converted to arrays once and reused for every
# statistic instead of being rebuilt for each print.
token_counts = np.asarray(num_of_token_per_comment)
token_counts_without_stop_words = np.asarray(num_of_token_per_comment_without_stop_words)

print("number of comments:", num_of_comments)
print("number of unique words:", len(unique_word))
print("total number of words in the corpus:", total_number_of_tokens)
if token_counts.size:
    print("average number of words in comments:", np.mean(token_counts))
    print("average number of words in comments without stop words:", np.mean(token_counts_without_stop_words))
    print("maximum number of words in comments:", np.max(token_counts))
    print("maximum number of words in comments without stop words:", np.max(token_counts_without_stop_words))
    print("minimum number of words in comments:", np.min(token_counts))
    print("minimum number of words in comments without stop words:", np.min(token_counts_without_stop_words))
    print("median number of words in comments:", np.median(token_counts))
    print("median number of words in comments without stop words:", np.median(token_counts_without_stop_words))
else:
    # With no comments, np.max / np.min raise ValueError on the empty array and
    # np.mean / np.median only yield nan, so the per-comment statistics are
    # skipped instead of aborting the report.
    print("no comments were read; per-comment word statistics are not available")
print("number of unique authors:", len(unique_author))
print("number of comments replying to other comments:", reply_count)
print("number of sumbissions:", len(unique_submission))

number of comments: 10865
number of unique words: 16895
total number of words in the corpus: 569150
average number of words in comments: 52.38380119650253
average number of words in comments without stop words: 25.849700874367233
maximum number of words in comments: 1398
maximum number of words in comments without stop words: 876
minimum number of words in comments: 1
minimum number of words in comments without stop words: 0
median number of words in comments: 32.0
median number of words in comments without stop words: 15.0
number of unique authors: 2736
number of comments replying to other comments: 6430
number of sumbissions: 866
