In [31]:
import praw
from praw.models import MoreComments
from pmaw import PushshiftAPI
import datetime as dt

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import emoji

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Fetch the NLTK resources used later in this notebook: 'punkt_tab' backs
# word_tokenize, 'stopwords' backs stopwords.words('english') and 'wordnet'
# backs WordNetLemmatizer.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\xmega\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\xmega\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\xmega\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [32]:
#connect to my reddit app
def connect_to_reddit():
    """Create the PRAW client used by this notebook.

    The app credentials are read from the environment variables
    ``REDDIT_CLIENT_ID`` and ``REDDIT_CLIENT_SECRET`` so that they never end
    up in the saved notebook or in version control.

    Returns:
        praw.Reddit: a client built with only a client id, client secret and
        user agent (no username/password is supplied).

    Raises:
        RuntimeError: if either environment variable is missing or empty.
    """
    # Local import keeps this cell self-contained regardless of the import cell.
    import os

    client_id = os.environ.get("REDDIT_CLIENT_ID")
    client_secret = os.environ.get("REDDIT_CLIENT_SECRET")
    if not client_id or not client_secret:
        raise RuntimeError(
            "Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before connecting to Reddit."
        )

    # NOTE(review): any credential that was previously committed in this
    # notebook should be regenerated in the Reddit app settings.
    reddit = praw.Reddit(
        client_id = client_id,
        client_secret = client_secret,
        user_agent = "reddit scraper"
    )
    return reddit

reddit = connect_to_reddit()

In [33]:
print(reddit.read_only)

True


In [34]:
r_atheism = reddit.subreddit("atheism")

In [None]:
# Listing of the 10 highest-scoring r/atheism posts of all time.
# NOTE(review): .top() appears to return a lazy listing that is consumed as it
# is iterated, so this cell likely has to be re-run before get_post_comment_data
# is called a second time — TODO confirm.
a_submission = r_atheism.top(time_filter = "all", limit = 10)

In [None]:
def get_post_comment_data(reddit_submission):
    """Flatten submissions and their comments into two DataFrames.

    Args:
        reddit_submission: iterable of submission objects exposing ``id``,
            ``title``, ``author``, ``score``, ``url``, ``selftext``,
            ``num_comments`` and a ``comments`` forest.

    Returns:
        tuple: ``(posts_df, comments_df)`` - one row per submission and one
        row per comment. Each comment row carries the id and title of the
        post it belongs to. A falsy author is recorded as "[deleted]".
    """
    post_rows = []
    comment_rows = []

    for post in reddit_submission:
        post_row = {
            "post_id": post.id,
            "post_title": post.title,
            "post_author": "[deleted]" if not post.author else post.author.name,
            "post_score": post.score,
            "post_url": post.url,
            "post_text": post.selftext,
            "num_comments": post.num_comments
        }
        post_rows.append(post_row)

        # limit=None: resolve every "more comments" placeholder before flattening.
        post.comments.replace_more(limit = None)
        for node in post.comments.list():
            # Skip anything in the flattened forest that is not a real comment.
            if not isinstance(node, praw.models.Comment):
                continue
            comment_row = {
                "comment_id": node.id,
                "post_id": post.id,
                "post_title": post.title,
                "comment_author": "[deleted]" if not node.author else node.author.name,
                "comment_body": node.body,
                "comment_score": node.score,
                "comment_parent_id": node.parent_id
            }
            comment_rows.append(comment_row)

    posts_df = pd.DataFrame(post_rows)
    comments_df = pd.DataFrame(comment_rows)
    return posts_df, comments_df

In [None]:
# Build the posts and comments frames from the listing; this walks every
# submission and fully expands its comment tree.
a_posts_df, a_comments_df = get_post_comment_data(a_submission)

# Label each frame; create_edadf reads this back through df.name.
# NOTE(review): this is a plain Python attribute set on the object, so frames
# derived from these (copies, filters) may not carry it — confirm before relying on it.
a_posts_df.name = "Atheism Posts DF"
a_comments_df.name = "Atheism Comments DF"


## Exploratory Data Analysis

In [None]:
# head of data set
a_posts_df.head()

In [None]:
a_posts_df.info()

In [None]:
a_comments_df.head()

In [None]:
a_comments_df.info()

In [None]:
def create_edadf(*dataframes):
    """Summarise one or more DataFrames for a quick EDA overview.

    Args:
        *dataframes: pandas DataFrames, each optionally labelled through a
            ``name`` attribute.

    Returns:
        pandas.DataFrame: one row per input frame with the columns
        ``name`` (the frame's ``name`` attribute, or None when it has none),
        ``num_rows``, ``num_cols`` and ``contains_null``. Empty or
        whitespace-only strings count as missing for ``contains_null``.
        The input frames are not modified.
    """
    summary_rows = []
    for df in dataframes:
        # DataFrame.replace returns a new frame, so the result must be kept;
        # the null check below runs on it rather than on the untouched input.
        blanks_as_nan = df.replace(r'^\s*$', np.nan, regex=True)
        summary_rows.append({
            "name": getattr(df, "name", None),
            "num_rows": df.shape[0],
            "num_cols": df.shape[1],
            "contains_null": blanks_as_nan.isnull().any(axis = None)
        })
    return pd.DataFrame(summary_rows)
    
# One-row summaries (name, row/column counts, null flag) for each frame.
p_eda_df = create_edadf(a_posts_df)
c_eda_df = create_edadf(a_comments_df)

In [None]:
p_eda_df

In [None]:
c_eda_df

In [None]:
a_posts_df.info()

In [None]:
a_comments_df.info()

In [None]:
#dummy set until i get api working
# Stand-in post titles; the trailing empty strings are unfilled slots.
post_title = ["My parents just told me I’m going through a phase",
              "Forced to attend a 3-hour Easter service. How do I survive?",
              " Just realized I don't fear death anymore. It’s actually peaceful.",
              "Coming out as an atheist in the Bible Belt is basically social suicide.",
              "Dating as an atheist is a minefield.",
              " Anyone else get 'The Look' when they say they don't pray?",
              "My kid asked why the neighbors go to the big stone house on Sundays",
              "I’m tired of being told I have no moral compass.",
              "Religious wellness influencers are driving me insane.",
              "How do you handle a religious funeral for someone who was an atheist?",
              "The Problem of Evil is still the undefeated champion of arguments.",
              "Why is 'I don't know' so scary to people?",
              "Pascal’s Wager is a logical disaster.",
              "Divine Hiddenness: If God wanted me to believe, He’d know exactly what evidence would convince me.",
              "The 'Fine-Tuning' argument is just survivorship bias on a cosmic scale.",
              "If you were born in a different country, you’d have a different one true god.",
              "Burdens of proof: Why do they always try to flip it?",
              "Biblical prophecies are just vague enough to fit anything.",
              "Why is the First Cause always a sentient Being?",
              "Subjective experiences aren't evidence.",
              "Tax the churches already.",
              "'Under God' in the Pledge of Allegiance still irritates me.",
              "Why is it freedom of religion but never freedom FROM religion?",
              "The indoctrination of children should be considered a form of abuse.",
              "Religious exemptions for vaccines are a public health nightmare.",
              "Christian Nationalism is the biggest threat to Western democracy.",
              "Science flies you to the moon; religion flies you into buildings.",
              "Why does the media always frame atheism as 'aggressive'?",
              "The hypocrisy of 'pro-life' groups is staggering.",
              "",
              "",
              ""]

# Every column is sized from the title list so the dict can be passed straight
# to pd.DataFrame. Keys mirror the post columns built by get_post_comment_data.
# Only the titles carry real content; the other fields are neutral placeholders
# because no `submission` object exists outside the scraping loop.
n_dummy_posts = len(post_title)

dummy_data = {
    "post_id": np.arange(1, n_dummy_posts + 1, 1),
    "post_title": post_title,
    "post_author": ["[deleted]"] * n_dummy_posts,
    "post_score": np.zeros(n_dummy_posts, dtype = int),
    "post_url": [""] * n_dummy_posts,
    "post_text": [""] * n_dummy_posts,
    "num_comments": np.zeros(n_dummy_posts, dtype = int)
}

## Preprocess text

In [None]:
def preprocess_df(df, type = "post_title"):
    """Return a copy of ``df`` whose ``type`` column holds cleaned text.

    Each value in the column is processed in this order: emojis removed;
    newlines (real or written as a literal backslash-n), ':' and '_' replaced
    by a space; every character that is not a letter, digit or whitespace
    dropped; lower-cased; tokenized; English stop words removed; remaining
    tokens lemmatized and re-joined with single spaces.

    Args:
        df: DataFrame containing the text column. It is left unmodified so
            that re-running the calling cell always starts from raw text.
        type: name of the text column to clean (the name shadows the builtin
            but is kept because callers pass it by keyword).

    Returns:
        pandas.DataFrame: a copy of ``df`` with the ``type`` column replaced
        by the cleaned strings.
    """
    # Built once per call: the stop-word list and the lemmatizer do not depend
    # on the row being processed, and a set gives constant-time membership tests.
    stop_words = set(stopwords.words('english'))
    lmtzr = WordNetLemmatizer()

    #CLEAN TEXT
    processed_texts = []
    for sentence in df[type]:
        #remove emojis (on this sentence only, not the whole column)
        sentence = emoji.replace_emoji(sentence, '')
        #remove new lines (real or escaped), colons and underscores
        sentence = re.sub(r'\\n|[\n:_]', ' ', sentence)
        #remove punctuations
        sentence = re.sub(r'[^a-zA-Z0-9\s]', '', sentence)
        #convert all text to lower case
        sentence = sentence.lower()
        #remove stop words
        # NOTE(review): punctuation is already gone at this point, so
        # contractions such as "don't" arrive as "dont" and are only removed
        # if that exact form is in the stop-word list.
        tokens = word_tokenize(sentence)
        filtered_tokens = [word for word in tokens if word not in stop_words]

        #lemmatize tokens
        l_words = [lmtzr.lemmatize(word) for word in filtered_tokens]

        processed_texts.append(' '.join(l_words))

    cleaned_df = df.copy()
    cleaned_df[type] = processed_texts
    return cleaned_df

In [None]:
test = preprocess_df(a_comments_df, type = "comment_body")

In [None]:
# Keep only the cleaned comment bodies that are not exactly 'deleted'.
# preprocess_df strips punctuation and lower-cases, so a "[deleted]" body
# should arrive here as 'deleted'.
# NOTE(review): "[removed]" bodies and empty strings are not filtered out —
# confirm that is intended.
test_c = test[test["comment_body"] != 'deleted']["comment_body"]
test_c

## Feature Extraction

In [None]:
# Learn a TF-IDF vocabulary from the cleaned comment bodies and transform them
# in one step; X has one row per comment and one column per vocabulary term.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(test_c)

In [None]:
X.shape

In [None]:
# Convert the TF-IDF matrix to a dense array and label its columns with the
# vocabulary terms.
# NOTE(review): toarray() allocates the full comments x terms matrix — confirm
# it fits in memory for the complete comment set.
X_array = X.toarray()
feature_names = vectorizer.get_feature_names_out()

df_tfidf = pd.DataFrame(X_array, columns = feature_names)
# Show the rows whose TF-IDF weight for the token '00' is exactly 1.
# NOTE(review): this raises KeyError when '00' is not in the fitted vocabulary.
df_tfidf[df_tfidf['00'] == 1]