In [1]:
import os
import re
import nltk
import spacy
import random
import neuralcoref
import numpy as np
import pandas as pd
from tqdm import tqdm
from nltk.stem import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# Preparation

In [2]:
wnl = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

In [3]:
nlp = spacy.load('en_core_web_sm')
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7fa4dfae9ee0>

# The SMMRY Algorithm

In [4]:
transition_phrases = ['thus', 'for example', 'for instance', 'namely', 'to illustrate',
                      'in other words', 'in particular', 'specifically', 'such as',
                      'on the contrary', 'contrarily', 'notwithstanding', 'but', 'however',
                      'nevertheless', 'in spite of', 'in contrast', 'yet', 'on one hand',
                      'on the other hand', 'rather', 'or', 'nor', 'conversely', 'at the same time',
                      'while this may be true', 'and', 'in addition to', 'furthermore',
                      'moreover', 'besides', 'than', 'too', 'also', 'both-and', 'another',
                      'equally important', 'second', 'etc.', 'again', 'further', 'last',
                      'finally', 'not only-but also', 'as well as', 'in the second place',
                      'next', 'likewise', 'similarly', 'in fact', 'as a result', 'consequently',
                      'in the same way', 'for example', 'for instance', 'however', 'thus',
                      'therefore', 'otherwise', 'after that', 'afterward', 'then', 'next',
                      'last', 'at last', 'at length', 'at first', 'formerly', 'another', 'finally',
                      'meanwhile', 'at the same time', 'afterwards', 'subsequently',
                      'in the meantime', 'eventually', 'concurrently', 'simultaneously', 'although',
                      'at least', 'still', 'even though', 'granted that', 'while it may be true',
                      'in spite of', 'of course', 'similarly', 'likewise', 'in like fashion',
                      'in like manner', 'analogous to', 'above all', 'indeed', 'of course',
                      'certainly', 'surely', 'in fact', 'really', 'in truth', 'again', 'besides',
                      'also', 'furthermore', 'in addition', 'specifically', 'especially',
                      'in particular', 'to explain', 'to list', 'to enumerate', 'in detail',
                      'namely', 'including', 'for example', 'for instance', 'to illustrate',
                      'thus', 'in other words', 'as an illustration', 'in particular', 'so that',
                      'with the result that', 'consequently', 'hence', 'accordingly', 'for this reason',
                      'therefore', 'because', 'due to', 'as a result', 'in other words', 'then',
                      'therefore', 'finally', 'consequently', 'thus', 'in conclusion', 'as a result',
                      'accordingly', 'for this purpose', 'to this end', 'with this in mind',
                      'with this purpose in mind', 'therefore']

In [5]:
def transition_start(first_sent, dialog_turn):
    if dialog_turn == 1:
        for phrase in transition_phrases:
            if first_sent.lower().startswith(phrase):
                return True
        return False
    else:
        return False

In [6]:
def smmry(text, doc, sent_count, dialog_turn):

    # some preprocessing to omit text within brackets and replace u with you. 
    
    # text = re.sub("[\(\[].*?[\)\]]", "", text)
    # text = text.replace(' u ', ' you ')

    formatted_text = re.sub('[^a-zA-Z]', ' ', text)
    formatted_text = re.sub(r'\s+', ' ', formatted_text)

    # doc = nlp(text)

    fdist = {}
    word_arr = nltk.word_tokenize(formatted_text.lower())

    # preparing a frequency dictionary without considering stop words
    
    for word in word_arr:
        if not word in stop_words:
            word = wnl.lemmatize(word)
            if word not in fdist.keys():
                    fdist[word] = 1
            else:
                    fdist[word] += 1

    sent_arr = nltk.sent_tokenize(text)
    sent_score_arr = []
    summary_arr = []

    sent_arr_coref_resolved = nltk.sent_tokenize(doc._.coref_resolved)

    # compute scores for each sentence

    for sent in sent_arr:
        score = 0
        token_arr = nltk.word_tokenize(sent.lower())
        for word in token_arr:
            word = wnl.lemmatize(word)
            if word in fdist.keys():
                score += fdist[word]

        sent_score_arr.append(score/len(token_arr))

    sent_score_arr = np.array(sent_score_arr)

    all_ind_arr = sent_score_arr.argsort()[-len(sent_score_arr):][::-1]

    ind_arr_unsorted = sent_score_arr.argsort()[-sent_count:][::-1]

    ind_arr = np.sort(ind_arr_unsorted) 

    summary = ''
    changed_first = False

    if len(ind_arr) > 0:

        try:

            ind = ind_arr[0]
            first_sent = sent_arr[ind]

            while (first_sent != sent_arr_coref_resolved[ind] or transition_start(first_sent, dialog_turn)):
                changed_first = True
                for index in all_ind_arr:
                    if index < ind:
                        ind = index
                        break
                first_sent = sent_arr[ind]
                if ind == 0:
                    break
            summary = summary + first_sent + ' '     
            
            if (changed_first):
                first_ind = ind
                sent_score_modified = sent_score_arr[first_ind+1:]
                ind_arr_unsorted = sent_score_modified.argsort()[-(sent_count-1):][::-1]
                ind_arr_next = np.sort(ind_arr_unsorted) 
                
                for i in range(0, len(ind_arr_next)):
                    ind = (first_ind+1) + ind_arr_next[i]
                    if i == len(ind_arr_next) - 1:
                        summary = summary + sent_arr[ind]
                    else:
                        summary = summary + sent_arr[ind] + ' '
            
            else:
                for i in range(1, len(ind_arr)):
                    ind = ind_arr[i]
                    if i == len(ind_arr) - 1:
                        summary = summary + sent_arr[ind]
                    else:
                        summary = summary + sent_arr[ind] + ' '

            return summary

        except Exception as e:

            print("EXCEPTION occured")
            return text

    else:
        print(text)
        print(sent_arr)
        print("EXCEPTION occured: length of sentence array is not > 0")
        return text

# Data Cleaning Functions

In [7]:
# aux_verbs = ['be', 'can', 'could', 'dare', 'do', 'have', 'may', 'might', 'must',
#              'need', 'ought', 'shall', 'should', 'will', 'would']
# wh_words = ['what', 'when', 'where', 'which', 'who', 'whom', 'whose', 'why', 'how']
# q_words = aux_verbs + wh_words

In [8]:
def preprocess_raw(text):
    # Check if text is a str
    if not isinstance(text, str):
        return None

    # Replace HTML escape chars
    text = text.replace('&gt;', '>')
    text = text.replace('&lt;', '<')
    text = text.replace('&amp;', '&')
    text = text.replace('#x200B;', ' ')
    text = text.replace('nbsp;', ' ')

    # Remove brackets
    b_pattern = re.compile(r'(\([^\(\)]*\))|(\[[^\[\]]*\])')
    while b_pattern.search(text):
        text = re.sub(r'(\([^\(\)]*\))|(\[[^\[\]]*\])', '', text)

    # Remove redundant spaces (including breaklines)
    text = ' '.join(text.split())

    # Check if text is empty
    if not text:
        return None

    text_lower = text.lower()

    # Check if text is [deleted] or [removed]
    if text_lower == '[deleted]' or text_lower == '[removed]':
        return None

    # Check if text contains URL
    url_pattern = re.compile(r'[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
    if url_pattern.search(text_lower):
        return None

    # Check if text contains 'r/<subreddit>' or 'u/<username>'
    r_pattern = re.compile(r'(^| )\/?r\/[^ ]*')
    if r_pattern.search(text_lower):
        return None
    u_pattern = re.compile(r'(^| )\/?u\/[^ ]*')
    if u_pattern.search(text_lower):
        return None

    # Check if text contains 'reddit'
    if 'reddit' in text_lower:
        return None

    # Check the percentage of alphabetical letters
    num_alphas = 0
    for ch in text:
        if ch.isalpha():
            num_alphas += 1
    if num_alphas / len(text) < 0.7:
        return None

    doc = nlp(text)

    # Check the number of tokens
    if len(doc) < 2:
        return None

    return {'text': text, 'doc': doc}

In [9]:
def preprocess_summary(text):
    # Check if text is a str
    if not isinstance(text, str):
        return None

    # Remove redundant spaces (including breaklines)
    text = ' '.join(text.split())

    # Check if text is empty
    if not text:
        return None

    # Check the percentage of alphabetical letters
    num_alphas = 0
    for ch in text:
        if ch.isalpha():
            num_alphas += 1
    if num_alphas / len(text) < 0.7:
        return None

    doc = nlp(text)

    # Check the number of tokens
    if len(doc) < 2:
        return None

    return {'text': text, 'doc': doc}

In [10]:
def extract_root(text, sent):
#     # Check if the last character is a question mark
#     if sent[-1].text == '?':
#         return None

    if sent.root.pos_ == 'VERB':
#         # Check the first token
#         if sent[0].lemma_.lower() in q_words:
#             return None
        return sent.root.lemma_

    return None

In [11]:
def summarize(preprocessed_text, dialog_turn):
    if preprocessed_text is None:
        return None

    text = preprocessed_text['text']
    doc = preprocessed_text['doc']

    summarized = 0
    sents = [sent for sent in doc.sents]
    if len(sents) > 1:
        summarized = 1
        summary = smmry(text, doc, 1, dialog_turn)
        preprocessed_summary = preprocess_summary(summary)
        if preprocessed_summary is None:
            return None
        summarized_text = preprocessed_summary['text']
        summarized_doc = preprocessed_summary['doc']
        summarized_sents = [sent for sent in summarized_doc.sents]
        if len(summarized_sents) != 1:
            return None
    elif len(sents) == 1:
        summarized_text = text
        summarized_doc = doc
        summarized_sents = sents
    else:
        return None

    if dialog_turn > 1:
        return {'text': summarized_text, 'summarized': summarized, 'length': len(summarized_sents[0])}

    root = extract_root(summarized_text, summarized_sents[0])
    if root is not None:
        return {'text': summarized_text, 'summarized': summarized, 'root': root, 'length': len(summarized_sents[0])}
    else:
        return None

# Filter Submissions

In [34]:
date_suffix = '20200101_20201231'

In [35]:
submission_df = pd.read_csv('../data/reddit/raw/casual_conv_submissions_{}.csv'.format(date_suffix))
submission_df.shape

(97279, 10)

In [36]:
submission_df.head()

Unnamed: 0,id,created_utc,title,selftext,num_comments,subreddit,subreddit_id,score,author_fullname,link_flair_text
0,ko0y56,1609458941,Fireworks is banned for this newyear here and ...,[removed],0,CasualConversation,t5_323oy,1,t2_4l6jcegg,
1,ko0xfy,1609458866,HAPPY 2021 from Italy!!!,"To those that had it rough,\nTo the ones that ...",17,CasualConversation,t5_323oy,1,t2_5quydde8,:chat: Just Chatting
2,ko0ue3,1609458544,Positivity Blog/Community,[removed],2,CasualConversation,t5_323oy,1,t2_2t9bipf6,
3,ko0uby,1609458537,Can i get some last minute advice?,I know mental health is apparently against the...,13,CasualConversation,t5_323oy,1,t2_7qyyey38,:question: Questions
4,ko0tyb,1609458499,New Year,Hello everyone!\nJust wanted to wish everyone ...,1,CasualConversation,t5_323oy,1,t2_43cjc0iu,:chat: Just Chatting


In [37]:
submission_filtered_cols = ['id', 'summarized', 'from', 'text', 'root', 'length']
submission_filtered_dict = {col: [] for col in submission_filtered_cols}

In [38]:
for i in tqdm(range(submission_df.shape[0])):
    submission_id = submission_df.iloc[i]['id']
    title = submission_df.iloc[i]['title']
    preprocessed_title = preprocess_raw(title)
    summarized_title = summarize(preprocessed_title, dialog_turn = 1)
    if summarized_title is not None:
        submission_filtered_dict['id'].append(submission_id)
        submission_filtered_dict['summarized'].append(summarized_title['summarized'])
        submission_filtered_dict['from'].append('title')
        submission_filtered_dict['text'].append(summarized_title['text'])
        submission_filtered_dict['root'].append(summarized_title['root'])
        submission_filtered_dict['length'].append(summarized_title['length'])
    else:
        selftext = submission_df.iloc[i]['selftext']
        preprocessed_selftext = preprocess_raw(selftext)
        summarized_selftext = summarize(preprocessed_selftext, dialog_turn = 1)
        if summarized_selftext is not None:
            submission_filtered_dict['id'].append(submission_id)
            submission_filtered_dict['summarized'].append(summarized_selftext['summarized'])
            submission_filtered_dict['from'].append('selftext')
            submission_filtered_dict['text'].append(summarized_selftext['text'])
            submission_filtered_dict['root'].append(summarized_selftext['root'])
            submission_filtered_dict['length'].append(summarized_selftext['length'])

  8%|████████                                                                                                    | 7296/97279 [03:00<23:53, 62.78it/s]

EXCEPTION occured


 11%|███████████▌                                                                                               | 10515/97279 [04:17<22:04, 65.52it/s]

EXCEPTION occured


 18%|██████████████████▉                                                                                        | 17161/97279 [07:03<34:06, 39.14it/s]

EXCEPTION occured


 21%|█████████████████████▉                                                                                     | 19950/97279 [08:19<30:33, 42.17it/s]

EXCEPTION occured


 26%|███████████████████████████▊                                                                               | 25328/97279 [10:38<38:29, 31.15it/s]

EXCEPTION occured


 32%|██████████████████████████████████                                                                         | 30960/97279 [13:00<19:27, 56.82it/s]

EXCEPTION occured


 34%|███████████████████████████████████▌                                                                     | 32908/97279 [13:47<1:07:58, 15.78it/s]

EXCEPTION occured


 34%|████████████████████████████████████▊                                                                      | 33489/97279 [14:03<21:42, 48.97it/s]

EXCEPTION occured


 36%|██████████████████████████████████████▋                                                                    | 35134/97279 [14:49<19:10, 54.01it/s]

EXCEPTION occured


 45%|███████████████████████████████████████████████▌                                                         | 44082/97279 [18:21<1:03:18, 14.01it/s]

EXCEPTION occured


 64%|████████████████████████████████████████████████████████████████████▌                                      | 62281/97279 [26:09<26:49, 21.75it/s]

EXCEPTION occured


 75%|████████████████████████████████████████████████████████████████████████████████▍                          | 73147/97279 [30:44<08:10, 49.21it/s]

EXCEPTION occured


 77%|██████████████████████████████████████████████████████████████████████████████████▌                        | 75009/97279 [31:38<11:39, 31.86it/s]

EXCEPTION occured


 79%|████████████████████████████████████████████████████████████████████████████████████▉                      | 77265/97279 [32:38<05:40, 58.72it/s]

EXCEPTION occured


 91%|█████████████████████████████████████████████████████████████████████████████████████████████████▎         | 88514/97279 [37:52<04:20, 33.61it/s]

EXCEPTION occured


 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍| 96721/97279 [41:33<00:09, 60.34it/s]

EXCEPTION occured


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 97279/97279 [41:46<00:00, 38.82it/s]


In [39]:
submission_filtered_df = pd.DataFrame(submission_filtered_dict)

In [40]:
submission_filtered_df

Unnamed: 0,id,summarized,from,text,root,length
0,ko0xfy,0,selftext,"To those that had it rough, To the ones that b...",wish,41
1,ko0sa9,0,title,Assassins Creed rekindled,rekindle,3
2,ko0nsh,0,title,What's everyone drinking tonight?,drink,6
3,ko0mjr,1,selftext,I also hope everyone achieves a writhing you p...,hope,10
4,ko0j1p,0,title,I Stand Against All Domestic Abuse,stand,6
...,...,...,...,...,...,...
60256,eibhr2,1,selftext,Here take a hug.,take,5
60257,eibeib,0,title,If you need an excuse to separate from the cro...,use,29
60258,eibbz3,0,title,You know when your dogs comfortable with you w...,know,17
60259,eib70r,0,title,In the month of December alone i got over 2000...,get,13


In [41]:
submission_filtered_df.to_csv('../data/reddit/filtered_q/casual_conv_submissions_{}.csv'.format(date_suffix), index = False)

## Batch Process

In [42]:
date_suffices = ['20190101_20191231', '20180101_20181231', '20170101_20171231',
                 '20160101_20161231', '20150101_20151231', '20140608_20141231']
for date_suffix in date_suffices:
    submission_df = pd.read_csv('../data/reddit/raw/casual_conv_submissions_{}.csv'.format(date_suffix))
    print(submission_df.shape)
    submission_filtered_cols = ['id', 'summarized', 'from', 'text', 'root', 'length']
    submission_filtered_dict = {col: [] for col in submission_filtered_cols}
    for i in tqdm(range(submission_df.shape[0])):
        submission_id = submission_df.iloc[i]['id']
        title = submission_df.iloc[i]['title']
        preprocessed_title = preprocess_raw(title)
        summarized_title = summarize(preprocessed_title, dialog_turn = 1)
        if summarized_title is not None:
            submission_filtered_dict['id'].append(submission_id)
            submission_filtered_dict['summarized'].append(summarized_title['summarized'])
            submission_filtered_dict['from'].append('title')
            submission_filtered_dict['text'].append(summarized_title['text'])
            submission_filtered_dict['root'].append(summarized_title['root'])
            submission_filtered_dict['length'].append(summarized_title['length'])
        else:
            selftext = submission_df.iloc[i]['selftext']
            preprocessed_selftext = preprocess_raw(selftext)
            summarized_selftext = summarize(preprocessed_selftext, dialog_turn = 1)
            if summarized_selftext is not None:
                submission_filtered_dict['id'].append(submission_id)
                submission_filtered_dict['summarized'].append(summarized_selftext['summarized'])
                submission_filtered_dict['from'].append('selftext')
                submission_filtered_dict['text'].append(summarized_selftext['text'])
                submission_filtered_dict['root'].append(summarized_selftext['root'])
                submission_filtered_dict['length'].append(summarized_selftext['length'])
    submission_filtered_df = pd.DataFrame(submission_filtered_dict)
    print(submission_filtered_df.shape)
    submission_filtered_df.to_csv('../data/reddit/filtered_q/casual_conv_submissions_{}.csv'.format(date_suffix), index = False)

(75381, 10)


 13%|██████████████                                                                                              | 9782/75381 [04:20<29:34, 36.97it/s]

EXCEPTION occured


 27%|████████████████████████████▊                                                                              | 20288/75381 [09:25<30:13, 30.38it/s]

EXCEPTION occured


 33%|███████████████████████████████████                                                                        | 24702/75381 [11:28<24:24, 34.61it/s]

EXCEPTION occured


 35%|█████████████████████████████████████▍                                                                     | 26379/75381 [12:15<13:39, 59.77it/s]

EXCEPTION occured


 38%|████████████████████████████████████████▊                                                                  | 28762/75381 [13:16<34:55, 22.25it/s]

EXCEPTION occured


 53%|█████████████████████████████████████████████████████████▏                                                 | 40291/75381 [18:53<15:18, 38.22it/s]

EXCEPTION occured


 55%|██████████████████████████████████████████████████████████▋                                                | 41329/75381 [19:23<11:14, 50.49it/s]

EXCEPTION occured


 73%|█████████████████████████████████████████████████████████████████████████████▋                             | 54715/75381 [25:44<07:13, 47.72it/s]

EXCEPTION occured


 76%|████████████████████████████████████████████████████████████████████████████████▉                          | 57016/75381 [26:48<07:33, 40.49it/s]

EXCEPTION occured


 84%|█████████████████████████████████████████████████████████████████████████████████████████▊                 | 63229/75381 [29:39<03:10, 63.82it/s]

EXCEPTION occured


 93%|███████████████████████████████████████████████████████████████████████████████████████████████████▋       | 70207/75381 [32:46<01:50, 46.88it/s]

EXCEPTION occured


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 75381/75381 [34:59<00:00, 35.90it/s]


(47595, 6)


  submission_df = pd.read_csv('../data/reddit/raw/casual_conv_submissions_{}.csv'.format(date_suffix))


(76633, 10)


  2%|██▌                                                                                                         | 1783/76633 [00:45<28:04, 44.43it/s]

EXCEPTION occured


 29%|███████████████████████████████▍                                                                           | 22489/76633 [10:02<25:07, 35.91it/s]

EXCEPTION occured


 34%|███████████████████████████████████▉                                                                       | 25713/76633 [11:36<25:57, 32.70it/s]

EXCEPTION occured


 39%|██████████████████████████████████████████                                                                 | 30086/76633 [13:38<12:20, 62.88it/s]

EXCEPTION occured


 42%|█████████████████████████████████████████████▍                                                             | 32537/76633 [14:41<17:23, 42.25it/s]

EXCEPTION occured


 45%|████████████████████████████████████████████████                                                           | 34378/76633 [15:30<18:05, 38.91it/s]

EXCEPTION occured


 45%|████████████████████████████████████████████████▎                                                          | 34604/76633 [15:36<17:51, 39.22it/s]

EXCEPTION occured


 47%|██████████████████████████████████████████████████▌                                                        | 36251/76633 [16:18<09:23, 71.68it/s]

EXCEPTION occured


 60%|████████████████████████████████████████████████████████████████▍                                          | 46151/76633 [20:34<21:55, 23.17it/s]

EXCEPTION occured


 74%|██████████████████████████████████████████████████████████████████████████████▊                            | 56463/76633 [25:06<08:16, 40.66it/s]

EXCEPTION occured


 78%|███████████████████████████████████████████████████████████████████████████████████                        | 59510/76633 [26:14<06:53, 41.45it/s]

EXCEPTION occured


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 76633/76633 [32:10<00:00, 39.69it/s]


(47640, 6)
(55543, 10)


 16%|█████████████████▍                                                                                          | 8955/55543 [03:02<23:56, 32.43it/s]

EXCEPTION occured


 61%|████████████████████████████████████████████████████████████████▊                                          | 33665/55543 [10:17<04:00, 90.82it/s]

EXCEPTION occured


 87%|█████████████████████████████████████████████████████████████████████████████████████████████▎             | 48459/55543 [14:54<02:54, 40.70it/s]

EXCEPTION occured


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 55543/55543 [17:06<00:00, 54.12it/s]


(34064, 6)
(35681, 10)


 62%|█████████████████████████████████████████████████████████████████▉                                         | 21974/35681 [05:50<04:16, 53.42it/s]

EXCEPTION occured


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 35681/35681 [09:28<00:00, 62.82it/s]


(20968, 6)
(39868, 10)


 17%|██████████████████▌                                                                                         | 6862/39868 [01:57<12:12, 45.06it/s]

EXCEPTION occured


 51%|██████████████████████████████████████████████████████▌                                                    | 20339/39868 [06:25<04:59, 65.31it/s]

EXCEPTION occured


 51%|██████████████████████████████████████████████████████▉                                                    | 20482/39868 [06:27<03:58, 81.13it/s]

EXCEPTION occured


 61%|█████████████████████████████████████████████████████████████████                                          | 24241/39868 [07:43<04:43, 55.20it/s]

EXCEPTION occured


 65%|█████████████████████████████████████████████████████████████████████▎                                     | 25824/39868 [08:12<03:45, 62.20it/s]

EXCEPTION occured


 68%|█████████████████████████████████████████████████████████████████████████▏                                 | 27251/39868 [08:42<06:06, 34.42it/s]

EXCEPTION occured


 87%|████████████████████████████████████████████████████████████████████████████████████████████▋              | 34525/39868 [11:15<02:12, 40.29it/s]

EXCEPTION occured


 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 37744/39868 [12:29<00:38, 55.83it/s]

EXCEPTION occured


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 39868/39868 [13:18<00:00, 49.90it/s]


(25014, 6)
(16949, 10)


 11%|███████████▌                                                                                                | 1815/16949 [00:41<04:40, 53.90it/s]

EXCEPTION occured


 18%|███████████████████                                                                                         | 2986/16949 [01:07<06:37, 35.13it/s]

EXCEPTION occured


 71%|███████████████████████████████████████████████████████████████████████████▋                               | 11995/16949 [04:56<03:08, 26.25it/s]

EXCEPTION occured


 73%|██████████████████████████████████████████████████████████████████████████████▎                            | 12409/16949 [05:06<02:16, 33.18it/s]

EXCEPTION occured


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 16949/16949 [06:48<00:00, 41.46it/s]

(10781, 6)





## Check Questions

In [32]:
questions = []
for i in tqdm(range(submission_filtered_df.shape[0])):
    if submission_filtered_df.iloc[i]['text'].endswith('?'):
        questions.append(submission_filtered_df.iloc[i]['text'])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 8470/8470 [00:00<00:00, 9562.37it/s]


In [33]:
questions[:10]

['How do you deal with not having family or any close relationships?',
 'Why do I intuitively find this surprising?',
 'How will lack of romantic relationship in your teen and young adult years affect you in the long run?',
 'How will lack of romantic relationship in your teen and young adult years affect you in the long run?',
 'What would teenage you think about who you are now?',
 "Let's chat?",
 'Why do Americans think adults living with parents is bad?',
 "Do you guys feel like life's not that much fun anymore?",
 "Why do schools teach things you don't need to know like algebra?",
 "How can I make sure my mom doesn't feel lonely?"]

# Filter Comments

In [29]:
comment_df = pd.read_csv('../data/reddit/raw/casual_conv_comments_{}.csv'.format(date_suffix))
comment_df.shape

(311185, 9)

In [30]:
comment_df.head()

Unnamed: 0,id,link_id,parent_id,created_utc,body,subreddit,subreddit_id,score,author_fullname
0,hm5iqwj,t3_r2nmvy,t3_r2nmvy,1637933885,"Yeah, they’re fun! Conscious Club is campy in ...",CasualConversation,t5_323oy,1.0,t2_5yhjayls
1,hm5iup9,t3_r2nlu7,t3_r2nlu7,1637933947,"Sounds like fate, but also disgusting. I hope ...",CasualConversation,t5_323oy,62.0,t2_g7sdcvpm
2,hm5iuth,t3_r2nmvy,t1_hm5iqwj,1637933949,I know!,CasualConversation,t5_323oy,1.0,t2_dvd2sbpi
3,hm5iw7v,t3_r2nlu7,t3_r2nlu7,1637933973,That is an amazing story and even bigger coinc...,CasualConversation,t5_323oy,18.0,t2_xeg58lg
4,hm5j61n,t3_r2nlu7,t1_hm5iup9,1637934133,He’s a doctor now so I hope he’s grown out of ...,CasualConversation,t5_323oy,66.0,t2_5t1dsc17


In [31]:
submission_filtered_df = pd.read_csv('../data/reddit/filtered/casual_conv_submissions_{}.csv'.format(date_suffix))
submission_filtered_ids = submission_filtered_df['id'].tolist()
submission_filtered_ids = ['t3_' + x for x in submission_filtered_ids]

In [32]:
comment_df = comment_df[comment_df['parent_id'].isin(submission_filtered_ids)]
comment_df.shape

(85598, 9)

In [33]:
comment_filtered_cols = ['id', 'parent_id', 'summarized', 'text', 'length']
comment_filtered_dict = {col: [] for col in comment_filtered_cols}

In [34]:
for i in tqdm(range(comment_df.shape[0])):
    comment_id = comment_df.iloc[i]['id']
    parent_id = comment_df.iloc[i]['parent_id']
    body = comment_df.iloc[i]['body']
    preprocessed_body = preprocess_raw(body)
    summarized_body = summarize(preprocessed_body, dialog_turn = 2)
    if summarized_body is not None:
        comment_filtered_dict['id'].append(comment_id)
        comment_filtered_dict['parent_id'].append(parent_id)
        comment_filtered_dict['summarized'].append(summarized_body['summarized'])
        comment_filtered_dict['text'].append(summarized_body['text'])
        comment_filtered_dict['length'].append(summarized_body['length'])

  2%|██▌                                                                                                         | 2051/85598 [00:39<19:59, 69.65it/s]

EXCEPTION occured


 10%|██████████▉                                                                                               | 8871/85598 [03:10<1:39:55, 12.80it/s]

EXCEPTION occured


 12%|█████████████                                                                                              | 10498/85598 [04:05<25:35, 48.89it/s]

EXCEPTION occured


 18%|███████████████████▏                                                                                       | 15392/85598 [05:47<22:41, 51.58it/s]

EXCEPTION occured


 21%|██████████████████████▋                                                                                    | 18192/85598 [06:53<21:34, 52.07it/s]

EXCEPTION occured


 25%|██████████████████████████▉                                                                                | 21590/85598 [07:55<26:06, 40.85it/s]

EXCEPTION occured


 41%|███████████████████████████████████████████▍                                                               | 34740/85598 [12:37<31:27, 26.94it/s]

EXCEPTION occured


 48%|███████████████████████████████████████████████████▌                                                       | 41224/85598 [15:03<16:53, 43.76it/s]

EXCEPTION occured


 51%|██████████████████████████████████████████████████████▌                                                    | 43652/85598 [16:03<14:02, 49.81it/s]

EXCEPTION occured


 55%|██████████████████████████████████████████████████████████▋                                                | 46939/85598 [17:14<22:37, 28.48it/s]

EXCEPTION occured


 77%|██████████████████████████████████████████████████████████████████████████████████▋                        | 66111/85598 [24:19<06:49, 47.62it/s]

EXCEPTION occured


 81%|██████████████████████████████████████████████████████████████████████████████████████▏                    | 68914/85598 [25:24<07:33, 36.79it/s]

EXCEPTION occured


 82%|███████████████████████████████████████████████████████████████████████████████████████▍                   | 69986/85598 [25:55<09:54, 26.24it/s]

EXCEPTION occured


 88%|█████████████████████████████████████████████████████████████████████████████████████████████▊             | 75077/85598 [28:21<05:32, 31.67it/s]

EXCEPTION occured


 88%|██████████████████████████████████████████████████████████████████████████████████████████████▌            | 75680/85598 [28:39<05:12, 31.78it/s]

EXCEPTION occured


 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████    | 82410/85598 [31:08<01:09, 46.06it/s]

EXCEPTION occured


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 85598/85598 [32:13<00:00, 44.27it/s]


In [35]:
comment_filtered_df = pd.DataFrame(comment_filtered_dict)

In [36]:
comment_filtered_df

Unnamed: 0,id,parent_id,summarized,text,length
0,hm5jftd,t3_r2npa3,0,I take one to two hours a day where I listen t...,24
1,hm5kl30,t3_r2npa3,0,Walking in the countryside withan audiobook.,7
2,hm5klmc,t3_r2nz4v,0,I look outside my window and ask myself this a...,13
3,hm5l7rv,t3_r2npa3,1,I completed a course in Transcendental Meditat...,22
4,hm5q97a,t3_r2nz4v,1,Movies and Video games require thousands of pe...,14
...,...,...,...,...,...
53689,gxn9pkb,t3_n8se6e,1,Good luck!,3
53690,gxnbhpi,t3_n8se6e,0,It's really hard to get work right now so well...,14
53691,gxnxjei,t3_n8se6e,1,Glad you found work.,5
53692,gxocvzh,t3_n8sdhl,1,"I've never lived anywhere with fireflies, and ...",18


In [37]:
comment_filtered_df.to_csv('../data/reddit/filtered/casual_conv_comments_{}.csv'.format(date_suffix), index = False)

## Batch Process

In [43]:
date_suffices = ['20210501_20211231', '20210101_20210430', '20200101_20201231',
                 '20190101_20191231', '20180101_20181231', '20170101_20171231',
                 '20160101_20161231']
for date_suffix in date_suffices:
    comment_df = pd.read_csv('../data/reddit/raw/casual_conv_comments_{}.csv'.format(date_suffix))
    print(comment_df.shape)
    submission_filtered_df = pd.read_csv('../data/reddit/filtered_q/casual_conv_submissions_{}.csv'.format(date_suffix))
    submission_filtered_ids = submission_filtered_df['id'].tolist()
    submission_filtered_ids = ['t3_' + x for x in submission_filtered_ids]
    comment_df = comment_df[comment_df['parent_id'].isin(submission_filtered_ids)]
    print(comment_df.shape)
    comment_filtered_cols = ['id', 'parent_id', 'summarized', 'text', 'length']
    comment_filtered_dict = {col: [] for col in comment_filtered_cols}
    for i in tqdm(range(comment_df.shape[0])):
        comment_id = comment_df.iloc[i]['id']
        parent_id = comment_df.iloc[i]['parent_id']
        body = comment_df.iloc[i]['body']
        preprocessed_body = preprocess_raw(body)
        summarized_body = summarize(preprocessed_body, dialog_turn = 2)
        if summarized_body is not None:
            comment_filtered_dict['id'].append(comment_id)
            comment_filtered_dict['parent_id'].append(parent_id)
            comment_filtered_dict['summarized'].append(summarized_body['summarized'])
            comment_filtered_dict['text'].append(summarized_body['text'])
            comment_filtered_dict['length'].append(summarized_body['length'])
    comment_filtered_df = pd.DataFrame(comment_filtered_dict)
    print(comment_filtered_df)
    comment_filtered_df.to_csv('../data/reddit/filtered_q/casual_conv_comments_{}.csv'.format(date_suffix), index = False)

(311185, 9)
(115319, 9)


  2%|██▍                                                                                                        | 2650/115319 [00:52<28:24, 66.12it/s]

EXCEPTION occured


 10%|██████████▎                                                                                             | 11370/115319 [04:10<2:10:01, 13.32it/s]

EXCEPTION occured


 10%|███████████                                                                                               | 12058/115319 [04:37<34:49, 49.41it/s]

EXCEPTION occured


 12%|████████████▍                                                                                             | 13561/115319 [05:17<29:38, 57.23it/s]

EXCEPTION occured


 18%|███████████████████▏                                                                                      | 20823/115319 [07:56<47:15, 33.33it/s]

EXCEPTION occured


 21%|██████████████████████▍                                                                                   | 24469/115319 [09:20<26:54, 56.27it/s]

EXCEPTION occured


 25%|██████████████████████████                                                                                | 28388/115319 [10:31<27:24, 52.87it/s]

EXCEPTION occured


 38%|████████████████████████████████████████▌                                                                 | 44163/115319 [15:16<43:42, 27.13it/s]

EXCEPTION occured


 45%|███████████████████████████████████████████████▉                                                          | 52169/115319 [18:19<36:17, 29.00it/s]

EXCEPTION occured


 52%|██████████████████████████████████████████████████████▉                                                   | 59708/115319 [21:04<22:48, 40.64it/s]

EXCEPTION occured


 54%|█████████████████████████████████████████████████████████▌                                                | 62654/115319 [22:13<19:33, 44.86it/s]

EXCEPTION occured


 58%|█████████████████████████████████████████████████████████████▏                                            | 66572/115319 [23:36<28:25, 28.57it/s]

EXCEPTION occured


 79%|███████████████████████████████████████████████████████████████████████████████████▎                      | 90653/115319 [32:20<08:09, 50.40it/s]

EXCEPTION occured


 82%|██████████████████████████████████████████████████████████████████████████████████████▍                   | 94025/115319 [33:36<07:06, 49.90it/s]

EXCEPTION occured


 83%|███████████████████████████████████████████████████████████████████████████████████████▌                  | 95198/115319 [34:07<10:36, 31.61it/s]

EXCEPTION occured


 88%|████████████████████████████████████████████████████████████████████████████████████████████             | 101055/115319 [36:46<06:09, 38.56it/s]

EXCEPTION occured


 88%|████████████████████████████████████████████████████████████████████████████████████████████▌            | 101718/115319 [37:04<06:42, 33.78it/s]

EXCEPTION occured


 93%|█████████████████████████████████████████████████████████████████████████████████████████████████▊       | 107364/115319 [39:17<03:34, 37.10it/s]

EXCEPTION occured


 93%|█████████████████████████████████████████████████████████████████████████████████████████████████▊       | 107380/115319 [39:17<03:03, 43.15it/s]

EXCEPTION occured


 96%|████████████████████████████████████████████████████████████████████████████████████████████████████▊    | 110761/115319 [40:28<01:20, 56.47it/s]

EXCEPTION occured


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 115319/115319 [42:05<00:00, 45.67it/s]


            id  parent_id  summarized  \
0      hm5iqwj  t3_r2nmvy           1   
1      hm5jftd  t3_r2npa3           0   
2      hm5kaqv  t3_r2nmvy           0   
3      hm5kddu  t3_r2nylj           0   
4      hm5kig9  t3_r2nylj           1   
...        ...        ...         ...   
73072  gxxue6y  t3_n8vd4o           1   
73073  gxxw0kr  t3_n8vd4o           1   
73074  gxxyb0d  t3_n8vd4o           0   
73075  gxy5eus  t3_n8vd4o           1   
73076  gxyb84x  t3_n8vd4o           1   

                                                    text  length  
0                  Conscious Club is campy in a good way       8  
1      I take one to two hours a day where I listen t...      24  
2                                  I like Scary Pockets.       5  
3       what keeps me calm is my prescription medication       8  
4                         Wow you're amazingly talented.       6  
...                                                  ...     ...  
73072                                 

  7%|███████▌                                                                                                    | 2991/42727 [01:03<10:52, 60.88it/s]

EXCEPTION occured


 38%|████████████████████████████████████████▏                                                                  | 16035/42727 [05:53<13:50, 32.14it/s]

EXCEPTION occured


 49%|███████████████████████████████████████████████████▉                                                       | 20741/42727 [07:43<06:45, 54.22it/s]

EXCEPTION occured


 63%|██████████████████████████████████████████████████████████████████▉                                        | 26748/42727 [10:02<06:39, 40.04it/s]

EXCEPTION occured


 70%|██████████████████████████████████████████████████████████████████████████▉                                | 29939/42727 [11:19<04:32, 46.85it/s]

EXCEPTION occured


 88%|██████████████████████████████████████████████████████████████████████████████████████████████▎            | 37671/42727 [14:17<01:34, 53.28it/s]

EXCEPTION occured


 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████      | 40356/42727 [15:13<01:00, 39.44it/s]

EXCEPTION occured


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 42727/42727 [16:06<00:00, 44.20it/s]


            id  parent_id  summarized  \
0      gwf045x  t3_n1sfco           0   
1      gwf5mh6  t3_n1sfco           0   
2      gwfb4d6  t3_n1tcdm           1   
3      gwfbai3  t3_n1ty7y           1   
4      gwfc9tc  t3_n1udg9           0   
...        ...        ...         ...   
28275  ghsyhye  t3_ko5lhn           0   
28276  ght38a3  t3_ko8gtg           1   
28277  ghtc1uk  t3_ko6ed4           1   
28278  ghu1o5x  t3_ko43wi           1   
28279  ghuln2t  t3_ko6ed4           1   

                                                    text  length  
0      yep I accidentally used my ex toothbrush a cou...      17  
1                                           She be aight       3  
2                 One way or the another it gets leaked.       9  
3      Kind of like how the Internet makes it so easy...      35  
4                  I hear lemon essence or vinegar helps       7  
...                                                  ...     ...  
28275                                 

  1%|▊                                                                                                        | 2266/296980 [00:47<1:42:10, 48.08it/s]

EXCEPTION occured


  1%|█▏                                                                                                       | 3403/296980 [01:10<1:08:26, 71.49it/s]

EXCEPTION occured


  3%|██▋                                                                                                      | 7683/296980 [02:52<3:43:54, 21.53it/s]

EXCEPTION occured


  3%|███▍                                                                                                     | 9647/296980 [03:29<1:49:46, 43.63it/s]

EXCEPTION occured


  3%|███▍                                                                                                     | 9735/296980 [03:31<1:41:54, 46.98it/s]

EXCEPTION occured


  4%|████▏                                                                                                   | 11935/296980 [04:18<2:47:38, 28.34it/s]

EXCEPTION occured


  8%|███████▉                                                                                                | 22830/296980 [08:26<1:35:23, 47.90it/s]

EXCEPTION occured


 10%|██████████▊                                                                                             | 30824/296980 [11:26<1:56:52, 37.95it/s]

EXCEPTION occured


 12%|████████████▏                                                                                           | 34806/296980 [13:02<2:00:48, 36.17it/s]

EXCEPTION occured


 12%|████████████▋                                                                                           | 36398/296980 [13:40<1:36:46, 44.87it/s]

EXCEPTION occured


 15%|███████████████▊                                                                                        | 45063/296980 [16:48<1:20:10, 52.37it/s]

EXCEPTION occured


 16%|████████████████▊                                                                                       | 48166/296980 [17:54<1:13:36, 56.34it/s]

EXCEPTION occured


 17%|█████████████████▉                                                                                      | 51098/296980 [18:57<1:02:17, 65.79it/s]

EXCEPTION occured


 20%|████████████████████▍                                                                                   | 58531/296980 [21:39<2:05:00, 31.79it/s]

EXCEPTION occured


 20%|████████████████████▋                                                                                   | 59217/296980 [21:54<1:59:28, 33.17it/s]

EXCEPTION occured


 21%|█████████████████████▌                                                                                  | 61448/296980 [22:47<1:18:14, 50.18it/s]

EXCEPTION occured


 22%|██████████████████████▉                                                                                 | 65449/296980 [24:03<1:19:28, 48.56it/s]

EXCEPTION occured


 23%|████████████████████████▎                                                                                 | 68077/296980 [24:57<53:59, 70.65it/s]

EXCEPTION occured


 24%|████████████████████████▌                                                                               | 70087/296980 [25:43<1:24:10, 44.92it/s]

EXCEPTION occured


 26%|███████████████████████████▌                                                                            | 78616/296980 [28:32<1:22:41, 44.01it/s]

EXCEPTION occured


 27%|███████████████████████████▋                                                                            | 79202/296980 [28:47<1:23:40, 43.38it/s]

EXCEPTION occured


 29%|██████████████████████████████▎                                                                         | 86639/296980 [31:30<1:47:33, 32.59it/s]

EXCEPTION occured


 30%|███████████████████████████████▍                                                                        | 89738/296980 [32:40<1:34:25, 36.58it/s]

EXCEPTION occured


 32%|█████████████████████████████████▏                                                                      | 94899/296980 [34:27<1:23:14, 40.46it/s]

EXCEPTION occured


 33%|██████████████████████████████████                                                                      | 97114/296980 [35:15<1:04:54, 51.32it/s]

EXCEPTION occured


 33%|███████████████████████████████████                                                                       | 98332/296980 [35:37<55:12, 59.97it/s]

EXCEPTION occured


 33%|███████████████████████████████████▍                                                                      | 99143/296980 [35:54<44:50, 73.54it/s]

EXCEPTION occured


 34%|███████████████████████████████████                                                                    | 101057/296980 [36:36<1:11:04, 45.95it/s]

EXCEPTION occured


 37%|█████████████████████████████████████▉                                                                 | 109563/296980 [39:57<1:05:32, 47.66it/s]

EXCEPTION occured


 40%|█████████████████████████████████████████▉                                                               | 118464/296980 [43:20<48:23, 61.49it/s]

EXCEPTION occured


 41%|███████████████████████████████████████████▍                                                             | 122879/296980 [44:57<39:44, 73.00it/s]

EXCEPTION occured


 42%|████████████████████████████████████████████▌                                                            | 125958/296980 [46:07<59:03, 48.27it/s]

EXCEPTION occured


 46%|████████████████████████████████████████████████▋                                                        | 137662/296980 [49:19<45:00, 59.01it/s]

EXCEPTION occured


 54%|█████████████████████████████████████████████████████████                                                | 161360/296980 [56:31<44:04, 51.29it/s]

EXCEPTION occured


 54%|█████████████████████████████████████████████████████████                                                | 161419/296980 [56:32<40:31, 55.74it/s]

EXCEPTION occured


 57%|██████████████████████████████████████████████████████████▋                                            | 169132/296980 [59:22<1:10:36, 30.18it/s]

EXCEPTION occured
EXCEPTION occured
EXCEPTION occured


 63%|████████████████████████████████████████████████████████████████▋                                      | 186499/296980 [1:05:39<32:35, 56.51it/s]

EXCEPTION occured


 64%|██████████████████████████████████████████████████████████████████▏                                    | 190696/296980 [1:07:13<47:29, 37.30it/s]

EXCEPTION occured


 65%|███████████████████████████████████████████████████████████████████▏                                   | 193890/296980 [1:08:30<35:01, 49.06it/s]

EXCEPTION occured


 68%|█████████████████████████████████████████████████████████████████████▊                                 | 201248/296980 [1:11:14<34:10, 46.68it/s]

EXCEPTION occured


 69%|██████████████████████████████████████████████████████████████████████▌                               | 205280/296980 [1:12:31<13:38, 112.03it/s]

EXCEPTION occured


 78%|████████████████████████████████████████████████████████████████████████████████                       | 230842/296980 [1:21:21<25:56, 42.50it/s]

EXCEPTION occured


 82%|████████████████████████████████████████████████████████████████████████████████████                   | 242440/296980 [1:25:27<16:07, 56.38it/s]

EXCEPTION occured


 82%|████████████████████████████████████████████████████████████████████████████████████▏                  | 242645/296980 [1:25:31<18:15, 49.59it/s]

EXCEPTION occured


 83%|█████████████████████████████████████████████████████████████████████████████████████▏                 | 245527/296980 [1:26:30<13:22, 64.12it/s]

EXCEPTION occured


 83%|█████████████████████████████████████████████████████████████████████████████████████▎                 | 246087/296980 [1:26:40<16:05, 52.71it/s]

EXCEPTION occured


 83%|█████████████████████████████████████████████████████████████████████████████████████▌                 | 246673/296980 [1:26:52<12:18, 68.14it/s]

EXCEPTION occured


 83%|█████████████████████████████████████████████████████████████████████████████████████▊                 | 247322/296980 [1:27:06<23:08, 35.75it/s]

EXCEPTION occured


 84%|██████████████████████████████████████████████████████████████████████████████████████▌                | 249538/296980 [1:27:58<33:24, 23.67it/s]

EXCEPTION occured


 86%|████████████████████████████████████████████████████████████████████████████████████████▊              | 256247/296980 [1:30:15<12:35, 53.90it/s]

EXCEPTION occured


 93%|███████████████████████████████████████████████████████████████████████████████████████████████▌       | 275711/296980 [1:37:19<06:03, 58.46it/s]

EXCEPTION occured


 97%|███████████████████████████████████████████████████████████████████████████████████████████████████▍   | 286839/296980 [1:41:11<03:53, 43.40it/s]

EXCEPTION occured


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 296980/296980 [1:44:59<00:00, 47.14it/s]


             id  parent_id  summarized  \
0       ghm0f6y  t3_knrh1n           0   
1       ghm5z0i  t3_kns4dy           1   
2       ghm6vjh  t3_kns4dy           0   
3       ghm6z42  t3_knsp11           1   
4       ghm7hon  t3_knsp11           0   
...         ...        ...         ...   
194406  fcpa4r8  t3_eicgrr           1   
194407  fcpbln7  t3_eidode           1   
194408  fcpbudl  t3_eicskc           1   
194409  fcpwter  t3_eid0ht           1   
194410  fcsox0q  t3_eidiyb           0   

                                                     text  length  
0       2020 was the only way that you would be doing ...      19  
1       I personally am excited for the new year becau...      21  
2                 I just wanna travel internationally lol       6  
3                                     Still waiting here.       4  
4                                         Happy new year!       4  
...                                                   ...     ...  
194406  Fuck those 

  1%|▉                                                                                                        | 2099/250599 [00:42<1:22:03, 50.47it/s]

EXCEPTION occured


  2%|█▋                                                                                                         | 4059/250599 [01:17<51:03, 80.48it/s]

EXCEPTION occured


  3%|███▌                                                                                                     | 8540/250599 [03:11<1:52:05, 35.99it/s]

EXCEPTION occured


  4%|███▋                                                                                                     | 8919/250599 [03:24<1:14:27, 54.10it/s]

EXCEPTION occured


  4%|████▍                                                                                                   | 10723/250599 [04:00<1:58:32, 33.73it/s]

EXCEPTION occured


  8%|███████▉                                                                                                | 19033/250599 [06:52<1:05:25, 58.99it/s]

EXCEPTION occured


  8%|████████▍                                                                                               | 20448/250599 [07:28<1:05:00, 59.00it/s]

EXCEPTION occured


  8%|████████▋                                                                                               | 20877/250599 [07:37<1:19:55, 47.91it/s]

EXCEPTION occured


 10%|██████████▊                                                                                             | 25942/250599 [09:28<1:06:01, 56.71it/s]

EXCEPTION occured


 11%|███████████▍                                                                                            | 27431/250599 [09:59<1:01:53, 60.10it/s]

EXCEPTION occured


 15%|███████████████▌                                                                                        | 37422/250599 [13:31<1:13:21, 48.43it/s]

EXCEPTION occured


 16%|████████████████▌                                                                                       | 39964/250599 [14:22<1:10:16, 49.96it/s]

EXCEPTION occured


 17%|█████████████████▌                                                                                      | 42304/250599 [15:06<1:04:19, 53.97it/s]

EXCEPTION occured


 19%|████████████████████▏                                                                                   | 48563/250599 [17:27<1:44:41, 32.17it/s]

EXCEPTION occured


 20%|█████████████████████▏                                                                                    | 50156/250599 [17:59<58:26, 57.16it/s]

EXCEPTION occured


 23%|████████████████████████                                                                                | 57894/250599 [20:52<1:13:58, 43.42it/s]

EXCEPTION occured


 25%|█████████████████████████▉                                                                                | 61463/250599 [22:11<56:55, 55.38it/s]

EXCEPTION occured


 29%|██████████████████████████████▏                                                                         | 72778/250599 [26:20<1:56:15, 25.49it/s]

EXCEPTION occured


 30%|███████████████████████████████▉                                                                          | 75374/250599 [27:17<52:17, 55.86it/s]

EXCEPTION occured


 38%|███████████████████████████████████████▏                                                                | 94559/250599 [34:11<1:03:42, 40.82it/s]

EXCEPTION occured


 40%|██████████████████████████████████████████▏                                                               | 99884/250599 [36:15<55:02, 45.63it/s]

EXCEPTION occured


 47%|█████████████████████████████████████████████████▊                                                       | 118822/250599 [43:38<51:59, 42.25it/s]

EXCEPTION occured


 50%|███████████████████████████████████████████████████▌                                                   | 125492/250599 [45:49<1:27:31, 23.82it/s]

EXCEPTION occured


 54%|████████████████████████████████████████████████████████▍                                                | 134702/250599 [49:04<31:37, 61.09it/s]

EXCEPTION occured


 57%|████████████████████████████████████████████████████████████▏                                            | 143795/250599 [52:35<45:01, 39.54it/s]

EXCEPTION occured


 58%|████████████████████████████████████████████████████████████▉                                            | 145418/250599 [53:26<42:35, 41.17it/s]

EXCEPTION occured


 59%|████████████████████████████████████████████████████████████▎                                          | 146726/250599 [54:06<1:25:54, 20.15it/s]

EXCEPTION occured


 61%|████████████████████████████████████████████████████████████████                                         | 152971/250599 [56:24<29:38, 54.89it/s]

EXCEPTION occured


 64%|██████████████████████████████████████████████████████████████████▊                                      | 159378/250599 [58:48<24:18, 62.55it/s]

EXCEPTION occured


 67%|████████████████████████████████████████████████████████████████████▋                                  | 167072/250599 [1:01:52<28:16, 49.24it/s]

EXCEPTION occured


 68%|█████████████████████████████████████████████████████████████████████▊                                 | 169703/250599 [1:02:51<21:36, 62.42it/s]

EXCEPTION occured


 69%|██████████████████████████████████████████████████████████████████████▊                                | 172136/250599 [1:03:45<25:53, 50.51it/s]

EXCEPTION occured


 71%|████████████████████████████████████████████████████████████████████████▉                              | 177366/250599 [1:05:53<19:43, 61.90it/s]

EXCEPTION occured


 75%|████████████████████████████████████████████████████████████████████████████▉                          | 187280/250599 [1:10:03<19:27, 54.21it/s]

EXCEPTION occured


 76%|██████████████████████████████████████████████████████████████████████████████▌                        | 191087/250599 [1:11:36<25:35, 38.76it/s]

EXCEPTION occured


 78%|████████████████████████████████████████████████████████████████████████████████                       | 194718/250599 [1:13:06<24:12, 38.47it/s]

EXCEPTION occured


 80%|██████████████████████████████████████████████████████████████████████████████████                     | 199624/250599 [1:14:57<31:44, 26.76it/s]

EXCEPTION occured


 80%|██████████████████████████████████████████████████████████████████████████████████▊                    | 201446/250599 [1:15:40<16:50, 48.66it/s]

EXCEPTION occured


 83%|█████████████████████████████████████████████████████████████████████████████████████▊                 | 208726/250599 [1:18:33<14:58, 46.58it/s]

EXCEPTION occured


 86%|████████████████████████████████████████████████████████████████████████████████████████▏              | 214415/250599 [1:20:38<16:56, 35.61it/s]

EXCEPTION occured


 88%|██████████████████████████████████████████████████████████████████████████████████████████▊            | 220968/250599 [1:22:52<10:18, 47.92it/s]

EXCEPTION occured


 90%|████████████████████████████████████████████████████████████████████████████████████████████▍          | 224842/250599 [1:24:15<14:09, 30.33it/s]

EXCEPTION occured


 90%|█████████████████████████████████████████████████████████████████████████████████████████████▏         | 226769/250599 [1:24:55<16:42, 23.77it/s]

EXCEPTION occured


 91%|█████████████████████████████████████████████████████████████████████████████████████████████▍         | 227233/250599 [1:25:05<06:27, 60.27it/s]

EXCEPTION occured


 91%|█████████████████████████████████████████████████████████████████████████████████████████████▉         | 228655/250599 [1:25:37<06:27, 56.68it/s]

EXCEPTION occured


 92%|██████████████████████████████████████████████████████████████████████████████████████████████▍        | 229895/250599 [1:26:07<06:31, 52.87it/s]

EXCEPTION occured


 92%|██████████████████████████████████████████████████████████████████████████████████████████████▉        | 231112/250599 [1:26:31<07:18, 44.43it/s]

EXCEPTION occured


 96%|██████████████████████████████████████████████████████████████████████████████████████████████████▊    | 240324/250599 [1:29:57<03:53, 43.96it/s]

EXCEPTION occured


 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 246432/250599 [1:32:07<01:58, 35.10it/s]

EXCEPTION occured


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 250599/250599 [1:33:41<00:00, 44.58it/s]


             id  parent_id  summarized  \
0       fco4eej  t3_ei8adu           0   
1       fco4k05  t3_ei8b9x           0   
2       fco4pcc  t3_ei8b9x           0   
3       fco4rd4  t3_ei8b9x           0   
4       fco580r  t3_ei8b9x           0   
...         ...        ...         ...   
165939  eczmpig  t3_abd2kg           1   
165940  eczo6x4  t3_abdlxy           1   
165941  eczojeb  t3_abd9m2           0   
165942  ed0j6ui  t3_abd9m2           1   
165943  ed27b08  t3_abdlxy           0   

                                                     text  length  
0       Lol I feel you I only eat like once a day on t...      36  
1       The Sims 3 and/or The Sims 4 are games I highl...      13  
2                    Have you had a go at Stardew Valley?       9  
3                                          Hollow Knight.       3  
4                                  Fallout is pretty good       4  
...                                                   ...     ...  
165939             

  comment_df = pd.read_csv('../data/reddit/raw/casual_conv_comments_{}.csv'.format(date_suffix))


(1051149, 10)
(303421, 10)


  0%|▏                                                                                                         | 681/303421 [00:13<1:58:55, 42.43it/s]

EXCEPTION occured


  0%|▌                                                                                                        | 1451/303421 [00:30<2:18:43, 36.28it/s]

EXCEPTION occured


  2%|██▏                                                                                                      | 6304/303421 [02:17<2:09:53, 38.12it/s]

EXCEPTION occured


  2%|██▌                                                                                                      | 7518/303421 [02:48<2:14:44, 36.60it/s]

EXCEPTION occured


  3%|███                                                                                                      | 8931/303421 [03:24<2:06:18, 38.86it/s]

EXCEPTION occured


  4%|███▊                                                                                                    | 11052/303421 [04:16<2:31:12, 32.23it/s]

EXCEPTION occured


  8%|███████▊                                                                                                | 22773/303421 [08:40<1:18:30, 59.58it/s]

EXCEPTION occured


  8%|████████▋                                                                                               | 25432/303421 [09:36<1:55:38, 40.07it/s]

EXCEPTION occured


  9%|█████████▌                                                                                              | 27984/303421 [10:29<1:12:10, 63.60it/s]

EXCEPTION occured


 12%|████████████                                                                                            | 35209/303421 [13:16<1:58:21, 37.77it/s]

EXCEPTION occured


 14%|██████████████▎                                                                                         | 41725/303421 [15:49<1:23:44, 52.09it/s]

EXCEPTION occured


 20%|█████████████████████                                                                                     | 60178/303421 [23:03<53:04, 76.38it/s]

EXCEPTION occured


 22%|██████████████████████▊                                                                                 | 66610/303421 [25:33<1:28:59, 44.35it/s]

EXCEPTION occured


 25%|██████████████████████████                                                                              | 76145/303421 [28:59<1:32:26, 40.97it/s]

EXCEPTION occured


 27%|████████████████████████████▌                                                                           | 83371/303421 [31:51<1:06:13, 55.37it/s]

EXCEPTION occured


 28%|█████████████████████████████▎                                                                          | 85493/303421 [32:41<1:09:30, 52.26it/s]

EXCEPTION occured


 32%|█████████████████████████████████▏                                                                      | 96942/303421 [37:07<1:21:43, 42.11it/s]

EXCEPTION occured


 33%|██████████████████████████████████                                                                     | 100479/303421 [38:40<1:13:57, 45.73it/s]

EXCEPTION occured


 35%|███████████████████████████████████▌                                                                   | 104751/303421 [40:33<1:34:39, 34.98it/s]

EXCEPTION occured


 39%|███████████████████████████████████████▊                                                               | 117159/303421 [46:08<1:27:35, 35.44it/s]

EXCEPTION occured


 39%|████████████████████████████████████████▍                                                              | 119171/303421 [47:00<1:58:45, 25.86it/s]

EXCEPTION occured


 40%|█████████████████████████████████████████                                                              | 120853/303421 [47:44<1:33:28, 32.55it/s]

EXCEPTION occured


 40%|█████████████████████████████████████████▍                                                             | 122046/303421 [48:21<1:41:46, 29.70it/s]

EXCEPTION occured


 41%|██████████████████████████████████████████▏                                                            | 124261/303421 [49:13<1:11:54, 41.53it/s]

EXCEPTION occured


 42%|███████████████████████████████████████████▊                                                             | 126568/303421 [50:14<59:06, 49.87it/s]

EXCEPTION occured


 42%|███████████████████████████████████████████▏                                                           | 127213/303421 [50:32<1:03:58, 45.90it/s]

EXCEPTION occured


 42%|████████████████████████████████████████████▏                                                            | 127703/303421 [50:44<57:59, 50.50it/s]

EXCEPTION occured


 43%|███████████████████████████████████████████▉                                                           | 129298/303421 [51:22<1:11:02, 40.85it/s]

EXCEPTION occured


 43%|████████████████████████████████████████████                                                           | 129902/303421 [51:39<3:04:37, 15.66it/s]

EXCEPTION occured


 44%|█████████████████████████████████████████████▏                                                         | 133163/303421 [53:07<2:14:59, 21.02it/s]

EXCEPTION occured


 44%|█████████████████████████████████████████████▍                                                         | 133745/303421 [53:23<1:10:10, 40.30it/s]

EXCEPTION occured


 45%|██████████████████████████████████████████████▎                                                        | 136427/303421 [54:33<1:36:10, 28.94it/s]

EXCEPTION occured


 47%|█████████████████████████████████████████████████▍                                                       | 142875/303421 [57:18<55:36, 48.12it/s]

EXCEPTION occured


 49%|█████████████████████████████████████████████████▋                                                   | 149165/303421 [1:00:06<1:10:27, 36.49it/s]

EXCEPTION occured


 51%|███████████████████████████████████████████████████▊                                                 | 155738/303421 [1:02:47<1:10:01, 35.15it/s]

EXCEPTION occured


 51%|████████████████████████████████████████████████████▊                                                  | 155750/303421 [1:02:47<56:09, 43.82it/s]

EXCEPTION occured


 54%|██████████████████████████████████████████████████████▏                                              | 162911/303421 [1:06:00<1:09:57, 33.47it/s]

EXCEPTION occured


 54%|██████████████████████████████████████████████████████▊                                              | 164631/303421 [1:06:45<1:00:09, 38.45it/s]

EXCEPTION occured


 59%|███████████████████████████████████████████████████████████▍                                         | 178646/303421 [1:12:45<1:15:33, 27.52it/s]

EXCEPTION occured


 60%|████████████████████████████████████████████████████████████▏                                        | 180930/303421 [1:13:44<1:08:15, 29.91it/s]

EXCEPTION occured


 60%|█████████████████████████████████████████████████████████████▍                                         | 180972/303421 [1:13:45<40:38, 50.22it/s]

EXCEPTION occured


 61%|██████████████████████████████████████████████████████████████▉                                        | 185477/303421 [1:15:40<38:38, 50.88it/s]

EXCEPTION occured


 62%|████████████████████████████████████████████████████████████████▎                                      | 189414/303421 [1:17:12<50:47, 37.42it/s]

EXCEPTION occured


 63%|████████████████████████████████████████████████████████████████▌                                      | 190277/303421 [1:17:36<37:53, 49.77it/s]

EXCEPTION occured


 67%|████████████████████████████████████████████████████████████████████▊                                  | 202644/303421 [1:22:24<30:58, 54.22it/s]

EXCEPTION occured


 67%|████████████████████████████████████████████████████████████████████▉                                  | 202990/303421 [1:22:34<53:35, 31.24it/s]

EXCEPTION occured


 69%|███████████████████████████████████████████████████████████████████████▎                               | 210069/303421 [1:25:37<47:20, 32.87it/s]

EXCEPTION occured


 71%|████████████████████████████████████████████████████████████████████████▌                              | 213915/303421 [1:27:15<34:59, 42.64it/s]

EXCEPTION occured


 71%|███████████████████████████████████████████████████████████████████████▋                             | 215527/303421 [1:27:56<1:03:16, 23.15it/s]

EXCEPTION occured


 74%|████████████████████████████████████████████████████████████████████████████▎                          | 224971/303421 [1:32:04<32:11, 40.62it/s]

EXCEPTION occured


 75%|████████████████████████████████████████████████████████████████████████████▉                          | 226515/303421 [1:32:42<32:37, 39.29it/s]

EXCEPTION occured


 75%|█████████████████████████████████████████████████████████████████████████████▍                         | 228276/303421 [1:33:34<28:48, 43.47it/s]

EXCEPTION occured


 79%|█████████████████████████████████████████████████████████████████████████████████                      | 238751/303421 [1:37:47<19:45, 54.56it/s]

EXCEPTION occured


 79%|█████████████████████████████████████████████████████████████████████████████████▌                     | 240405/303421 [1:38:30<19:24, 54.10it/s]

EXCEPTION occured


 81%|███████████████████████████████████████████████████████████████████████████████████▌                   | 246246/303421 [1:40:54<26:41, 35.70it/s]

EXCEPTION occured


 81%|███████████████████████████████████████████████████████████████████████████████████▉                   | 247133/303421 [1:41:14<29:45, 31.52it/s]

EXCEPTION occured


 83%|█████████████████████████████████████████████████████████████████████████████████████▋                 | 252556/303421 [1:43:34<15:18, 55.37it/s]

EXCEPTION occured


 84%|██████████████████████████████████████████████████████████████████████████████████████▍                | 254805/303421 [1:44:24<16:43, 48.45it/s]

EXCEPTION occured


 85%|███████████████████████████████████████████████████████████████████████████████████████▊               | 258770/303421 [1:46:11<27:38, 26.91it/s]

EXCEPTION occured


 88%|██████████████████████████████████████████████████████████████████████████████████████████▌            | 266934/303421 [1:49:25<12:36, 48.25it/s]

EXCEPTION occured


 89%|███████████████████████████████████████████████████████████████████████████████████████████▍           | 269472/303421 [1:50:29<22:36, 25.03it/s]

EXCEPTION occured


 90%|█████████████████████████████████████████████████████████████████████████████████████████████          | 274097/303421 [1:52:28<08:00, 60.98it/s]

EXCEPTION occured


 91%|█████████████████████████████████████████████████████████████████████████████████████████████▊         | 276286/303421 [1:53:24<12:57, 34.90it/s]

EXCEPTION occured


 91%|█████████████████████████████████████████████████████████████████████████████████████████████▉         | 276624/303421 [1:53:35<10:45, 41.53it/s]

EXCEPTION occured


 91%|██████████████████████████████████████████████████████████████████████████████████████████████         | 277202/303421 [1:53:49<08:59, 48.62it/s]

EXCEPTION occured


 92%|██████████████████████████████████████████████████████████████████████████████████████████████▊        | 279332/303421 [1:54:42<05:34, 72.11it/s]

EXCEPTION occured


 93%|███████████████████████████████████████████████████████████████████████████████████████████████▊       | 282351/303421 [1:55:50<07:42, 45.59it/s]

EXCEPTION occured


 94%|████████████████████████████████████████████████████████████████████████████████████████████████▍      | 284259/303421 [1:56:33<06:53, 46.38it/s]

EXCEPTION occured


 95%|█████████████████████████████████████████████████████████████████████████████████████████████████▍     | 287030/303421 [1:57:47<05:28, 49.92it/s]

EXCEPTION occured


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 303421/303421 [2:04:43<00:00, 40.55it/s]


             id  parent_id  summarized  \
0       ecyoxpd  t3_aba9kt           1   
1       ecypsb2  t3_abadvk           1   
2       ecyqaf8  t3_abadvk           1   
3       ecyra3n  t3_abadvk           1   
4       ecyrh6q  t3_abadvk           1   
...         ...        ...         ...   
203258  ds0yawc  t3_7nc8k5           1   
203259  ds139pg  t3_7ncafx           1   
203260  ds1573m  t3_7ncaro           1   
203261  ds16j13  t3_7nc8k5           1   
203262  ds19dod  t3_7nc8k5           1   

                                                     text  length  
0                           First of all, happy new year!       8  
1       I am THRILLED that I get to spend the night ha...      19  
2       You should consider listening to Morrissey and...      13  
3                                         Happy new year!       4  
4       Mostly the summer holidays and going to Michig...      20  
...                                                   ...     ...  
203258  My suggesti

  2%|█▊                                                                                                       | 4315/246046 [01:51<1:26:23, 46.63it/s]

EXCEPTION occured


  6%|██████▎                                                                                                 | 14889/246046 [06:00<1:54:32, 33.64it/s]

EXCEPTION occured


  7%|██████▊                                                                                                 | 16025/246046 [06:34<1:23:46, 45.76it/s]

EXCEPTION occured


  8%|████████▏                                                                                               | 19356/246046 [08:00<1:18:53, 47.89it/s]

EXCEPTION occured


 13%|█████████████▌                                                                                          | 31963/246046 [13:16<1:31:30, 38.99it/s]

EXCEPTION occured


 13%|█████████████▌                                                                                          | 32040/246046 [13:18<1:07:11, 53.08it/s]

EXCEPTION occured


 18%|██████████████████▋                                                                                       | 43433/246046 [17:43<48:58, 68.94it/s]

EXCEPTION occured


 18%|██████████████████▉                                                                                     | 44829/246046 [18:13<1:25:32, 39.20it/s]

EXCEPTION occured


 21%|█████████████████████▍                                                                                  | 50576/246046 [20:29<1:38:12, 33.17it/s]

EXCEPTION occured


 21%|██████████████████████▏                                                                                 | 52381/246046 [21:12<3:03:41, 17.57it/s]

EXCEPTION occured


 31%|█████████████████████████████████▏                                                                        | 76974/246046 [30:44<45:01, 62.59it/s]

EXCEPTION occured


 34%|███████████████████████████████████▌                                                                      | 82589/246046 [32:51<51:25, 52.97it/s]

EXCEPTION occured


 34%|███████████████████████████████████▋                                                                      | 82911/246046 [32:58<58:18, 46.63it/s]

EXCEPTION occured


 34%|████████████████████████████████████▏                                                                     | 84006/246046 [33:24<37:55, 71.20it/s]

EXCEPTION occured


 34%|████████████████████████████████████▎                                                                     | 84296/246046 [33:32<58:22, 46.18it/s]

EXCEPTION occured


 35%|████████████████████████████████████▊                                                                   | 87232/246046 [34:39<1:44:41, 25.28it/s]

EXCEPTION occured


 36%|████████████████████████████████████▉                                                                   | 87480/246046 [34:46<1:13:30, 35.95it/s]

EXCEPTION occured


 36%|█████████████████████████████████████▊                                                                  | 89320/246046 [35:30<1:02:45, 41.62it/s]

EXCEPTION occured


 37%|██████████████████████████████████████▊                                                                 | 91830/246046 [36:26<1:05:30, 39.24it/s]

EXCEPTION occured


 39%|█████████████████████████████████████████▏                                                                | 95697/246046 [37:46<28:29, 87.94it/s]

EXCEPTION occured


 43%|█████████████████████████████████████████████▌                                                           | 106852/246046 [42:01<36:46, 63.09it/s]

EXCEPTION occured


 44%|█████████████████████████████████████████████▉                                                           | 107765/246046 [42:21<50:12, 45.90it/s]

EXCEPTION occured


 45%|███████████████████████████████████████████████▌                                                         | 111322/246046 [43:34<47:54, 46.87it/s]

EXCEPTION occured


 46%|██████████████████████████████████████████████▊                                                        | 111961/246046 [43:49<1:12:42, 30.74it/s]

EXCEPTION occured


 46%|██████████████████████████████████████████████▉                                                        | 112129/246046 [43:55<1:02:51, 35.51it/s]

EXCEPTION occured


 47%|████████████████████████████████████████████████▌                                                      | 115881/246046 [45:29<1:03:25, 34.20it/s]

EXCEPTION occured


 59%|██████████████████████████████████████████████████████████████▍                                          | 146205/246046 [56:53<46:43, 35.61it/s]

EXCEPTION occured


 60%|███████████████████████████████████████████████████████████████▎                                         | 148222/246046 [57:41<35:05, 46.47it/s]

EXCEPTION occured


 61%|███████████████████████████████████████████████████████████████▊                                         | 149572/246046 [58:12<38:47, 41.46it/s]

EXCEPTION occured


 61%|███████████████████████████████████████████████████████████████▉                                         | 149848/246046 [58:19<37:04, 43.25it/s]

EXCEPTION occured


 62%|████████████████████████████████████████████████████████████████▉                                        | 152086/246046 [59:10<17:11, 91.08it/s]

EXCEPTION occured


 66%|████████████████████████████████████████████████████████████████████▏                                  | 162906/246046 [1:03:51<33:21, 41.54it/s]

EXCEPTION occured


 71%|█████████████████████████████████████████████████████████████████████████                              | 174597/246046 [1:08:51<26:16, 45.32it/s]

EXCEPTION occured


 72%|█████████████████████████████████████████████████████████████████████████▉                             | 176764/246046 [1:09:46<22:26, 51.47it/s]

EXCEPTION occured


 74%|████████████████████████████████████████████████████████████████████████████▎                          | 182183/246046 [1:11:52<32:52, 32.37it/s]

EXCEPTION occured


 75%|█████████████████████████████████████████████████████████████████████████████▏                         | 184350/246046 [1:12:43<29:20, 35.04it/s]

EXCEPTION occured


 75%|█████████████████████████████████████████████████████████████████████████████▎                         | 184762/246046 [1:12:52<34:20, 29.75it/s]

EXCEPTION occured


 77%|██████████████████████████████████████████████████████████████████████████████▉                        | 188435/246046 [1:14:24<19:06, 50.27it/s]

EXCEPTION occured


 77%|███████████████████████████████████████████████████████████████████████████████                        | 188867/246046 [1:14:36<23:43, 40.16it/s]

EXCEPTION occured


 81%|███████████████████████████████████████████████████████████████████████████████████▏                   | 198822/246046 [1:18:26<21:49, 36.06it/s]

EXCEPTION occured


 81%|███████████████████████████████████████████████████████████████████████████████████▍                   | 199229/246046 [1:18:35<16:58, 45.95it/s]

EXCEPTION occured


 82%|████████████████████████████████████████████████████████████████████████████████████▋                  | 202419/246046 [1:19:47<15:29, 46.94it/s]

EXCEPTION occured


 84%|██████████████████████████████████████████████████████████████████████████████████████                 | 205573/246046 [1:20:59<13:28, 50.07it/s]

EXCEPTION occured


 85%|███████████████████████████████████████████████████████████████████████████████████████▊               | 209897/246046 [1:22:45<10:25, 57.83it/s]

EXCEPTION occured


 92%|██████████████████████████████████████████████████████████████████████████████████████████████▍        | 225549/246046 [1:28:49<07:04, 48.31it/s]

EXCEPTION occured


 92%|███████████████████████████████████████████████████████████████████████████████████████████████        | 226959/246046 [1:29:23<05:06, 62.27it/s]

EXCEPTION occured


 92%|███████████████████████████████████████████████████████████████████████████████████████████████        | 227168/246046 [1:29:26<04:34, 68.75it/s]

EXCEPTION occured


 94%|████████████████████████████████████████████████████████████████████████████████████████████████▋      | 230925/246046 [1:30:48<05:12, 48.38it/s]

EXCEPTION occured


 97%|███████████████████████████████████████████████████████████████████████████████████████████████████▉   | 238872/246046 [1:33:40<02:16, 52.64it/s]

EXCEPTION occured


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 246046/246046 [1:36:23<00:00, 42.54it/s]


             id  parent_id  summarized  \
0       ds021wm  t3_7n9jvs           0   
1       ds025kc  t3_7n9jvs           1   
2       ds02aay  t3_7n9jvs           1   
3       ds02coq  t3_7n9jvs           0   
4       ds02ksr  t3_7n9jvs           1   
...         ...        ...         ...   
171230  dbv7exv  t3_5ldl3q           1   
171231  dbv7lns  t3_5ld63x           0   
171232  dbvdday  t3_5ldlxh           1   
171233  dbvzszy  t3_5ldlxh           1   
171234  dbwnr63  t3_5ldln5           1   

                                                     text  length  
0                                          I feel ya, bro       5  
1                 I just completed watching Black Mirror.       7  
2                                       Alone in a crowd.       5  
3           Playing DOOM all night long, and alone too :)      10  
4                                          Happy new year       3  
...                                                   ...     ...  
171230             

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 93/93 [00:01<00:00, 48.93it/s]


         id  parent_id  summarized  \
0   dbumpog  t3_5lcee8           1   
1   dbumrm1  t3_5lce7x           1   
2   dbun0y4  t3_5lce7x           1   
3   dbundan  t3_5lce7x           1   
4   dbuo53y  t3_5l3q41           1   
..      ...        ...         ...   
62  dc6qxsl  t3_4ze20k           1   
63  dc9ec7n  t3_50ya8p           0   
64  dcvb5mp  t3_50mj2d           0   
65  de08pei  t3_539irb           1   
66  dehvel6  t3_51qxh2           1   

                                                 text  length  
0                                   Stay safe though.       4  
1                                             HOORAY!       2  
2            Hope that you're having a nice time too.      10  
3                                     Happy new year!       4  
4                         Check out the wiki for info       6  
..                                                ...     ...  
62                    "Peep Show" gets me every time.       9  
63  Around the World in 80 Days

# Finalize the Submissions and Comments

In [38]:
submission_filtered_df = pd.read_csv('../data/reddit/filtered/casual_conv_submissions_{}.csv'.format(date_suffix))

In [39]:
comment_filtered_df = pd.read_csv('../data/reddit/filtered/casual_conv_comments_{}.csv'.format(date_suffix))
comment_filtered_parent_ids = comment_filtered_df['parent_id'].tolist()
comment_filtered_parent_ids = [x[3:] for x in comment_filtered_parent_ids]

In [40]:
submission_filtered_df = submission_filtered_df[submission_filtered_df['id'].isin(comment_filtered_parent_ids)]
submission_filtered_df.shape

(7941, 6)

In [41]:
final_cols = ['sub_id', 'sub_summarized', 'sub_from', 'sub_text', 'sub_root', 'sub_length',
              'com_id', 'com_summarized', 'com_text', 'com_length']
final_dict = {col: [] for col in final_cols}

In [42]:
for i in tqdm(range(submission_filtered_df.shape[0])):
    sub_id = submission_filtered_df.iloc[i]['id']
    sub_summarized = submission_filtered_df.iloc[i]['summarized']
    sub_from = submission_filtered_df.iloc[i]['from']
    sub_text = submission_filtered_df.iloc[i]['text']
    sub_root = submission_filtered_df.iloc[i]['root']
    sub_length = submission_filtered_df.iloc[i]['length']

    comment_filtered_df_sub = comment_filtered_df[comment_filtered_df['parent_id'] == 't3_' + sub_id]
    for j in range(comment_filtered_df_sub.shape[0]):
        final_dict['sub_id'].append(sub_id)
        final_dict['sub_summarized'].append(sub_summarized)
        final_dict['sub_from'].append(sub_from)
        final_dict['sub_text'].append(sub_text)
        final_dict['sub_root'].append(sub_root)
        final_dict['sub_length'].append(sub_length)
        final_dict['com_id'].append(comment_filtered_df_sub.iloc[j]['id'])
        final_dict['com_summarized'].append(comment_filtered_df_sub.iloc[j]['summarized'])
        final_dict['com_text'].append(comment_filtered_df_sub.iloc[j]['text'])
        final_dict['com_length'].append(comment_filtered_df_sub.iloc[j]['length'])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7941/7941 [00:46<00:00, 170.12it/s]


In [43]:
final_df = pd.DataFrame(final_dict)

In [44]:
final_df

Unnamed: 0,sub_id,sub_summarized,sub_from,sub_text,sub_root,sub_length,com_id,com_summarized,com_text,com_length
0,r2v7x0,0,title,My boss told me that during the time my hair w...,make,21,hm6trwr,1,Take it as a compliment.,6
1,r2v7x0,0,title,My boss told me that during the time my hair w...,make,21,hm6tw3v,1,However working at a bar where sex-charged men...,14
2,r2v7x0,0,title,My boss told me that during the time my hair w...,make,21,hm6uegk,1,Why do people get bent out of shape about hair...,15
3,r2v7x0,0,title,My boss told me that during the time my hair w...,make,21,hm6xlf7,1,Will appearing more attractive to customers in...,16
4,r2v7x0,0,title,My boss told me that during the time my hair w...,make,21,hm6zt2c,1,are you suggesting that its a problem for peop...,28
...,...,...,...,...,...,...,...,...,...,...
53689,n8sdhl,0,selftext,I never really got to do things like this when...,get,36,gxkseja,1,Nice man,2
53690,n8sdhl,0,selftext,I never really got to do things like this when...,get,36,gxocvzh,1,"I've never lived anywhere with fireflies, and ...",18
53691,n8sbcb,0,title,After the worst 4 years of my life during coll...,move,22,gxk4z84,0,Not a bad place to live if you can solve for COL,12
53692,n8sbcb,0,title,After the worst 4 years of my life during coll...,move,22,gxk5bpg,1,Good luck on your new adventure!,7


In [45]:
final_df.to_csv('../data/reddit/matched/casual_conv_{}.csv'.format(date_suffix), index = False)

## Batch Process

In [44]:
date_suffices = ['20210501_20211231', '20210101_20210430', '20200101_20201231',
                 '20190101_20191231', '20180101_20181231', '20170101_20171231',
                 '20160101_20161231']
for date_suffix in date_suffices:
    submission_filtered_df = pd.read_csv('../data/reddit/filtered_q/casual_conv_submissions_{}.csv'.format(date_suffix))
    comment_filtered_df = pd.read_csv('../data/reddit/filtered_q/casual_conv_comments_{}.csv'.format(date_suffix))
    comment_filtered_parent_ids = comment_filtered_df['parent_id'].tolist()
    comment_filtered_parent_ids = [x[3:] for x in comment_filtered_parent_ids]
    submission_filtered_df = submission_filtered_df[submission_filtered_df['id'].isin(comment_filtered_parent_ids)]
    print(submission_filtered_df.shape)
    final_cols = ['sub_id', 'sub_summarized', 'sub_from', 'sub_text', 'sub_root', 'sub_length',
                  'com_id', 'com_summarized', 'com_text', 'com_length']
    final_dict = {col: [] for col in final_cols}
    for i in tqdm(range(submission_filtered_df.shape[0])):
        sub_id = submission_filtered_df.iloc[i]['id']
        sub_summarized = submission_filtered_df.iloc[i]['summarized']
        sub_from = submission_filtered_df.iloc[i]['from']
        sub_text = submission_filtered_df.iloc[i]['text']
        sub_root = submission_filtered_df.iloc[i]['root']
        sub_length = submission_filtered_df.iloc[i]['length']

        comment_filtered_df_sub = comment_filtered_df[comment_filtered_df['parent_id'] == 't3_' + sub_id]
        for j in range(comment_filtered_df_sub.shape[0]):
            final_dict['sub_id'].append(sub_id)
            final_dict['sub_summarized'].append(sub_summarized)
            final_dict['sub_from'].append(sub_from)
            final_dict['sub_text'].append(sub_text)
            final_dict['sub_root'].append(sub_root)
            final_dict['sub_length'].append(sub_length)
            final_dict['com_id'].append(comment_filtered_df_sub.iloc[j]['id'])
            final_dict['com_summarized'].append(comment_filtered_df_sub.iloc[j]['summarized'])
            final_dict['com_text'].append(comment_filtered_df_sub.iloc[j]['text'])
            final_dict['com_length'].append(comment_filtered_df_sub.iloc[j]['length'])
    final_df = pd.DataFrame(final_dict)
    print(final_df.shape)
    final_df.to_csv('../data/reddit/matched_q/casual_conv_{}.csv'.format(date_suffix), index = False)

(10084, 6)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 10084/10084 [01:11<00:00, 141.93it/s]


(73077, 10)
(4738, 6)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4738/4738 [00:20<00:00, 233.32it/s]


(28280, 10)
(35337, 6)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 35337/35337 [08:02<00:00, 73.31it/s]


(194411, 10)
(30205, 6)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 30205/30205 [05:56<00:00, 84.74it/s]


(165944, 10)
(32628, 6)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 32628/32628 [08:16<00:00, 65.66it/s]


(203263, 10)
(27209, 6)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 27209/27209 [05:48<00:00, 78.10it/s]


(171235, 10)
(46, 6)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 46/46 [00:00<00:00, 807.78it/s]

(67, 10)





# Sample Some Pairs

In [31]:
N = final_df.shape[0]
indices = np.sort(np.random.choice(N, 1000, replace = False))
final_df_sample = final_df.iloc[indices]
final_df_sample.to_csv('pushshift/final/casual_conv_20200101_20201231_sample.csv', index = False)

# Some Tests

In [13]:
nlp = spacy.load('en_core_web_sm')

In [23]:
doc = nlp("A silly question")
for sent in doc.sents:
    print(sent.root)
    print(sent.root.pos_)

question
NOUN


In [25]:
doc = nlp("I am happy about it.")
for sent in doc.sents:
    print(sent.root)
    print(sent.root.pos_)

am
AUX


In [26]:
doc = nlp("I went to the market today")
for sent in doc.sents:
    print(sent.root)
    print(sent.root.pos_)

went
VERB


In [27]:
doc = nlp("Just worked more than I ever have in my life")
for sent in doc.sents:
    print(sent.root)
    print(sent.root.pos_)

worked
VERB


In [29]:
doc = nlp("Happy New Year!")
for sent in doc.sents:
    print(sent.root)
    print(sent.root.pos_)

Year
PROPN


In [46]:
doc = nlp("Has she done your homework")
for sent in doc.sents:
    print(sent.root)
    print(sent.root.pos_)
    print(sent[0].lemma_)
    for token in sent:
        print(token, token.lemma_, token.pos_)

done
VERB
have
Has have AUX
she -PRON- PRON
done do VERB
your -PRON- DET
homework homework NOUN


In [35]:
doc = nlp("Did you finish your homework")
sents = [sent for sent in doc.sents]
sent = sents[0]
print(sent)
print(sent.root.pos_)
print(sent[0].lemma_)

Did you finish your homework
VERB
do


In [25]:
doc = nlp("I just gave you the paper")
sents = [sent for sent in doc.sents]
sent = sents[0]
print(len(sent), sent.root, sent.root.pos_, sent.root.lemma_)
print(sent[0].lemma_, sent[0].pos_)

6 gave VERB give
-PRON- PRON


In [13]:
r_pattern = re.compile(r'(^| )\/?r\/[^ ]*')
print(r_pattern.search('ashjs/r/haha__ ss'))

None


In [18]:
re.sub(r'(\([^\(\)]*\))|(\[[^\[\]]*\])', '', 'how about [ashs] and and [[ss]')

'how about  and and ['