## Imports

In [12]:
from bs4 import BeautifulSoup, Tag

import nltk.data

import re
import sys
import string
import json

from datetime import datetime
from dateutil.parser import parse

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import pdb
from pymongo import MongoClient
from pymongo import InsertOne, DeleteOne, ReplaceOne, UpdateMany, UpdateOne
from pprint import pprint

from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from collections import OrderedDict

In [5]:
# Open a MongoDB connection (no arguments, so pymongo's default connection
# settings are used) and select the `polymedia` database.
client = MongoClient()
db = client.polymedia
# Last expression of the cell: display the collections available in the database.
db.list_collection_names()

['subreddit_polyamory', 'subreddit_relationships', 'temp', 'pitm', 'test']

In [7]:
# Handle to the `subreddit_polyamory` collection; every query below reads from it.
r_poly = db.subreddit_polyamory

# Text Processing Utilities

In [72]:
def escape_ansi(line):
    """Strip ANSI escape sequences and ASCII punctuation from a string.

    The steps are applied in this order:

    1. remove ANSI CSI sequences (``ESC[`` or ``0x9B`` introducer, optional
       parameter and intermediate bytes, then one final byte);
    2. replace every newline with ``' . '``;
    3. delete every ``'--'``;
    4. replace every character of ``string.punctuation`` with a space
       (this includes the period inserted in step 2, so a newline ends up
       as three spaces).

    Only ASCII punctuation is affected; typographic characters such as
    ``’`` or ``“`` are left untouched. Whitespace is not collapsed.

    Args:
        line: The text to clean.

    Returns:
        The cleaned string.
    """
    # A CSI sequence is terminated by exactly ONE final byte in the range @-~.
    # Quantifying that class with `+` would also swallow the ordinary letters
    # that follow the sequence (e.g. the "red" in "\x1b[31mred").
    ansi_escape = re.compile(r'(?:\x9B|\x1B\[)[0-?]*[ -/]*[@-~]')
    out = ansi_escape.sub('', line).replace('\n', ' . ')
    out = out.replace('--', '')
    return re.sub('[%s]' % re.escape(string.punctuation), ' ', out)

# Regex fragments consumed by split_into_sentences. They are raw strings so
# that `\s` reaches the regex engine verbatim instead of being parsed (and
# warned about) as an invalid string escape.
alphabets = r"([A-Za-z])"                        # one ASCII letter, captured
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"               # title followed by a literal period
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"                 # abbreviation that may precede a period
# Words treated as the start of a new sentence when they follow an
# acronym or a suffix abbreviation.
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"    # dotted acronym of two or three capitals
websites = r"[.](com|net|org|io|gov)"            # period followed by a domain suffix

def split_into_sentences(text):
    """Split raw text into a list of cleaned sentences.

    Periods that do not end a sentence (titles, domain suffixes, "Ph.D.",
    single-letter initials, dotted acronyms, suffix abbreviations) are
    temporarily replaced with the placeholder ``<prd>``. Real sentence
    boundaries ('.', '?', '!' and tab characters) are then tagged with
    ``<stop>``, the placeholders are restored, and the text is split on
    ``<stop>``. The order of the substitutions matters and must be kept.

    Fragments of one character or less are discarded, and every remaining
    sentence is passed through ``escape_ansi`` before being returned.

    A trailing fragment without closing punctuation is kept as the last
    sentence rather than being dropped.

    Args:
        text: The text to split. Newlines are treated as plain spaces.

    Returns:
        A list of sentence strings (possibly empty).
    """
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # --- protect periods that are not sentence boundaries -----------------
    text = re.sub(prefixes, "\\1<prd>", text)
    text = re.sub(websites, "<prd>\\1", text)
    if "Ph.D" in text: text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
    # An acronym followed by a sentence starter does end the sentence.
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text)
    # Same rule for suffix abbreviations: boundary only before a starter.
    text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)

    # --- move terminal punctuation outside closing quotes -----------------
    if "”" in text: text = text.replace(".”", "”.")
    if "\"" in text: text = text.replace(".\"", "\".")
    if "!" in text: text = text.replace("!\"", "\"!")
    if "?" in text: text = text.replace("?\"", "\"?")

    # --- tag the real boundaries, then restore protected periods ----------
    text = text.replace('\t', ' <stop>')
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    # Do not drop the last fragment unconditionally: when the text has no
    # closing punctuation that fragment is the final sentence. When the text
    # does end with punctuation the last fragment is only padding, which the
    # length filter below removes.
    sentences = [s.strip() for s in text.split("<stop>")]
    return [escape_ansi(s) for s in sentences if len(s) > 1]

def clean_text(texts, lancaster=False):
    r"""Normalise text: strip punctuation and digit tokens, lower-case, optionally stem.

    Each string is processed as follows:

    1. literal replacements in a single pass: apostrophes are deleted,
       ``\xa0`` and ``\x97`` become a space, each double space becomes a
       single space, newlines and tabs become ``'.'``;
    2. every character of ``string.punctuation`` is replaced with a space;
    3. the text is lower-cased;
    4. every token containing a digit is replaced with a single space;
    5. if ``lancaster`` is true, the text is split on whitespace, each word
       is stemmed with ``LancasterStemmer`` and the words are re-joined
       with single spaces.

    Whitespace is not collapsed unless stemming is enabled.

    Args:
        texts: A single string, or a list/tuple of strings.
        lancaster: Apply Lancaster stemming as the final step.

    Returns:
        A cleaned string for string input, or a list of cleaned strings for
        list/tuple input.
    """
    replacements = {
        "'": "",
        "\xa0": " ",
        "  ": " ",
        "\n": ".",
        "\t": ".",
        "\x97": " ",
    }
    # Build the helpers once per call instead of once per text.
    replace_pattern = re.compile("|".join(re.escape(key) for key in replacements))
    punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))
    digit_token_pattern = re.compile(r'\w*\d\w*')
    stemmer = LancasterStemmer() if lancaster else None

    def process(text):
        # The match is always one of the literal keys, so it can index the
        # mapping directly.
        text = replace_pattern.sub(lambda m: replacements[m.group(0)], text)
        cleaned = punctuation_pattern.sub(' ', text)
        cleaned = cleaned.lower()
        cleaned = digit_token_pattern.sub(' ', cleaned)
        if stemmer is not None:
            cleaned = " ".join(stemmer.stem(word) for word in cleaned.split())
        return cleaned

    if isinstance(texts, (list, tuple)):
        return [process(t) for t in texts]
    return process(texts)

# Generate Corpora, split by subreddits

## Merge collections on thread level

In [45]:
# Peek at the first four documents of the collection to inspect their structure.
pprint(list(r_poly.find().limit(4)))

[{'_id': ObjectId('5dc1c32f465410d3bf138f90'),
  'comments': [{'comment_body': "It's not the relationship style; it's the "
                                "people you're in relationships with.\n"
                                '\n'
                                '100%',
                'doc_class': 'comment',
                'num_replies': 0,
                'replies': [],
                'utc_comment': datetime.datetime(2019, 11, 5, 4, 57, 36)},
               {'comment_body': 'I think you make a really great point here. I '
                                'believe that any time anyone makes too much '
                                'out of one aspect of their life, and makes it '
                                'to be everything, that aspect no longer is '
                                "rewarding. It's the same for polyamory. "
                                "Polyamory isn't really meant to be the crux "
                                'in which you define every aspect of your

In [66]:
# Keep only the submission timestamp and the text fields of each thread.
# Replies are stored inside each comment (see the sample documents printed
# earlier), so their projection has to be nested under `comments`; projecting
# `comments: {comment_body: 1}` alone strips `replies` from every comment.
# The original top-level keys are kept so any document that does carry them
# is projected exactly as before.
pipeline = [
    {'$project': {'_id': 0, 'post_body': 1, 'comment_body': 1, 'utc_submission': 1,
                  'comments': {'comment_body': 1,
                               'replies': {'reply_body': 1}},
                  'replies': {'reply_body': 1}
                 }
    }
]
threads_r_poly = list(r_poly.aggregate(pipeline))

In [61]:
# Number of thread documents returned by the aggregation.
len(threads_r_poly)

17503

In [65]:
# Take the first thread as a working example and display it.
sample = threads_r_poly[0]
sample

{'utc_submission': datetime.datetime(2019, 11, 5, 4, 13, 35),
 'post_body': 'I’ve seen a couple of posts stating that sometimes poly isn’t great. Maybe you’re going through a rough patch, or maybe your can’t get a date. Other posts I’ve seen here and other places include people talking about how they can’t do poly any more because someone lied or cheated.\n\nThe reality is that poly is *always* awesome, but not all people and relationships are. Sometimes we can have a string of relationships that just cause pain. Sometimes we can end up with three consecutive people who just use and abuse us. That’s not poly. The same things can happen in monogamy. Those circumstances arise because of the people involved, or the relationship between certain people. Not everything is going to be perfect. Hell, you may not ever find your subjective “perfect.”\n\nFor those of you out there feeling like you’re really upset because you can’t get a date on any of the three dating apps you’re on, or maybe you

In [73]:
def flatten_thread(thread):
    """Collapse one thread document into its date and a single cleaned text.

    The post body, every comment body and every reply body are concatenated
    in document order, each wrapped as ``' . ' + body + ' . '``, and the
    result is passed through ``escape_ansi``.

    Replies are optional: a comment without a ``'replies'`` key (or with an
    empty/None value) contributes only its own body, and a reply whose
    ``'reply_body'`` is missing or not a string is skipped.

    Args:
        thread: A mapping with ``'utc_submission'``, ``'post_body'`` and
            ``'comments'``; each comment must have ``'comment_body'`` and
            may have ``'replies'``.

    Returns:
        ``{'date': <utc_submission>, 'text_concat': <cleaned text>}``.

    Raises:
        KeyError: If ``'utc_submission'``, ``'post_body'``, ``'comments'``
            or a comment's ``'comment_body'`` is missing.
    """
    pieces = [thread['post_body']]

    for comment in thread['comments']:
        pieces.append(comment['comment_body'])
        # Handle absent replies explicitly rather than hiding every possible
        # error behind a bare `except`.
        for reply in comment.get('replies') or []:
            reply_body = reply.get('reply_body')
            if isinstance(reply_body, str):
                pieces.append(reply_body)

    text_concat = ''.join(' . ' + piece + ' . ' for piece in pieces)

    return {
        'date': thread['utc_submission'],
        'text_concat': escape_ansi(text_concat),
    }
        
# Sanity check: display the flattened form of the sample thread.
flatten_thread(sample)

{'date': datetime.datetime(2019, 11, 5, 4, 13, 35),
 'text_concat': '   I’ve seen a couple of posts stating that sometimes poly isn’t great  Maybe you’re going through a rough patch  or maybe your can’t get a date  Other posts I’ve seen here and other places include people talking about how they can’t do poly any more because someone lied or cheated       The reality is that poly is  always  awesome  but not all people and relationships are  Sometimes we can have a string of relationships that just cause pain  Sometimes we can end up with three consecutive people who just use and abuse us  That’s not poly  The same things can happen in monogamy  Those circumstances arise because of the people involved  or the relationship between certain people  Not everything is going to be perfect  Hell  you may not ever find your subjective “perfect ”      For those of you out there feeling like you’re really upset because you can’t get a date on any of the three dating apps you’re on  or maybe you’

In [76]:
# Flatten every fetched thread, preserving the order of `threads_r_poly`.
flattened = []
for thread in threads_r_poly:
    flattened.append(flatten_thread(thread))

In [75]:
import pickle

In [77]:
# Serialise the list of flattened threads to disk with pickle.
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# this export from a trusted location.
with open('../data/exports/by_thread/r_poly_2012_and_2019.pkl', 'wb') as file:
    pickle.dump(flattened, file)