## Imports

In [1]:
from bs4 import BeautifulSoup

import re
import sys
import string
import json

from datetime import datetime
from dateutil.parser import parse

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import pdb
from pymongo import MongoClient
from pymongo import InsertOne, DeleteOne, ReplaceOne, UpdateMany, UpdateOne
from pprint import pprint

from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from collections import OrderedDict

## Load Data

In [2]:
# Open a MongoDB client with pymongo's default connection settings.
client = MongoClient()
# Handle to the `polymedia` database; its collections are reached as attributes of `db`.
db = client.polymedia

In [4]:
db.list_collection_names()

['subreddit_polyamory', 'temp', 'pitm', 'test']

In [5]:
# Handle to the `pitm` collection, which the cells below query and update.
pitm = db.pitm

In [6]:
pprint(pitm.find_one().keys())

dict_keys(['_id', 'raw_post_html', 'raw_comments_html', 'num_comments', 'post_date_string', 'post_date', 'post_title', 'quotes', 'editorial_text', 'labels'])


## Scraping Utility Functions

In [87]:
escape_ansi(')  -------------  Cartório de São Paulo registra união estável de três pessoas.').replace('-','')

')    Cartório de São Paulo registra união estável de três pessoas.'

In [94]:
# One CSI escape sequence: introducer, parameter bytes, intermediate bytes and
# exactly ONE final byte. A quantifier on the final byte would also consume
# the ordinary letters that follow the sequence.
_ANSI_ESCAPE_RE = re.compile(r'(\x9B|\x1B\[)[0-?]*[ -\/]*[@-~]')
_ASCII_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))


def escape_ansi(line):
    """
    Strip ANSI escape sequences and ASCII punctuation from a string.

    Steps, in order:
      1. remove every ANSI CSI escape sequence;
      2. delete each '--' pair outright (no space is left behind);
      3. replace every remaining `string.punctuation` character with a space.

    Parameters
    ----------
    line : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so runs of spaces can
        appear where punctuation used to be.
    """
    out = _ANSI_ESCAPE_RE.sub('', line)
    return _ASCII_PUNCTUATION_RE.sub(' ', out.replace('--', ''))


def make_soup(url):
    """
    Fetch `url` with an HTTP GET and parse the response body.

    The body is parsed with the 'html.parser' backend. The response status is
    not checked, so an error page is parsed like any other page.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    BeautifulSoup
        Parsed document tree.
    """
    # Imported inside the function so it does not rely on a module-level
    # `requests` name: this notebook has no top-level import for it, and any
    # cell that assigns to a variable called `requests` would shadow it.
    import requests

    webpage_response = requests.get(url)
    return BeautifulSoup(webpage_response.content, 'html.parser')


def get_all_a_hrefs(soup, selector=''):
    """
    Grabs all href attribute values from a tags
    contained inside given selector.

    Only anchors that actually carry an href attribute are considered, so
    bare named anchors (<a name="...">) are skipped instead of raising
    KeyError.

    Parameters
    ----------
    soup : BeautifulSoup or Tag
        Tree to search; only its `select()` method is used.
    selector : str
        CSS selector of the container to search in. An empty string
        (the default) searches the whole tree.

    Returns
    -------
    list of str
        Unique href values, in no particular order.
    """
    # `href=True` is a find_all() style filter and does not restrict select();
    # the attribute test has to be part of the CSS selector itself.
    anchor_selector = 'a[href]'
    if selector:
        anchor_selector = selector + ' ' + anchor_selector
    return list({a['href'] for a in soup.select(anchor_selector)})


def is_link_date_archive(link):
    """
    Tell whether a link looks like a dated archive URL.

    A link qualifies when it contains four digits, a slash and two digits
    (e.g. ".../2012/09/...") with at least one character on either side of
    that run.
    """
    found = re.search('.+[0-9]{4}/[0-9]{2}.+', link)
    return found is not None


def list_outbound_links(page_url, domain):
    """
    List the links on a page that do not mention `domain`.

    Parameters
    ----------
    page_url : str
        Page to download and scan for anchors.
    domain : str
        Text identifying the site's own links, e.g. a host name. It is
        matched literally, not as a regular expression.

    Returns
    -------
    list of str
        Unique href values that do not contain `domain`.
    """
    soup = make_soup(page_url)
    links = get_all_a_hrefs(soup)
    # Build the pattern from the argument; the text 'domain' inside a string
    # literal would only ever match links containing that literal word.
    # NOTE(review): relative hrefs never contain the domain and are therefore
    # reported as outbound too — confirm whether that is wanted.
    own_site = re.compile(re.escape(domain))
    return [link for link in links if not own_site.search(link)]

def get_post_permalinks(archive_url):
    """
    Collect the post permalinks listed on an archive page.

    Downloads `archive_url` and returns the href of every anchor whose title
    attribute is exactly "permanent link", in document order.
    """
    archive_soup = make_soup(archive_url)
    return [anchor['href']
            for anchor in archive_soup.select('a[title="permanent link"]')]

def scrape_posts(permalinks):
    """
    Download each permalink and build one dictionary per post.

    Parameters
    ----------
    permalinks : iterable of str
        URLs of individual post pages.

    Returns
    -------
    list of dict
        One entry per permalink, with these keys:
          'raw_post_html'     - copy of the '.post' element, blockquotes included
          'raw_comments_html' - the '#comments' element, or None when absent
          'comments'          - stripped text of every '.comment-body' element
          'num_comments'      - count read from the comments heading, else 0
          'post_date_string'  - text of the '.date-header' element
          'post_date'         - that text parsed with dateutil
          'post_title'        - stripped text of 'h3.post-title'
          'quotes'            - stripped text of every <blockquote> in the post
          'editorial_text'    - post text once the blockquotes are removed

    Raises
    ------
    IndexError
        If a page has no '.date-header', '.post' or 'h3.post-title' element.
    """
    import copy

    cols = []

    # Iterate through posts and create dictionary:
    for p in permalinks:

        soup = make_soup(p)
        date_header = soup.select('.date-header')[0]
        post = soup.select('.post')[0]

        # Resolve the comments container first, so a page without one yields
        # None rather than an unbound name or the previous page's comments.
        comment_sections = soup.select('#comments')
        comments = comment_sections[0] if comment_sections else None

        num_comments = 0
        if comments is not None:
            try:
                num_comments = int(comments.select('h4')[0].get_text().split()[0])
            except (IndexError, ValueError):
                # No heading, an empty heading, or one that does not start
                # with a number: treat as zero.
                num_comments = 0

        # '.comment-body' is a CSS class, so it needs select(); find_all()
        # would treat the string as a tag name and never match anything.
        comment_blocks = [c.get_text().strip()
                          for c in soup.select('.comment-body')]

        post_col = {}

        # Save raw HTML for later, just in case. The post is copied because
        # extract() below removes the blockquotes from `post` in place, which
        # would otherwise strip them from the "raw" version as well.
        post_col['raw_post_html'] = copy.copy(post)
        post_col['raw_comments_html'] = comments

        # Save comments:
        post_col['comments'] = comment_blocks

        # Post meta:
        post_col['num_comments'] = num_comments
        post_col['post_date_string'] = date_header.get_text()
        post_col['post_date'] = parse(date_header.get_text())
        post_col['post_title'] = post.select('h3.post-title')[0].get_text().strip()

        # Collect all blockquotes from news sources:
        quotes = post.find_all("blockquote")
        post_col['quotes'] = [q.get_text().strip() for q in quotes]

        # Remove quotes from the main HTML, leaving blog author's commentary:
        for q in quotes:
            q.extract()

        post_col['editorial_text'] = post.text.strip()

        cols.append(post_col)

    return cols

## Text Pre-processing

In [63]:
# Regex fragments used by split_into_sentences(). They are raw strings so
# that `\s` reaches the regex engine as written instead of being an invalid
# Python string escape.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"


def split_into_sentences(text):
    """
    Split free text into sentences with a rule-based heuristic.

    Periods that do not end a sentence (titles, initials, acronyms, company
    suffixes, web domains, "Ph.D.") are first replaced by a `<prd>`
    placeholder. Every remaining '.', '?' and '!' and every tab is then
    followed by a `<stop>` marker, the placeholders are turned back into
    periods, and the text is split on the markers. The substitutions depend
    on their order and must not be rearranged.

    Parameters
    ----------
    text : str
        Text to split. Newlines are treated as spaces; tabs act as sentence
        boundaries.

    Returns
    -------
    list of str
        Sentences in order of appearance, each passed through escape_ansi().
        Fragments of one character or less are discarded. Text after the
        last terminator is kept as a final sentence rather than dropped.
    """
    text = " " + text + "  "
    text = text.replace("\n", " ")
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)
    # Move a terminator that sits inside a closing quote to just after it, so
    # the quote mark stays attached to its own sentence.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")
    text = text.replace('\t', ' <stop>')
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    # The last piece is kept on purpose: when the input does not end with a
    # terminator it holds real text. When the input does end with one, the
    # piece is only padding and the length filter below removes it.
    sentences = [s.strip() for s in text.split("<stop>")]
    return [escape_ansi(s) for s in sentences if len(s) > 1]

In [9]:
def clean_text(texts, lancaster=False):
    """
    Normalise raw text for bag-of-words style vectorisation.

    Each text goes through these steps, in order:
      1. apostrophes are deleted; non-breaking spaces and '\\x97' become a
         space; each pair of spaces becomes one space; newlines and tabs
         become '.';
      2. every `string.punctuation` character becomes a space;
      3. the text is lower-cased;
      4. any token containing a digit is replaced by a single space;
      5. if `lancaster` is true, the text is split on whitespace, each word
         is stemmed with LancasterStemmer and the words are re-joined with
         single spaces.

    Parameters
    ----------
    texts : str or list of str
        One text, or a list of texts.
    lancaster : bool
        Apply Lancaster stemming as the final step.

    Returns
    -------
    str or list of str
        A list when `texts` is a list, otherwise a single string.
    """
    replacements = {
        "\'": "",
        "\xa0": " ",
        "  ": ' ',
        "\n": ".",
        "\t": ".",
        "\x97": " ",
    }
    # Built once here rather than once per text.
    replacement_re = re.compile("|".join(re.escape(k) for k in replacements))
    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
    digit_token_re = re.compile(r'\w*\d\w*')
    stemmer = LancasterStemmer() if lancaster else None

    def process(text):
        text = replacement_re.sub(lambda m: replacements[m.group(0)], text)
        cleaned = punctuation_re.sub(' ', text)
        cleaned = cleaned.lower()
        cleaned = digit_token_re.sub(' ', cleaned)
        if stemmer is None:
            return cleaned
        return " ".join(stemmer.stem(w) for w in cleaned.split())

    if isinstance(texts, list):
        return [process(t) for t in texts]
    return process(texts)

## Re-process PITM Quote / Editorial Text

I should have split by sentence level...

In [24]:
from bs4 import Tag

In [104]:
pitm.find_one({}).keys()

dict_keys(['_id', 'raw_post_html', 'raw_comments_html', 'num_comments', 'post_date_string', 'post_date', 'post_title', 'quotes', 'editorial_text', 'labels', 'post_sentences', 'all_post_sentences', 'quotes_by_sentence', 'editorial_sentences'])

In [None]:
import nltk.data

# Load the English Punkt tokenizer shipped with NLTK's data files.
# NOTE(review): no other cell visible in this notebook uses `tokenizer`;
# sentence splitting is done by split_into_sentences() instead — confirm
# whether this is still needed.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [115]:
# Pull every document into memory, keeping only the raw HTML and the
# sentence/quote fields (plus `_id`, which the projection does not exclude).
# From here on `posts` is a plain Python list, not a collection handle.
posts = list(pitm.find({},{'raw_post_html':1, 'all_post_sentences':1, 'quotes_by_sentence':1, 'editorial_sentences':1, 'quotes':1}))
# Display one document as a spot check.
posts[50]

{'_id': ObjectId('5dbf0bda858e4cbd747f5fb1'),
 'raw_post_html': '<div class="post"><a name="595843825317736586"></a>\n<h3 class="post-title">\n                      \t \n                      \t "A polyamorous quad welcomes their first child"\n\t                       \n                          </h3>\n<div class="post-body">\n<p><div style="clear:both;"></div><span style="font-size: 100%; font-weight: bold; font-style: italic; font-size:116%;">Offbeat Mama<br/></span><br><a href="http://media.offbeatmama.com/wp-content/blogs.dir/2/files/2012/09/Connor-and-Family1-500x386.jpg" style="font-size: 100%; "><img alt="" border="0" src="https://lh3.googleusercontent.com/proxy/Ox0RixY-IYB2mBiGOtrIjkUaPUzKwv_X563_bnDIwfxiY7QsobxYx3_b_YN-LqVgqgcKqvE0r5UgXpNrfcvUguOLMQttBDsG5WxCcdyjZkjvsphGRJo3qAqjYzzqUkwWFsS6L264jRxExRm4hUDqpKCEXAQ=s0-d" style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 500px; height: 386px;"/></a>\xa0\xa0\xa0\xa0\xa0\xa0<i><span st

In [111]:
# Rebuild the sentence-level fields for the first five posts and write them
# back to MongoDB.
for post in posts[:5]:
    raw_post_html = post['raw_post_html']

    # Name the parser explicitly (the same one make_soup() uses) so the
    # result does not depend on which optional parsers are installed.
    soup = BeautifulSoup(raw_post_html, 'html.parser')

    # Replace each <br> with a space so the text on either side of a line
    # break does not run together.
    for br in soup.find_all('br'):
        br.replace_with(" ")

    # Sentences as documents for each post
    all_post_sentences = split_into_sentences(soup.text)

    # get just the quotes
    quotes = soup.find_all("blockquote")
    print(quotes)
    quotes_list = []
    for quote in quotes:
        quotes_dict = {'sentences': split_into_sentences(quote.text)}
        print(quotes_dict)
        quotes_list.append(quotes_dict)

    # Drop the blockquotes; what is left is the blog author's commentary.
    for q in quotes:
        q.extract()
    editorial_sentences = split_into_sentences(soup.text)

    # One write per post: the three fields are set together instead of in
    # three separate round trips.
    pitm.update_one(
        {'_id': post['_id']},
        {"$set": {
            "all_post_sentences": all_post_sentences,
            "quotes_by_sentence": quotes_list,
            "editorial_sentences": editorial_sentences,
        }},
    )
    

[]
[]
[]
[]
[]


In [43]:
# Scratch check: run the sentence splitter on one stored post.
raw_post_html = pitm.find_one({},{'_id':0, 'raw_post_html':1})['raw_post_html']
# NOTE(review): no parser is named here, so BeautifulSoup picks one itself,
# whereas make_soup() pins 'html.parser' — confirm the difference is intended.
soup = BeautifulSoup(raw_post_html)
# Swap each <br> for a space so text on either side of a line break does not
# run together.
for s in soup.findAll('br'):
    s.replaceWith(" ")
split_into_sentences(soup.text)

['New Jersey newspaper columnist  comes around',
 "The Trentonian  A month ago I mentioned a New Jersey newspaper columnist's dismissive freakout about the concept behind Showtime's Polyamory: Married and Dating.",
 "Apparently some of you wrote him well-considered letters, enough that he's now written a second, much more conciliatory column about your responses.",
 'What struck him in particular was how closeted poly people feel they need to be, unlike people who just date around.',
 'See the original article (Aug.',
 '28, 2012).']

In [36]:
soup

<html><body><div class="post"><a name="3690396103602027663"></a>
<h3 class="post-title">
                      	 
                      	 New Jersey newspaper columnist <br/>comes around
	                       
                          </h3>
<div class="post-body">
<p></p><div style="clear:both;"></div><span style="FONT-WEIGHT: bold; FONT-STYLE: italic; FONT-SIZE: 116%">The Trentonian<br/></span><br/>A month ago I mentioned a New Jersey newspaper columnist's <a href="http://www.trentonian.com/article/20120807/OPINION03/120809778/sex-with-your-spouse-his-spouse-him-your-neighbor--&amp;pager=2" target="new_window">dismissive freakout</a> about the concept behind Showtime's <i>Polyamory: Married and Dating.</i> Apparently some of you wrote him well-considered letters, enough that he's now written a second, much more conciliatory column about your responses. What struck him in particular was how closeted poly people feel they need to be, unlike people who just date around.<br/><br/><br/>

In [37]:
# Variant of the <br> handling: insert " . " instead of a bare space, then
# display the modified tree.
for s in soup.findAll('br'):
    s.replaceWith(" . ")
soup

<html><body><div class="post"><a name="3690396103602027663"></a>
<h3 class="post-title">
                      	 
                      	 New Jersey newspaper columnist  . comes around
	                       
                          </h3>
<div class="post-body">
<p></p><div style="clear:both;"></div><span style="FONT-WEIGHT: bold; FONT-STYLE: italic; FONT-SIZE: 116%">The Trentonian . </span> . A month ago I mentioned a New Jersey newspaper columnist's <a href="http://www.trentonian.com/article/20120807/OPINION03/120809778/sex-with-your-spouse-his-spouse-him-your-neighbor--&amp;pager=2" target="new_window">dismissive freakout</a> about the concept behind Showtime's <i>Polyamory: Married and Dating.</i> Apparently some of you wrote him well-considered letters, enough that he's now written a second, much more conciliatory column about your responses. What struck him in particular was how closeted poly people feel they need to be, unlike people who just date around. .  .  . See the <a h

In [16]:
split_into_sentences(soup.text)

['New Jersey newspaper columnist comes around',
 "The TrentonianA month ago I mentioned a New Jersey newspaper columnist's dismissive freakout about the concept behind Showtime's Polyamory: Married and Dating.",
 "Apparently some of you wrote him well-considered letters, enough that he's now written a second, much more conciliatory column about your responses.",
 'What struck him in particular was how closeted poly people feel they need to be, unlike people who just date around.',
 'See the original article (Aug.',
 '28, 2012).']

## Collect Corpora

### Site - polyinthemedia.blogspot.com

In [None]:
# Study the editorial writings of the blogger who has compiled this list of articles.

# Query the `pitm` collection handle; `posts` is bound to a plain list by an
# earlier cell and has no find() method.
editorial_docs = list(pitm.find({}, {'_id': 0, 'editorial_text': 1, 'post_date': 1}))
# Dates and texts are taken from the same documents, so they stay aligned by index.
editorial_dates = [p['post_date'] for p in editorial_docs]
editorial_corpus = clean_text([p['editorial_text'] for p in editorial_docs])
len(editorial_corpus)

In [None]:
# Collect all quoted content from news articles

# $unwind emits one document per element of `quotes`, each carrying the
# `post_date` of the post it came from.
pipeline = [
    {'$unwind': '$quotes'},
    {'$project': {'_id': 0, 'quotes': 1, 'post_date': 1}},
]

# Run the pipeline on the `pitm` collection handle; `posts` is bound to a
# plain list by an earlier cell and has no aggregate() method.
quote_docs = list(pitm.aggregate(pipeline))

quotes_dates = [p['post_date'] for p in quote_docs]
quotes_corpus = clean_text([p['quotes'] for p in quote_docs])
len(quotes_corpus)

In [None]:
quotes_corpus[3]

In [None]:
def unravel_corpus(level='sentence'):
    """
    Placeholder: this function was declared without a body.

    Parameters
    ----------
    level : str
        Granularity to unravel the corpus to; the only value that appears in
        the notebook is the default, 'sentence'.

    Raises
    ------
    NotImplementedError
        Always, until an implementation is written.
    """
    raise NotImplementedError("unravel_corpus() has not been implemented yet")

## Feature Extraction

In [None]:
def extract_text_features(corpus, model=None):
    """
    Placeholder: this function was declared without a body.

    Parameters
    ----------
    corpus : list of str
        Cleaned documents to vectorise.
    model : vectorizer, optional
        Vectorizer to fit. None stands for the originally declared default,
        CountVectorizer(stop_words='english', min_df=10). It is expressed as
        None so that one instance, created when the function is defined, is
        not shared and re-fitted across calls.

    Raises
    ------
    NotImplementedError
        Always, until an implementation is written.
    """
    raise NotImplementedError("extract_text_features() has not been implemented yet")

In [None]:
# Count vectorizer on cleaned text
# NOTE(review): `corpus` and `dates` are only assigned in cells further down
# this notebook; this cell presumably ran after one of them, or was meant to
# use `editorial_corpus` / `editorial_dates` — confirm.
cv = CountVectorizer(stop_words='english', min_df=10)
# Document-term count matrix.
X = cv.fit_transform(corpus)
# Dense frame with one column per vocabulary term.
# NOTE(review): newer scikit-learn releases provide get_feature_names_out()
# in place of get_feature_names() — check against the installed version.
_ = pd.DataFrame(X.toarray(), columns=cv.get_feature_names())
# Put the post date in front of the term counts, row for row.
df = pd.concat([pd.Series(dates, name='post_date'), _], axis=1)
df.head()

In [None]:
# df.to_csv('../data/exports/editorial_texts_mindf100.csv')

### Lancaster, min-df 40

In [None]:
# Query the `pitm` collection handle; `posts` is bound to a plain list by an
# earlier cell and has no find() method.
editorial_texts = list(pitm.find({}, {'_id': 0, 'editorial_text': 1, 'post_date': 1}))
# Dates and texts are taken from the same documents, so they stay aligned by index.
dates = [p['post_date'] for p in editorial_texts]
corpus = [p['editorial_text'] for p in editorial_texts]
# Clean and Lancaster-stem every editorial text.
corpus = clean_text(corpus, lancaster=True)

In [None]:
# Count vectorizer on cleaned text
# Terms must appear in at least 40 documents to be kept (min_df=40).
cv = CountVectorizer(stop_words='english', min_df=40)
# Document-term count matrix.
X = cv.fit_transform(corpus)
# Dense frame with one column per vocabulary term.
_ = pd.DataFrame(X.toarray(), columns=cv.get_feature_names())
# Put the post date in front of the term counts, row for row.
df = pd.concat([pd.Series(dates, name='post_date'), _], axis=1)
df.head()

In [None]:
# Write the current `df` to CSV; the path is relative to the notebook's working directory.
df.to_csv('../data/exports/editorial_texts_mindf40_lancaster.csv')

### Ngrams (2)

In [None]:
# Query the `pitm` collection handle; `posts` is bound to a plain list by an
# earlier cell and has no find() method.
editorial_texts = list(pitm.find({}, {'_id': 0, 'editorial_text': 1, 'post_date': 1}))
# Dates and texts are taken from the same documents, so they stay aligned by index.
dates = [p['post_date'] for p in editorial_texts]
corpus = [p['editorial_text'] for p in editorial_texts]
# Clean without stemming.
corpus = clean_text(corpus)

In [None]:
# Count vectorizer on cleaned text
# NOTE(review): the section heading says "Ngrams (2)", but ngram_range=(1,5)
# extracts n-grams of one to five words — confirm which is intended.
cv = CountVectorizer(stop_words='english', min_df=100, ngram_range=(1,5))
# Document-term count matrix.
X = cv.fit_transform(corpus)
# Dense frame with one column per extracted n-gram.
_ = pd.DataFrame(X.toarray(), columns=cv.get_feature_names())
# Put the post date in front of the n-gram counts, row for row.
df = pd.concat([pd.Series(dates, name='post_date'), _], axis=1)

In [None]:
# Print every column name of `df`, one per line, to inspect the extracted vocabulary.
for col in df.columns:
    print(col)

In [None]:
# TF-IDF vectorizer on cleaned text
# `TF` is not defined anywhere in this notebook. The keyword arguments and
# the fit_transform / get_feature_names calls match scikit-learn's
# TfidfVectorizer, so it is imported and used under its own name.
# NOTE(review): confirm that TfidfVectorizer is what `TF` stood for.
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer(stop_words='english', min_df=100, ngram_range=(1,5))
# Document-term matrix of TF-IDF weights.
X = cv.fit_transform(corpus)
_ = pd.DataFrame(X.toarray(), columns=cv.get_feature_names())
# Put the post date in front of the weights, row for row.
df = pd.concat([pd.Series(dates, name='post_date'), _], axis=1)

## Latent Semantic Analysis

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Fit a truncated SVD (constructed with 10) on the document-term matrix X
# and project the documents onto its components.
lsa = TruncatedSVD(10)
doc_topic = lsa.fit_transform(X)
# Display the explained-variance ratio of each fitted component.
lsa.explained_variance_ratio_

In [None]:
# One row per fitted component, one column per vocabulary term of `cv`;
# values are the component weights rounded to 3 decimals.
topic_word = pd.DataFrame(lsa.components_.round(3),
             columns = cv.get_feature_names())
topic_word.head()

In [None]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """
    Print the highest-weighted features of every component of a fitted model.

    Parameters
    ----------
    model : object
        Fitted model exposing `components_`, an iterable of per-component
        weight arrays that support `argsort()`.
    feature_names : sequence of str
        Feature name for each column index of the weight arrays.
    no_top_words : int
        How many features to list per component.
    topic_names : sequence of str, optional
        Label per component. A missing sequence or a falsy entry falls back
        to printing the component's index.
    """
    for ix, weights in enumerate(model.components_):
        has_label = bool(topic_names) and bool(topic_names[ix])
        if has_label:
            print("\nTopic: '",topic_names[ix],"'")
        else:
            print("\nTopic ", ix)
        # argsort is ascending; the reversed slice walks back from the
        # largest weight and stops after `no_top_words` entries.
        strongest = weights.argsort()[:-no_top_words - 1:-1]
        top_features = [feature_names[i] for i in strongest]
        print(", ".join(top_features))

In [None]:
display_topics(lsa, cv.get_feature_names(), 5)

In [None]:
# Per-document output of lsa.fit_transform, rounded to 5 decimals and shown as a table.
Vt = pd.DataFrame(doc_topic.round(5))
Vt

## MongoDB Writes

In [None]:
# --------------------------
# Get post labels from HTML
# --------------------------

# Query the `pitm` collection handle; `posts` is bound to a plain list by an
# earlier cell and has no find() or bulk_write() method.
label_docs = list(pitm.find({}, {'raw_post_html': 1, 'post_title': 1, 'labels': 1}))

# The operations are collected under a name other than `requests`, so the
# HTTP library that make_soup() calls is not shadowed by a list.
label_updates = []
for doc in label_docs:
    # Extract blog post labels
    soup = BeautifulSoup(doc['raw_post_html'], 'html.parser')
    labels = [a.text for a in soup.select('.blogger-labels a')]

    # Match on `_id` rather than the title: two posts sharing a title would
    # otherwise both write to whichever of them is found first.
    label_updates.append(
        UpdateOne({'_id': doc['_id']}, {'$set': {'labels': labels}})
    )

# Update MongoDB in one batch instead of one bulk_write() call per post.
# The batch is skipped when there is nothing to write.
if label_updates:
    pitm.bulk_write(label_updates)