## Imports

In [3]:
from bs4 import BeautifulSoup
import requests
import re
import sys
import string
import json
from ast import literal_eval
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
from dateutil.parser import parse
import pdb
from pymongo import MongoClient
from pprint import pprint

## Initialize Mongodb

In [4]:
client = MongoClient()
db = client.polymedia
print("Collections: ",db.list_collection_names())

Collections:  ['subreddit_polyamory', 'temp', 'pitm', 'test']


## Scraping Utility Functions

In [5]:
def escape_ansi(line):
    ansi_escape = re.compile(r'(\x9B|\x1B\[)[0-?]*[ -\/]*[@-~]+')
    return ansi_escape.sub('', line)


def make_soup(url):
    webpage_response = requests.get(url)
    return BeautifulSoup(webpage_response.content, 'html.parser')


def get_all_a_hrefs(soup, selector=''):
    """
    Grabs all href attribute values from a tags
    contained inside given selector.
    """
    links = []
    if len(selector) > 0:
        prefix = selector + ' '
    else:
        prefix = ''
    for a in soup.select(prefix + 'a', href=True):
        links += [a['href']]
    return list(set(links))


def is_link_date_archive(link):
    """
    Identifies links with potential date value in url path.
    """
    # print(link)
    # print(re.search('.+\/[0-9]{4}\/[0-9]{2}\/', link))
    if re.search('.+[0-9]{4}/[0-9]{2}.+', link) is None:
        return False
    else:
        return True


def list_outbound_links(page_url, domain):
    soup = make_soup(page_url)
    links = get_all_a_hrefs(soup)
    return [link for link in links if not((re.search('.+domain.+', link)))]

def get_post_permalinks(archive_url):
    soup = make_soup(archive_url)
    permalinks = soup.select('a[title="permanent link"]')
    return [p['href'] for p in permalinks]

def scrape_posts(permalinks):
    
    cols = []

    # Iterate through posts and create dictionary:
    for p in permalinks:
        
        soup = make_soup(p)
        date_header = soup.select('.date-header')[0]
        post = soup.select('.post')[0]
        
        try:
            comments = soup.select('#comments')[0]
            num_comments = int(comments.select('h4')[0].get_text().split()[0])
            comment_blocks = [c.text.strip()
                      for c in post.find_all(".comment-body")]
        except:
            num_comments = 0
            comment_blocks = []

        post_col = {}

        # Save raw HTML for later, just in case:
        post_col['raw_post_html'] = str(post)
        post_col['raw_comments_html'] = str(comments)
        
        # Save comments:
        post_col['comments'] = comment_blocks
        
        # Post meta:
        post_col['num_comments'] = num_comments
        post_col['post_date_string'] = date_header.get_text()
        post_col['post_date'] = parse(date_header.get_text())
        post_col['post_title'] = post.select('h3.post-title')[0].get_text().strip()

        # Collect all blockquotes from news sources:
        quotes = post.find_all("blockquote")
        quote_texts = [q.get_text().strip()
                  for q in quotes]
        post_col['quotes'] = quote_texts
        

        
        # Remove quotes from the main HTML, leaving blog author's commentary:
        for q in quotes:
            q.extract()
        
        post_col['editorial_text'] = post.text

        cols += [post_col]

    return cols

## Grab All Post URLs from Blog Home Directory, add to MongoDB

In [6]:
# Use homepage as directory.
blog_home = "https://polyinthemedia.blogspot.com/"
blog_domain = "polyinthemedia.blogspot.com"
blog_soup = make_soup(blog_home)

In [7]:
archive_links = get_all_a_hrefs(blog_soup, '.archive-list')
month_archives = [x for x in archive_links if is_link_date_archive(x)]
test_month_link = month_archives[0]
print(test_month_link)

http://polyinthemedia.blogspot.com/2017/09/


In [8]:
all_scraped_posts = []
for i, month_archive in enumerate(month_archives):
    print(f'scraping month archive #{i} of 171')
    permalinks = get_post_permalinks(month_archive)
    print('sample permanlink', permalinks[0])
    print(f"{len(permalinks)} articles to scrape in archive")
    cols = scrape_posts(permalinks)
    print('done with month archive\n')
    all_scraped_posts.extend(cols)

scraping month archive #0 of 171
sample permanlink http://polyinthemedia.blogspot.com/2017/09/cosmopolitan-follows-up-7-signs-you.html
10 articles to scrape in archive
done with month archive

scraping month archive #1 of 171
sample permanlink http://polyinthemedia.blogspot.com/2007/06/five-wives-of-maurice-pinder.html
7 articles to scrape in archive
done with month archive

scraping month archive #2 of 171
sample permanlink http://polyinthemedia.blogspot.com/2015/06/jessica-bennett-talks-back-to-roberts.html
13 articles to scrape in archive
done with month archive

scraping month archive #3 of 171
sample permanlink http://polyinthemedia.blogspot.com/2015/01/at-home-with-polyamory-profile-of.html
9 articles to scrape in archive
done with month archive

scraping month archive #4 of 171
sample permanlink http://polyinthemedia.blogspot.com/2012/03/love-without-boundaries-revolutionary.html
7 articles to scrape in archive
done with month archive

scraping month archive #5 of 171
sample per

done with month archive

scraping month archive #43 of 171
sample permanlink http://polyinthemedia.blogspot.com/2016/01/more-brazil-stories-polyamory.html
13 articles to scrape in archive
done with month archive

scraping month archive #44 of 171
sample permanlink http://polyinthemedia.blogspot.com/2011/02/is-poly-like-big-hearty-buffet.html
15 articles to scrape in archive
done with month archive

scraping month archive #45 of 171
sample permanlink http://polyinthemedia.blogspot.com/2013/10/in-which-our-side-knocks-nasty-pitch.html
11 articles to scrape in archive
done with month archive

scraping month archive #46 of 171
sample permanlink http://polyinthemedia.blogspot.com/2019/05/are-we-moving-towards-society-where.html
8 articles to scrape in archive
done with month archive

scraping month archive #47 of 171
sample permanlink http://polyinthemedia.blogspot.com/2015/09/comic-5-radical-ways-people-do-non.html
11 articles to scrape in archive
done with month archive

scraping month ar

sample permanlink http://polyinthemedia.blogspot.com/2012/09/q-with-polyamory-producer-natalia-garcia.html
10 articles to scrape in archive
done with month archive

scraping month archive #87 of 171
sample permanlink http://polyinthemedia.blogspot.com/2007/04/liberal-utah-polygamists.html
6 articles to scrape in archive
done with month archive

scraping month archive #88 of 171
sample permanlink http://polyinthemedia.blogspot.com/2018/04/so-have-we-arrived-yet.html
11 articles to scrape in archive
done with month archive

scraping month archive #89 of 171
sample permanlink http://polyinthemedia.blogspot.com/2009/06/more-poly-em-portugues.html
10 articles to scrape in archive
done with month archive

scraping month archive #90 of 171
sample permanlink http://polyinthemedia.blogspot.com/2016/08/says-rt-right-to-free-love-activists.html
9 articles to scrape in archive
done with month archive

scraping month archive #91 of 171
sample permanlink http://polyinthemedia.blogspot.com/2012/10/th

done with month archive

scraping month archive #130 of 171
sample permanlink http://polyinthemedia.blogspot.com/2013/01/more-poly-college-voices.html
11 articles to scrape in archive
done with month archive

scraping month archive #131 of 171
sample permanlink http://polyinthemedia.blogspot.com/2014/08/all-39-nonfiction-books-on-modern.html
12 articles to scrape in archive
done with month archive

scraping month archive #132 of 171
sample permanlink http://polyinthemedia.blogspot.com/2007/10/transqueer-triad-meets-tabloid-diet-mag.html
6 articles to scrape in archive
done with month archive

scraping month archive #133 of 171
sample permanlink http://polyinthemedia.blogspot.com/2017/04/making-gig-out-of-chronicling-your-poly.html
6 articles to scrape in archive
done with month archive

scraping month archive #134 of 171
sample permanlink http://polyinthemedia.blogspot.com/2019/03/on-opposite-side-of-earth-is-polyamory.html
7 articles to scrape in archive
done with month archive

scrap

In [115]:
unsouped = []
for post in all_scraped_posts:
    post['raw_post_html'] = str(post['raw_post_html'])
    post['raw_comments_html'] = str(post['raw_comments_html'])
    unsouped += [post]

In [116]:
pd.DataFrame(unsouped).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 9 columns):
raw_post_html        1470 non-null object
raw_comments_html    1470 non-null object
num_comments         1470 non-null int64
post_date_string     1470 non-null object
post_date            1470 non-null datetime64[ns]
post_title           1470 non-null object
quotes               1470 non-null object
editorial_text       1470 non-null object
_id                  1470 non-null object
dtypes: datetime64[ns](1), int64(1), object(7)
memory usage: 103.5+ KB


In [9]:
len(all_scraped_posts)

1470

In [10]:
all_scraped_posts[0]

{'raw_post_html': '<div class="post"><a name="2413074731950245844"></a>\n<h3 class="post-title">\n                      \t \n                      \t Cosmopolitan follows up: "7 Signs You Might Be Hard-Wired for Monogamy"  \n\t                       \n                          </h3>\n<div class="post-body">\n<p><div style="clear:both;"></div><br>\n"Relationship choice" is the theme that the poly movement\'s activists, movers, and shakers firmly adopted several years ago as their guiding star. Poly is right for some, monogamy is right for others; no shame. The crucial thing is to figure out what\'s right for you, and look for partners who actually match.<br>\n<br>\nWhich means telling about yourself, and asking a new person about their feelings on the matter, very early. And explain exactly what you mean by polyamory, and what version of it draws you, because people often have their own assumptions and misunderstandings of the word.<br>\n<br>\nBut first you need to figure <i>yourself </

In [8]:
d = scrape_posts(['https://polyinthemedia.blogspot.com/2012/08/modern-poly-gaining-traction-in-south.html'])[0]
d['editorial_text']

'Modern poly gaining traction in South Africa\n\t                       \n                          \n\nCity Press (South Africa)The small community of polyfolks in South Africa, a country of 49 million, has shown how dedicated people can spread a compelling idea.Their last media splash was in the South African edition of Cosmopolitan. Now City Press, a mass newspaper (readership 2.5 million) aimed at the country\'s 80% black population, presents a fine poly-profile article by the same writer. It covers all the right bases.Modern, egalitarian, gender-neutral polyamory is a noteworthy introduction to Africa for its contrasts to traditional patriarchal polygamy, an ancient fact of life in many regions. South African President Jacob Zuma, for instance, has four wives. This is legal and is generally considered acceptable or only mildly embarrassing.South Africa is the only country on the continent with a visible polyamory movement as far as I can tell.Read the whole article (July 29, 2012)

In [10]:
d = scrape_posts(['https://polyinthemedia.blogspot.com/2012/08/modern-poly-gaining-traction-in-south.html'])[0]
d['editorial_text']

'\n\n                      \t \n                      \t Modern poly gaining traction in South Africa\n\t                       \n                          \n\nCity Press (South Africa)The small community of polyfolks in South Africa, a country of 49 million, has shown how dedicated people can spread a compelling idea.Their last media splash was in the South African edition of Cosmopolitan. Now City Press, a mass newspaper (readership 2.5 million) aimed at the country\'s 80% black population, presents a fine poly-profile article by the same writer. It covers all the right bases.Modern, egalitarian, gender-neutral polyamory is a noteworthy introduction to Africa for its contrasts to traditional patriarchal polygamy, an ancient fact of life in many regions. South African President Jacob Zuma, for instance, has four wives. This is legal and is generally considered acceptable or only mildly embarrassing.South Africa is the only country on the continent with a visible polyamory movement as 

## Load into MongoDB

In [14]:
db.list_collection_names()

['subreddit_polyamory',
 'subreddit_relationships',
 'temp',
 'pitm_2',
 'pitm',
 'test']

In [15]:
db.pitm_2.insert_many(all_scraped_posts)

<pymongo.results.InsertManyResult at 0x12ca56c30>