In [1]:
import os
import re
import xml.etree.ElementTree as ET
import json
from pathlib import Path
from bs4 import BeautifulSoup

In [2]:
from pymongo import MongoClient
client = MongoClient()
db = client.stackoverflow
posts = db.posts

In [3]:
def sanitise_xml(s):
    """
    Use BeautifulSoup to extract all text within HTML/XML tags, except those in <code> blocks.
    Remove the line breaks, replace unicode character for space
    
    Parameters
    ----------
    s - UTF-8 string
    
    Returns
    -------
    Cleaned up string    
    """
    soup = BeautifulSoup(s, 'lxml')
    for tag in soup.find_all('code'):
        tag.replace_with('')
        
    return soup.get_text().replace('\n', '').replace('\xa0', ' ')

In [4]:
def preprocess_text(data):
    """
    Clean the text body of the question/answers by removing HTML tags and <code> blocks,
    and split the Tags into a list.
    
    Parameter
    ---------
    data - dictionary, data extracted from an XML line using ElementTree
    
    Returns
    -------
    Processed dictionary
    """
    if 'Body' in data:
        data['Body'] = sanitise_xml(data['Body'])
    
    if 'Tags' in data:
        data['Tags'] = re.findall('\<([^>]+)', data['Tags'])
        
    return data

In [5]:
def insert_posts(filename):
    """
    Preprocess posts in the file and insert into MongoDB database
    """
    new_posts = []
    with open(filename) as f:
        for line in f:
            data = preprocess_text(ET.fromstring(line.strip()).attrib)
            new_posts.append(data)
    result = posts.insert_many(new_posts)    

## Loop over the files and insert into the database
Make sure you check how long it takes. Unfortunately it takes around 90 second per file, which takes about 7.5 hours.

In [6]:
import time

In [7]:
PATH = Path('../data/raw/posts')

In [9]:
init_time = time.time()
for file in PATH.iterdir():
    print(file)
    time0 = time.time()
    insert_posts(file)
    print("Time taken to insert = {} seconds".format(time.time()-time0))
    print("Time taken so far = {} seconds".format(time.time() - init_time))

../data/posts/Posts72
Time taken to insert = 62.35699772834778 seconds
Time taken so far = 62.358773708343506 seconds
../data/posts/Posts9092
Time taken to insert = 58.98280334472656 seconds
Time taken so far = 121.34172558784485 seconds
../data/posts/Posts9083
Time taken to insert = 61.50496959686279 seconds
Time taken so far = 182.8468246459961 seconds
../data/posts/Posts9042
Time taken to insert = 62.247971534729004 seconds
Time taken so far = 245.09529161453247 seconds
../data/posts/Posts9010
Time taken to insert = 66.21701550483704 seconds
Time taken so far = 311.3124649524689 seconds
../data/posts/Posts82
Time taken to insert = 63.30579233169556 seconds
Time taken so far = 374.6184194087982 seconds
../data/posts/Posts9003
Time taken to insert = 83.5098967552185 seconds
Time taken so far = 458.1289904117584 seconds
../data/posts/Posts18
Time taken to insert = 71.34315466880798 seconds
Time taken so far = 529.4723191261292 seconds
../data/posts/Posts9218


  ' that document to Beautiful Soup.' % decoded_markup


Time taken to insert = 21.43980383872986 seconds
Time taken so far = 550.912544965744 seconds
../data/posts/Posts84
Time taken to insert = 64.24312853813171 seconds
Time taken so far = 615.1559956073761 seconds
../data/posts/Posts9181
Time taken to insert = 55.97855854034424 seconds
Time taken so far = 671.1347186565399 seconds
../data/posts/Posts9158
Time taken to insert = 58.63131642341614 seconds
Time taken so far = 729.7663683891296 seconds
../data/posts/Posts9173
Time taken to insert = 58.5084593296051 seconds
Time taken so far = 788.2750000953674 seconds
../data/posts/Posts85
Time taken to insert = 60.10047149658203 seconds
Time taken so far = 848.3756337165833 seconds
../data/posts/Posts9066
Time taken to insert = 58.36885213851929 seconds
Time taken so far = 906.744645357132 seconds
../data/posts/Posts9146
Time taken to insert = 61.91989064216614 seconds
Time taken so far = 968.6647276878357 seconds
../data/posts/Posts21
Time taken to insert = 73.39422178268433 seconds
Time tak

KeyboardInterrupt: 