## Resume Data Parsing
#### Resumes (300MB)


### Processes HTML Resumes into Training Sets





<a id="setup"></a>
### Setup

Let's import the data from a downloaded source.



In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries
import re
import string
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, json
import unicodedata
from collections import Counter

# NLTK library for stop word removal
from nltk.corpus import stopwords

# SK-learn libraries for learning.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import log_loss

from xml.dom import minidom
from bs4 import BeautifulSoup as BS




In [2]:
# Directory prefix that is prepended to every XML file name before parsing.
path_to_json = 'data/'
# Accumulates the text of each <raw_html> element (one entry per resume).
raw_xml = []


In [4]:
# Parse one XML dump by name and collect the text of every <raw_html> element.
tree = minidom.parse(path_to_json + 'indeed_com-job_deduped_n_merged_20170315_201357376193103.xml')  
items = tree.getElementsByTagName('raw_html')

# Only the first child node's data is kept for each element.
for item in items:
    raw_xml.append(item.firstChild.data)

# Sanity check: number of resumes collected so far.
print(len(raw_xml))

50101


In [5]:
# Print the first two raw HTML resumes to inspect the markup structure.
for i in range(2):
    print(raw_xml[i])

<div id="resume_body" class="vcard single_form-content">
<div id="basic_info_row" class="last basicInfo-content"><div id="basic_info_cell" class="data_display">
<h1 id="resume-contact" class="fn " itemprop="name">Electrical Controls Engineer</h1>
<h2 id="headline" itemprop="jobTitle">Electrical &amp; Controls Engineer</h2>
<div id="contact_info_container">
<div class="adr" itemprop="address" itemscope itemtype="http://schema.org/PostalAddress"><p id="headline_location" class="locality" itemprop="addressLocality">Phillipsburg, NJ</p></div>
<div class="separator-hyphen">-</div>
</div>
<p id="res_summary" class="summary">To secure a Sr. Electrical &amp; Controls position</p>
<p id="employment_eligibility">Authorized to work in the US for any employer</p>
</div></div>
<div class="section-item workExperience-content">
<div><div class="section_title"><h2>Work Experience</h2></div></div>
<div id="work-experience-items" class="items-container">
<div id="workExperience-EeVtAljMsUOBTllI6yVEOA" c

In [7]:
# Look at the <p id="res_summary"> element of the first 20 resumes;
# resumes without such an element print nothing.
for i in range(20):
    soup = BS(raw_xml[i], "lxml")
    summary = soup.findAll("p", {"id": "res_summary"})
    if (summary):
        print(summary)


[<p class="summary" id="res_summary">To secure a Sr. Electrical &amp; Controls position</p>]
[<p class="summary" id="res_summary">To obtain a position with a dental practice as a Registered Dental Hygienist.</p>]
[<p class="summary" id="res_summary">To secure a position and contribute to your well-established company's success through the efficient use of my previous experience and skills.</p>]
[<p class="summary" id="res_summary">Edit Job Categories: Customer Service (14 Years experience) Total years experience: 14 Years Company Information Company Name:</p>]
[<p class="summary" id="res_summary">To be employed by a progressive organization that presents the opportunity to function in all capacities of <br/>engineering, manufacturing, installation, and customer service in the paper, metals, food, marine, ship building, or <br/>pollution control industries.</p>]
[<p class="summary" id="res_summary">Self motivated worker with extensive experience in construction &amp; masonry field.Core 

In [3]:
# Module-level accumulators filled by the find* helpers below.
companies = {}        # company name -> list of job titles seen at that company
universities = {}     # school name -> list of degree titles seen at that school
certifications = {}   # certification title -> number of occurrences
skills_lump = []      # strings returned by findSkills, one per data_display block
# NOTE: re-running this cell discards any resumes already loaded into raw_xml.
raw_xml = []

# Number of resumes processed by read_raw_xml_and_push_test().
number_resumes = 10

def findWorkTitlesCompanies(item):
    """Record the job title of one ``data_display`` block under its company.

    The block is used only when its first ``<p>`` carries the ``work_title``
    class.  For every ``div.work_company`` inside the block whose first
    ``<span>`` has the ``bold`` class, the job title is appended to
    ``companies[company_name]``.

    Parameters
    ----------
    item : bs4 tag
        One ``div.data_display`` element of a resume.

    Notes
    -----
    * Names are converted to plain ``str`` before being stored.  Keeping the
      BeautifulSoup string objects would keep a reference to the whole parse
      tree of every processed resume.
    * Companies whose name cannot be read (``.string`` is ``None``) are
      skipped instead of being stored under a ``None`` key, mirroring the
      guard used in ``findEducation``.
    * The job title is stored as ``None`` when the title paragraph has no
      single string.
    """
    paragraph = item.find('p')
    if not paragraph:
        return
    if 'work_title' not in (paragraph.get('class') or []):
        return

    title = paragraph.string
    job_title_name = str(title) if title is not None else None

    for comp in item.findAll("div", {"class": "work_company"}):
        name_span = comp.find('span')
        # Only the first span counts, and only when it is the bold one.
        if not name_span or 'bold' not in (name_span.get('class') or []):
            continue
        company_name = name_span.string
        if company_name is None:
            continue
        companies.setdefault(str(company_name), []).append(job_title_name)
                        
# prune out high schools          
def findEducation(item):
    """Record the degree of one ``data_display`` block under its school.

    The block is used only when its first ``<p>`` carries the ``edu_title``
    class.  For every ``div.edu_school`` inside the block whose first
    ``<span>`` has the ``bold`` class, the degree title is appended to
    ``universities[school_name]``.  Schools without a readable name and
    schools whose name contains "high school" (case-insensitive) are skipped.

    Parameters
    ----------
    item : bs4 tag
        One ``div.data_display`` element of a resume.

    Notes
    -----
    Names are converted to plain ``str`` before being stored; keeping the
    BeautifulSoup string objects would keep a reference to the whole parse
    tree of every processed resume.  The degree is stored as ``None`` when
    the title paragraph has no single string.
    """
    paragraph = item.find('p')
    if not paragraph:
        return
    if 'edu_title' not in (paragraph.get('class') or []):
        return

    degree = paragraph.string
    degree_name = str(degree) if degree is not None else None

    for school in item.findAll("div", {"class": "edu_school"}):
        name_span = school.find('span')
        # Only the first span counts, and only when it is the bold one.
        if not name_span or 'bold' not in (name_span.get('class') or []):
            continue
        university_name = name_span.string
        if not university_name or 'high school' in university_name.lower():
            continue
        universities.setdefault(str(university_name), []).append(degree_name)
                        

def findCertification(item):
    """Count the certification named in one ``data_display`` block.

    The block is used only when its first ``<p>`` carries the
    ``certification_title`` class; the paragraph's string is then used as the
    key whose counter in ``certifications`` is incremented.

    Parameters
    ----------
    item : bs4 tag
        One ``div.data_display`` element of a resume.

    Notes
    -----
    Paragraphs without a single readable string are skipped rather than
    counted under a ``None`` key, and the key is stored as a plain ``str`` so
    the parse tree of the resume is not kept alive by the dictionary.
    """
    paragraph = item.find('p')
    if not paragraph:
        return
    if 'certification_title' not in (paragraph.get('class') or []):
        return

    cert = paragraph.string
    if cert is None:
        return
    cert_name = str(cert)
    certifications[cert_name] = certifications.get(cert_name, 0) + 1

def findSkills(item):
    """Return the text of a class-less first paragraph, followed by a space.

    An empty string is returned when the block has no ``<p>``, when the first
    ``<p>`` has a ``class`` attribute, or when it has no single string.
    """
    if not item.find('p'):
        return ""
    first_paragraph = item.p
    # Paragraphs with a class are titles/dates of other sections, not skills.
    if first_paragraph.get('class') or first_paragraph.string is None:
        return ""
    return first_paragraph.string + " "
            
    
def writeToFile(name_, data_):
    """Serialize ``data_`` as JSON into the file ``<name_>.json``.

    The file is opened in text write mode, so an existing file with the same
    name is overwritten.
    """
    target_path = name_ + '.json'
    with open(target_path, 'w') as json_file:
        json.dump(data_, json_file)

def read_raw_xml_and_push():
    """Run all find* extractors over every resume currently in ``raw_xml``.

    Each resume is parsed with the lxml parser and every ``div.data_display``
    block is passed to the work, education, certification and skills helpers.
    A progress line is printed for every 500th resume (starting with index 0).
    """
    print('Starting processing: ' + str(len(raw_xml)) + " resumes.")
    for position, resume_html in enumerate(raw_xml):
        parsed = BS(resume_html, "lxml")
        for block in parsed.findAll("div", {"class": "data_display"}):
            findWorkTitlesCompanies(block)
            findEducation(block)
            findCertification(block)
            skills_lump.append(findSkills(block))

        if position % 500 == 0:
            print("Processed: " + str(position) + " resumes.")

def read_raw_xml_and_push_test():
    """Run the find* extractors over the first ``number_resumes`` resumes only.

    Small-sample variant of ``read_raw_xml_and_push`` for quick checks.

    Notes
    -----
    * Skills are appended to the module-level ``skills_lump`` list, matching
      ``read_raw_xml_and_push``.  The previous ``skills_lump = skills_lump +
      ...`` rebound the name locally, which raises ``UnboundLocalError``.
    * Slicing is used so that fewer than ``number_resumes`` loaded resumes do
      not raise ``IndexError``.
    """
    for resume_html in raw_xml[:number_resumes]:
        soup = BS(resume_html, "lxml")
        for item in soup.findAll("div", {"class": "data_display"}):
            findWorkTitlesCompanies(item)
            findEducation(item)
            findCertification(item)
            skills_lump.append(findSkills(item))

def read_next_xml(filename):
    """Replace ``raw_xml`` with the resumes stored in ``filename``.

    The file is looked up under ``path_to_json``; the data of the first child
    node of every ``<raw_html>`` element becomes one entry of ``raw_xml``.
    A summary line with the number of entries is printed at the end.
    """
    global raw_xml
    # Reset first so a failed parse does not leave the previous file's data.
    raw_xml = []
    document = minidom.parse(path_to_json + filename)
    raw_xml.extend(
        node.firstChild.data for node in document.getElementsByTagName('raw_html')
    )

    print("Read: " + filename + " with elements:"+ str(len(raw_xml)))

#print(companies)
#print(universities)
#print(certifications)
#print(skills_lump)
                        



In [6]:
# Start from empty accumulators so the files written below contain only the
# XML dumps processed in this cell.
companies = {}
universities = {}
certifications = {}
skills_lump = []

# Dumps that are currently disabled for this run.
# read_next_xml('indeed_com-job_deduped_n_merged_20170315_201357376193103.xml')
# read_raw_xml_and_push()
# read_next_xml('indeed_com-job_deduped_n_merged_20170315_202935841620141.xml')
# read_raw_xml_and_push()
# read_next_xml('indeed_com-job_deduped_n_merged_20170315_201536923698467.xml')
# read_raw_xml_and_push()
# read_next_xml('indeed_com-job_deduped_n_merged_20170315_203123248972746.xml')
# read_raw_xml_and_push()
# read_next_xml('indeed_com-job_deduped_n_merged_20170315_201723769342220.xml')
# read_raw_xml_and_push()

# Each read_next_xml() call replaces raw_xml, so every dump is processed
# right after it is loaded.
read_next_xml('indeed_com-job_deduped_n_merged_20170315_203311012815474.xml')
read_raw_xml_and_push()
read_next_xml('indeed_com-job_deduped_n_merged_20170315_201906034655968.xml')
read_raw_xml_and_push()
read_next_xml('indeed_com-job_deduped_n_merged_20170315_203459248507509.xml')
read_raw_xml_and_push()
read_next_xml('indeed_com-job_deduped_n_merged_20170315_202047762763754.xml')
read_raw_xml_and_push()
# read_next_xml('indeed_com-job_deduped_n_merged_20170315_203646360307105.xml')
# read_raw_xml_and_push()


# Persist the accumulated results as <name>.json files.
writeToFile('companies2', companies)
writeToFile('universities2', universities)
writeToFile('certifications2', certifications)
writeToFile('skills_lump2', skills_lump)

# read_next_xml('indeed_com-job_deduped_n_merged_20170315_203311012815474.xml')
# read_raw_xml_and_push()
# read_next_xml('indeed_com-job_deduped_n_merged_20170315_201906034655968.xml')
# read_raw_xml_and_push()
# read_next_xml('indeed_com-job_deduped_n_merged_20170315_203459248507509.xml')
# read_raw_xml_and_push()
# read_next_xml('indeed_com-job_deduped_n_merged_20170315_202047762763754.xml')
# read_raw_xml_and_push()
# read_next_xml('indeed_com-job_deduped_n_merged_20170315_203646360307105.xml')
# read_raw_xml_and_push()
# read_next_xml('indeed_com-job_deduped_n_merged_20170315_202231085985935.xml')
# read_raw_xml_and_push()
# read_next_xml('indeed_com-job_deduped_n_merged_20170315_203831771961790.xml')
# read_raw_xml_and_push()
# read_next_xml('indeed_com-job_deduped_n_merged_20170315_202422140439578.xml')
# read_raw_xml_and_push()
# read_next_xml('indeed_com-job_deduped_n_merged_20170315_204015128749914.xml')
# read_raw_xml_and_push()
# read_next_xml('indeed_com-job_deduped_n_merged_20170315_202604394716786.xml')
# read_raw_xml_and_push()
# read_next_xml('indeed_com-job_deduped_n_merged_20170315_204200995390445.xml')
# read_raw_xml_and_push()
# read_next_xml('indeed_com-job_deduped_n_merged_20170315_202751428419232.xml')
# read_raw_xml_and_push()


# writeToFile('companies', companies)
# writeToFile('universities', universities)
# writeToFile('certifications', certifications)
# writeToFile('skills_lump', skills_lump)


Read: indeed_com-job_deduped_n_merged_20170315_203311012815474.xml with elements:50154
Starting processing: 50154 resumes.
Processed: 0 resumes.
Processed: 500 resumes.
Processed: 1000 resumes.
Processed: 1500 resumes.
Processed: 2000 resumes.
Processed: 2500 resumes.
Processed: 3000 resumes.
Processed: 3500 resumes.
Processed: 4000 resumes.
Processed: 4500 resumes.
Processed: 5000 resumes.
Processed: 5500 resumes.
Processed: 6000 resumes.
Processed: 6500 resumes.
Processed: 7000 resumes.
Processed: 7500 resumes.
Processed: 8000 resumes.
Processed: 8500 resumes.
Processed: 9000 resumes.
Processed: 9500 resumes.
Processed: 10000 resumes.
Processed: 10500 resumes.
Processed: 11000 resumes.
Processed: 11500 resumes.
Processed: 12000 resumes.
Processed: 12500 resumes.
Processed: 13000 resumes.
Processed: 13500 resumes.
Processed: 14000 resumes.
Processed: 14500 resumes.
Processed: 15000 resumes.
Processed: 15500 resumes.
Processed: 16000 resumes.
Processed: 16500 resumes.
Processed: 17000 

Read: indeed_com-job_deduped_n_merged_20170315_202047762763754.xml with elements:50054
Starting processing: 50054 resumes.
Processed: 0 resumes.
Processed: 500 resumes.
Processed: 1000 resumes.
Processed: 1500 resumes.
Processed: 2000 resumes.
Processed: 2500 resumes.
Processed: 3000 resumes.
Processed: 3500 resumes.
Processed: 4000 resumes.
Processed: 4500 resumes.
Processed: 5000 resumes.
Processed: 5500 resumes.
Processed: 6000 resumes.
Processed: 6500 resumes.
Processed: 7000 resumes.
Processed: 7500 resumes.
Processed: 8000 resumes.
Processed: 8500 resumes.
Processed: 9000 resumes.
Processed: 9500 resumes.
Processed: 10000 resumes.
Processed: 10500 resumes.
Processed: 11000 resumes.
Processed: 11500 resumes.
Processed: 12000 resumes.
Processed: 12500 resumes.
Processed: 13000 resumes.
Processed: 13500 resumes.
Processed: 14000 resumes.
Processed: 14500 resumes.
Processed: 15000 resumes.
Processed: 15500 resumes.
Processed: 16000 resumes.
Processed: 16500 resumes.
Processed: 17000 

In [4]:
# Load a single dump into raw_xml for the training-set extraction below.
read_next_xml('indeed_com-job_deduped_n_merged_20170315_201357376193103.xml')


Read: indeed_com-job_deduped_n_merged_20170315_201357376193103.xml with elements:50101


In [61]:
# Records of the form {'data': <resume markup as str>, 'predictions': <labels>}.
# Only train_xml_set is filled by the functions visible in this notebook section.
train_xml_set = []
dev_xml_set = []
test_xml_set = []

# Same records, but 'data' holds the NFKD-normalized plain text of the resume.
train_text_set = []
dev_text_set = []
test_text_set = []

# Incremented by read_raw_xml_and_extract_first for resumes without usable labels.
incompleteCount = 0

def get_predicted_values_for_training(xml_snip):
    """Build the label record for one work-experience section.

    Parameters
    ----------
    xml_snip : bs4 tag or None
        A ``div.work-experience-section`` element, or a falsy value when the
        resume has none.

    Returns
    -------
    dict or list
        ``{'work_title': str, 'work_company': str, 'work_stayed': bool}`` when
        the section has a title, a dates string and a company name;
        otherwise an empty list.  ``work_stayed`` is ``True`` when the dates
        string contains "to Present".

    Notes
    -----
    * Both text fields are returned as plain ``str``.  Returning the
      BeautifulSoup string for the company kept a reference to the whole
      parse tree in every record that was stored or pickled.
    * A dates paragraph without a single string is treated like a missing
      one (empty list) instead of raising ``TypeError``.
    * The work description is intentionally not part of the record.
    """
    if not xml_snip:
        return []

    title_tag = xml_snip.find("p", {"class": "work_title"})
    if not title_tag:
        return []
    work_title = title_tag.string

    dates_tag = xml_snip.find("p", {"class": "work_dates"})
    if not dates_tag:
        return []
    dates_text = dates_tag.string
    if dates_text is None:
        return []
    work_stayed = "to Present" in dates_text

    company_tag = xml_snip.find("div", {"class": "work_company"})
    if not company_tag:
        return []
    company_span = company_tag.find("span", {"class": "bold"})
    work_company = company_span.string if company_span else None

    if not (work_title and work_company):
        return []

    return {
        'work_title': str(work_title),
        'work_company': str(work_company),
        'work_stayed': work_stayed,
    }

def read_raw_xml_and_extract_similar_jobtitle(go_through_all = 0):
    """Build training records for the "similar job title" task.

    For each of the first ``go_through_all`` resumes in ``raw_xml`` (all of
    them when the argument is 0), resumes with more than one work-experience
    section are labelled with ``get_predicted_jobtitles``.  When labels are
    available, the title paragraph of the first section is blanked out and the
    resume is appended to ``train_xml_set`` (markup) and ``train_text_set``
    (NFKD-normalized text).  A progress line is printed for every resume.
    """
    limit = go_through_all if go_through_all else len(raw_xml)
    section_query = {"class": "work-experience-section"}

    for index in range(limit):
        parsed = BS(raw_xml[index], "lxml")
        for resume in parsed.findAll("div", {"class": "single_form-content"}):
            print('Starting processing: ' + str(index+1) + " resumes")

            sections = resume.findAll("div", section_query)
            if not sections or len(sections) <= 1:
                continue

            predictions = get_predicted_jobtitles(sections)
            if len(predictions) == 0:
                continue

            newest_section = resume.find("div", section_query)
            if not newest_section:
                continue

            # Hide the current job title so it cannot leak into the features.
            newest_section.find("p", {"class": "work_title"}).replace_with("")

            train_xml_set.append({'data': str(resume), 'predictions': predictions})

            plain_text = unicodedata.normalize("NFKD", resume.get_text())
            train_text_set.append({'data': plain_text, 'predictions': predictions})

# remove everything after to for first job
# if it has 'to present' they stayed
        
def read_raw_xml_and_extract_first(go_through_all = 0):
    """Build training records labelled from the first work-experience section.

    For each of the first ``go_through_all`` resumes in ``raw_xml`` (all of
    them when the argument is 0), the first work-experience section is
    labelled with ``get_predicted_values_for_training``.  Resumes without
    labels increase ``incompleteCount``.  For labelled resumes the dates
    paragraph of that section is replaced by the text before the first
    " to ", and the resume is appended to ``train_xml_set`` (markup) and
    ``train_text_set`` (NFKD-normalized text).  A progress line is printed
    for every resume.
    """
    global incompleteCount

    limit = go_through_all if go_through_all else len(raw_xml)

    for index in range(limit):
        parsed = BS(raw_xml[index], "lxml")
        for resume in parsed.findAll("div", {"class": "single_form-content"}):
            print('Starting processing: ' + str(index+1) + " resumes")

            newest_section = resume.find("div", {"class": "work-experience-section"})
            predictions = get_predicted_values_for_training(newest_section)
            if len(predictions) == 0:
                incompleteCount += 1
                continue

            dates_tag = newest_section.find("p", {"class": "work_dates"})
            if dates_tag:
                # Keep only the start date so the end date cannot leak the label.
                start_only = dates_tag.string.split(' to ', 1)[0]
                dates_tag.replace_with(start_only)

            train_xml_set.append({'data': str(resume), 'predictions': predictions})

            plain_text = unicodedata.normalize("NFKD", resume.get_text())
            train_text_set.append({'data': plain_text, 'predictions': predictions})

def get_predicted_education(xml_snip):
    """Derive degree labels from the education part of a resume.

    Parameters
    ----------
    xml_snip : bs4 tag or None
        Element containing ``p.edu_title`` and ``div.edu_school`` children,
        or a falsy value when the resume has no education section.

    Returns
    -------
    dict
        ``{'master': bool, 'bachelor': bool}``.  ``master`` is set when a
        degree title mentions "master" or "phd"; ``bachelor`` is set when a
        title mentions "bachel", "master" or "phd", or when a school name
        mentions "universi" or "college".  All matches are case-insensitive.

    Notes
    -----
    Degree titles without a single string (``.string`` is ``None``) are
    skipped; previously they raised ``AttributeError``.
    """
    edu_unit = {'master': False, 'bachelor': False}
    if not xml_snip:
        return edu_unit

    for title_tag in xml_snip.findAll("p", {"class": "edu_title"}):
        if title_tag.string is None:
            continue
        title = title_tag.string.lower()
        is_graduate = 'master' in title or 'phd' in title
        if is_graduate:
            edu_unit['master'] = True
        # A graduate degree implies that a bachelor's degree was earned too.
        if is_graduate or 'bachel' in title:
            edu_unit['bachelor'] = True

    for school_tag in xml_snip.findAll("div", {"class": "edu_school"}):
        name_span = school_tag.find("span", {"class": "bold"})
        if not name_span or not name_span.string:
            continue
        school = name_span.string.lower()
        if 'universi' in school or 'college' in school:
            edu_unit['bachelor'] = True

    return edu_unit

def read_raw_xml_and_extract_education(go_through_all = 0):
    """Build training records for the education-level task.

    For each of the first ``go_through_all`` resumes in ``raw_xml`` (all of
    them when the argument is 0), labels are derived from the element with
    id ``education-items`` via ``get_predicted_education``.  The element with
    class ``education-content`` is then removed from the resume, and the
    resume is appended to ``train_xml_set`` (markup) and ``train_text_set``
    (NFKD-normalized text).  A progress line is printed for every resume.
    """
    limit = go_through_all if go_through_all else len(raw_xml)

    for index in range(limit):
        parsed = BS(raw_xml[index], "lxml")
        for resume in parsed.findAll("div", {"class": "single_form-content"}):
            print('Starting processing: ' + str(index+1) + " resumes")

            predictions = get_predicted_education(
                resume.find("div", {"id": "education-items"})
            )

            # Drop the education section so the labels are not in the features.
            education_block = resume.find("div", {"class": "education-content"})
            if education_block:
                education_block.extract()

            train_xml_set.append({'data': str(resume), 'predictions': predictions})

            plain_text = unicodedata.normalize("NFKD", resume.get_text())
            train_text_set.append({'data': plain_text, 'predictions': predictions})

def levenshtein(s1, s2):
    """Return the Levenshtein edit distance between two sequences.

    The distance counts single-element insertions, deletions and
    substitutions.  Only two rows of the dynamic-programming table are kept,
    and the shorter sequence is used for the row width.
    """
    # Make sure `longer` is at least as long as `shorter`.
    if len(s1) >= len(s2):
        longer, shorter = s1, s2
    else:
        longer, shorter = s2, s1

    if len(shorter) == 0:
        return len(longer)

    above = list(range(len(shorter) + 1))
    for row_number, char_long in enumerate(longer, start=1):
        row = [row_number]
        for col, char_short in enumerate(shorter):
            cost_insert = above[col + 1] + 1
            cost_delete = row[col] + 1
            cost_substitute = above[col] + (char_long != char_short)
            row.append(min(cost_insert, cost_delete, cost_substitute))
        above = row

    return above[-1]

def get_predicted_jobtitles(past_jobs):
    """Compare the titles of the first two work-experience sections.

    Parameters
    ----------
    past_jobs : sequence of bs4 tags
        Work-experience sections of one resume, in document order.

    Returns
    -------
    dict or list
        An empty list unless both of the first two sections have a title
        longer than two characters.  Otherwise a dict with

        * ``similar_job`` - ``True`` when the cleaned titles share a word,
        * ``similar_metric`` - Levenshtein distance between the cleaned titles,
        * ``similar_job_titles`` - the two cleaned titles.

    Notes
    -----
    Titles are lower-cased and every character other than a letter or a space
    is replaced by a space.  The previous code passed the character class to
    ``str.replace``, which searches for the literal text ``[^a-zA-Z ]`` and
    therefore never cleaned anything; ``re.sub`` applies it as a pattern.
    """
    work_unit = {}
    work_unit['similar_job'] = False
    work_unit['similar_metric'] = 0.0
    work_unit['similar_job_titles'] = []

    raw_titles = ["", ""]
    if past_jobs:
        for index, item in enumerate(past_jobs):
            if index > 1:
                break
            title_tag = item.find("p", {"class": "work_title"})
            title = title_tag.string if title_tag else None
            if title and len(title) > 2:
                raw_titles[index] = title

    first_job, second_job = raw_titles
    if not (first_job and second_job):
        return []

    first_clean = re.sub(r'[^a-zA-Z ]', ' ', first_job.lower())
    second_clean = re.sub(r'[^a-zA-Z ]', ' ', second_job.lower())

    second_tokens = set(second_clean.split())
    work_unit['similar_job'] = any(
        token in second_tokens for token in first_clean.split()
    )
    work_unit['similar_metric'] = levenshtein(first_clean, second_clean)
    work_unit['similar_job_titles'] = [first_clean, second_clean]

    return work_unit

def get_predicted_work_experience(past_jobs, cutoff_year=2007):
    """Flag resumes whose work history mentions a year before ``cutoff_year``.

    Parameters
    ----------
    past_jobs : sequence of bs4 tags
        Work-experience sections of one resume.
    cutoff_year : int, optional
        A four-digit year (19xx or 20xx) strictly below this value sets the
        flag.  Defaults to 2007.

    Returns
    -------
    dict or list
        ``{'years': bool}``, or an empty list when ``past_jobs`` is empty.

    Notes
    -----
    Sections without a dates paragraph, or whose dates paragraph has no
    single string, are skipped; the latter previously raised ``TypeError``
    inside ``re.findall``.
    """
    if not past_jobs:
        return []

    work_unit = {'years': False}
    year_pattern = re.compile(r'((19|20)[0-9]{2})')

    for item in past_jobs:
        dates_tag = item.find("p", {"class": "work_dates"})
        dates_text = dates_tag.string if dates_tag else None
        if not dates_text:
            continue
        # group(1) is the full four-digit year; group(2) is only the century.
        if any(int(match.group(1)) < cutoff_year
               for match in year_pattern.finditer(dates_text)):
            work_unit['years'] = True
            break

    return work_unit

def read_raw_xml_and_extract_work_years(go_through_all = 0):
    """Build training records for the work-history-length task.

    For each of the first ``go_through_all`` resumes in ``raw_xml`` (all of
    them when the argument is 0), resumes with more than one work-experience
    section are labelled with ``get_predicted_work_experience`` and appended
    to ``train_xml_set`` (markup) and ``train_text_set`` (NFKD-normalized
    text).  The resume markup itself is left unchanged.  A progress line is
    printed for every resume.
    """
    limit = go_through_all if go_through_all else len(raw_xml)

    for index in range(limit):
        parsed = BS(raw_xml[index], "lxml")
        for resume in parsed.findAll("div", {"class": "single_form-content"}):
            print('Starting processing: ' + str(index+1) + " resumes")

            sections = resume.findAll("div", {"class": "work-experience-section"})
            if not sections or len(sections) <= 1:
                continue

            predictions = get_predicted_work_experience(sections)
            if len(predictions) == 0:
                continue

            train_xml_set.append({'data': str(resume), 'predictions': predictions})

            plain_text = unicodedata.normalize("NFKD", resume.get_text())
            train_text_set.append({'data': plain_text, 'predictions': predictions})
                        


In [None]:
# Label every loaded resume; prints one progress line per resume.
read_raw_xml_and_extract_work_years()

Starting processing: 1 resumes
Starting processing: 2 resumes
Starting processing: 3 resumes
Starting processing: 4 resumes
Starting processing: 5 resumes
Starting processing: 6 resumes
Starting processing: 7 resumes
Starting processing: 8 resumes
Starting processing: 9 resumes
Starting processing: 10 resumes
Starting processing: 11 resumes
Starting processing: 12 resumes
Starting processing: 13 resumes
Starting processing: 14 resumes
Starting processing: 15 resumes
Starting processing: 16 resumes
Starting processing: 17 resumes
Starting processing: 18 resumes
Starting processing: 19 resumes
Starting processing: 20 resumes
Starting processing: 21 resumes
Starting processing: 22 resumes
Starting processing: 23 resumes
Starting processing: 24 resumes
Starting processing: 25 resumes
Starting processing: 26 resumes
Starting processing: 27 resumes
Starting processing: 28 resumes
Starting processing: 29 resumes
Starting processing: 30 resumes
Starting processing: 31 resumes
Starting processi

Starting processing: 267 resumes
Starting processing: 268 resumes
Starting processing: 269 resumes
Starting processing: 270 resumes
Starting processing: 271 resumes
Starting processing: 272 resumes
Starting processing: 273 resumes
Starting processing: 274 resumes
Starting processing: 275 resumes
Starting processing: 276 resumes
Starting processing: 277 resumes
Starting processing: 278 resumes
Starting processing: 279 resumes
Starting processing: 280 resumes
Starting processing: 281 resumes
Starting processing: 282 resumes
Starting processing: 283 resumes
Starting processing: 284 resumes
Starting processing: 285 resumes
Starting processing: 286 resumes
Starting processing: 287 resumes
Starting processing: 288 resumes
Starting processing: 289 resumes
Starting processing: 290 resumes
Starting processing: 291 resumes
Starting processing: 292 resumes
Starting processing: 293 resumes
Starting processing: 294 resumes
Starting processing: 295 resumes
Starting processing: 296 resumes
Starting p

Starting processing: 527 resumes
Starting processing: 528 resumes
Starting processing: 529 resumes
Starting processing: 530 resumes
Starting processing: 531 resumes
Starting processing: 532 resumes
Starting processing: 533 resumes
Starting processing: 534 resumes
Starting processing: 535 resumes
Starting processing: 536 resumes
Starting processing: 537 resumes
Starting processing: 538 resumes
Starting processing: 539 resumes
Starting processing: 540 resumes
Starting processing: 541 resumes
Starting processing: 542 resumes
Starting processing: 543 resumes
Starting processing: 544 resumes
Starting processing: 545 resumes
Starting processing: 546 resumes
Starting processing: 547 resumes
Starting processing: 548 resumes
Starting processing: 549 resumes
Starting processing: 550 resumes
Starting processing: 551 resumes
Starting processing: 552 resumes
Starting processing: 553 resumes
Starting processing: 554 resumes
Starting processing: 555 resumes
Starting processing: 556 resumes
Starting p

Starting processing: 801 resumes
Starting processing: 802 resumes
Starting processing: 803 resumes
Starting processing: 804 resumes
Starting processing: 805 resumes
Starting processing: 806 resumes
Starting processing: 807 resumes
Starting processing: 808 resumes
Starting processing: 809 resumes
Starting processing: 810 resumes
Starting processing: 811 resumes
Starting processing: 812 resumes
Starting processing: 813 resumes
Starting processing: 814 resumes
Starting processing: 815 resumes
Starting processing: 816 resumes
Starting processing: 817 resumes
Starting processing: 818 resumes
Starting processing: 819 resumes
Starting processing: 820 resumes
Starting processing: 821 resumes
Starting processing: 822 resumes
Starting processing: 823 resumes
Starting processing: 824 resumes
Starting processing: 825 resumes
Starting processing: 826 resumes
Starting processing: 827 resumes
Starting processing: 828 resumes
Starting processing: 829 resumes
Starting processing: 830 resumes
Starting p

Starting processing: 1053 resumes
Starting processing: 1054 resumes
Starting processing: 1055 resumes
Starting processing: 1056 resumes
Starting processing: 1057 resumes
Starting processing: 1058 resumes
Starting processing: 1059 resumes
Starting processing: 1060 resumes
Starting processing: 1061 resumes
Starting processing: 1062 resumes
Starting processing: 1063 resumes
Starting processing: 1064 resumes
Starting processing: 1065 resumes
Starting processing: 1066 resumes
Starting processing: 1067 resumes
Starting processing: 1068 resumes
Starting processing: 1069 resumes
Starting processing: 1070 resumes
Starting processing: 1071 resumes
Starting processing: 1072 resumes
Starting processing: 1073 resumes
Starting processing: 1074 resumes
Starting processing: 1075 resumes
Starting processing: 1076 resumes
Starting processing: 1077 resumes
Starting processing: 1078 resumes
Starting processing: 1079 resumes
Starting processing: 1080 resumes
Starting processing: 1081 resumes
Starting proce

Starting processing: 1295 resumes
Starting processing: 1296 resumes
Starting processing: 1297 resumes
Starting processing: 1298 resumes
Starting processing: 1299 resumes
Starting processing: 1300 resumes
Starting processing: 1301 resumes
Starting processing: 1302 resumes
Starting processing: 1303 resumes
Starting processing: 1304 resumes
Starting processing: 1305 resumes
Starting processing: 1306 resumes
Starting processing: 1307 resumes
Starting processing: 1308 resumes
Starting processing: 1309 resumes
Starting processing: 1310 resumes
Starting processing: 1311 resumes
Starting processing: 1312 resumes
Starting processing: 1313 resumes
Starting processing: 1314 resumes
Starting processing: 1315 resumes
Starting processing: 1316 resumes
Starting processing: 1317 resumes
Starting processing: 1318 resumes
Starting processing: 1319 resumes
Starting processing: 1320 resumes
Starting processing: 1321 resumes
Starting processing: 1322 resumes
Starting processing: 1323 resumes
Starting proce

Starting processing: 1549 resumes
Starting processing: 1550 resumes
Starting processing: 1551 resumes
Starting processing: 1552 resumes
Starting processing: 1553 resumes
Starting processing: 1554 resumes
Starting processing: 1555 resumes
Starting processing: 1556 resumes
Starting processing: 1557 resumes
Starting processing: 1558 resumes
Starting processing: 1559 resumes
Starting processing: 1560 resumes
Starting processing: 1561 resumes
Starting processing: 1562 resumes
Starting processing: 1563 resumes
Starting processing: 1564 resumes
Starting processing: 1565 resumes
Starting processing: 1566 resumes
Starting processing: 1567 resumes
Starting processing: 1568 resumes
Starting processing: 1569 resumes
Starting processing: 1570 resumes
Starting processing: 1571 resumes
Starting processing: 1572 resumes
Starting processing: 1573 resumes
Starting processing: 1574 resumes
Starting processing: 1575 resumes
Starting processing: 1576 resumes
Starting processing: 1577 resumes
Starting proce

In [None]:
# Number of training records collected so far.
len(train_text_set)


In [None]:
# Inspect the first five markup records and their labels.
train_xml_set[:5]

In [None]:
import pickle
import sys

# Raise the interpreter's recursion limit before pickling.
# NOTE(review): presumably needed because the records hold BeautifulSoup
# objects that pickle recursively - confirm.
sys.setrecursionlimit(20000)

# pickle.dump(train_text_set, open( "data/train_2.p", "wb" ) )

In [25]:
# Write the markup training records to disk.  The context manager makes sure
# the file is flushed and closed even if pickling fails part-way.
with open("data/train_4_xml.p", "wb") as pickle_file:
    pickle.dump(train_xml_set, pickle_file)