In [16]:
# web scraping
import requests
from bs4 import BeautifulSoup
# First import pickle and convert the text to a pickle file
import pickle
# need to do beautiful soup for better file format
# Scrapes paragraph text from an HTML page (used below on Project Gutenberg pages)
def url_to_transcript(url, timeout=30):
    '''Download an HTML page and return the text of each <p> element.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        requests.get can block indefinitely on an unresponsive host.

    Returns
    -------
    list of str
        The .text of every <p> tag found, in document order.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so an error page is
        never parsed as if it were the document.
    '''
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of scraping the error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    text = [p.text for p in soup.find_all('p')]
    # Progress indicator: one line per page fetched
    print(url)
    return text

# Source pages, one per gospel.
# The order must match `gospels` below: the two lists are paired by position.
urls = [
    'https://www.gutenberg.org/cache/epub/6356/pg6356-images.html',
    'https://www.gutenberg.org/cache/epub/6530/pg6530-images.html',
    'https://www.gutenberg.org/cache/epub/6529/pg6529-images.html',
    'https://www.gutenberg.org/cache/epub/5656/pg5656-images.html',
]

# Gospel names, in the same order as `urls`
gospels = [
    'Matthew',
    'Mark',
    'Luke',
    'John',
]


In [17]:
# Download every page (takes a few minutes to run); one list of paragraphs per URL
transcripts = list(map(url_to_transcript, urls))

https://www.gutenberg.org/cache/epub/6356/pg6356-images.html
https://www.gutenberg.org/cache/epub/6530/pg6530-images.html
https://www.gutenberg.org/cache/epub/6529/pg6529-images.html
https://www.gutenberg.org/cache/epub/5656/pg5656-images.html


In [18]:
# Pickle each gospel's paragraph list for later use
import os

# Create the output directory when it does not exist yet; without it the
# open() call below raises FileNotFoundError on a fresh checkout.
os.makedirs("transcripts", exist_ok=True)

for i, g in enumerate(gospels):
    # NOTE: the files carry a .txt extension but hold binary pickle data
    with open("transcripts/" + g + ".txt", "wb") as file:
        pickle.dump(transcripts[i], file)

In [19]:
# Load the pickled paragraph lists into a dict keyed by gospel name
data = {}
for g in gospels:
    with open("transcripts/" + g + ".txt", "rb") as file:
        # A freshly opened file is already positioned at offset 0, so no seek is needed.
        # pickle.load can execute arbitrary code: only load files written by this notebook.
        data[g] = pickle.load(file)

In [20]:
# Check which gospel names were loaded as keys
data.keys()

dict_keys(['Matthew', 'Mark', 'Luke', 'John'])

In [21]:
# Peek at the first two entries of Matthew's list
data['Matthew'][:2]

['Title: The Gospel of Matthew for Readers', 'Editor: Lightheart']

In [22]:
# Cleaning data: start by looking at the first key of the dict
next(iter(data.keys()))

'Matthew'

In [23]:
# ...and at the first value, to see the raw structure that needs cleaning
next(iter(data.values()))

['Title: The Gospel of Matthew for Readers',
 'Editor: Lightheart',
 'Release date: May 1, 2004 [eBook #6356]\r\n                Most recently updated: August 21, 2012',
 'Language: English',
 'Copyright (C) 2002 by Lightheart.',
 'The Gospel of Matthew for Readers',
 "Editor's Preface",
 'What is God\'s will for us? The answer can be found in the pages of the\r\nGospel of Matthew, a rich source of information detailing the Son of\r\nGod\'s instructions from Our Father to us. Here we learn, through Jesus\'\r\nteachings and actions, what we must keep of the old. We find what must\r\nbe replaced. We see what has been fulfilled and has come full circle.\r\nWe discover the new material and the new message. The Gospel of Matthew\r\nholds out the keys to "this generation\'s" entry into the kingdom of\r\nheaven … if we have ears to hear and eyes to see.',
 'This edition of the Gospel of Matthew is designed for easy reading and\r\nis presented in chapter and paragraph style. The first chapter 

In [24]:
#change format to key: gospel, value: string
def combine_text(list_of_text):
    '''Join the given pieces of text into a single string, separated by one space.'''
    separator = ' '
    return separator.join(list_of_text)

In [25]:
# Collapse each gospel's list of text into a one-element list holding the joined string
data_combined = dict(
    (gospel, [combine_text(paragraphs)])
    for gospel, paragraphs in data.items()
)

In [26]:
# Build a one-column DataFrame of full texts, indexed by gospel name
import pandas as pd
pd.set_option('max_colwidth', 150)

# orient='index' makes each dict key a row, so no transpose/rename is needed
data_df = (
    pd.DataFrame
    .from_dict(data_combined, orient='index', columns=['transcripts'])
    .sort_index()
)
data_df

Unnamed: 0,transcripts
John,"Title: The Gospel of John for Readers Editor: Lightheart Release date: May 1, 2004 [eBook #5656]\r\n Most recently updated: August ..."
Luke,"Title: The Gospel of Luke for Readers Editor: Lightheart Release date: September 1, 2004 [eBook #6529]\r\n Most recently updated: A..."
Mark,"Title: The Gospel of Mark for Readers Editor: Lightheart Release date: September 1, 2004 [eBook #6530]\r\n Most recently updated: A..."
Matthew,"Title: The Gospel of Matthew for Readers Editor: Lightheart Release date: May 1, 2004 [eBook #6356]\r\n Most recently updated: Augu..."


In [27]:

# Show the full combined text stored for Matthew
data_df.loc['Matthew', 'transcripts']

'Title: The Gospel of Matthew for Readers Editor: Lightheart Release date: May 1, 2004 [eBook #6356]\r\n                Most recently updated: August 21, 2012 Language: English Copyright (C) 2002 by Lightheart. The Gospel of Matthew for Readers Editor\'s Preface What is God\'s will for us? The answer can be found in the pages of the\r\nGospel of Matthew, a rich source of information detailing the Son of\r\nGod\'s instructions from Our Father to us. Here we learn, through Jesus\'\r\nteachings and actions, what we must keep of the old. We find what must\r\nbe replaced. We see what has been fulfilled and has come full circle.\r\nWe discover the new material and the new message. The Gospel of Matthew\r\nholds out the keys to "this generation\'s" entry into the kingdom of\r\nheaven … if we have ears to hear and eyes to see. This edition of the Gospel of Matthew is designed for easy reading and\r\nis presented in chapter and paragraph style. The first chapter begins\r\nwith the birth of Jesu

In [28]:
# string cleaning
import re
import string

def clean_text_round1(text):
    '''First cleaning pass over a single string.

    Steps, in order:
      1. lowercase everything;
      2. drop text enclosed in square brackets (non-greedy; '.' does not
         match a newline, so a bracket pair split across lines is kept);
      3. delete every character listed in string.punctuation;
      4. delete every word that contains at least one digit.

    Removed spans are replaced by nothing, so surrounding whitespace is
    left as-is (runs of spaces may remain).

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    '''
    text = text.lower()
    # Raw strings: '\[' and '\w' are invalid escapes in ordinary string
    # literals and trigger a warning on recent Python versions.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Alias kept for the .apply(round1) call sites; a wrapping lambda adds nothing
round1 = clean_text_round1

In [31]:
# Apply the first cleaning pass to every transcript and preview the result
data_clean = pd.DataFrame({'transcripts': data_df['transcripts'].apply(round1)})
data_clean

Unnamed: 0,transcripts
John,title the gospel of john for readers editor lightheart release date may \r\n most recently updated august language english copy...
Luke,title the gospel of luke for readers editor lightheart release date september \r\n most recently updated august language englis...
Mark,title the gospel of mark for readers editor lightheart release date september \r\n most recently updated august language englis...
Matthew,title the gospel of matthew for readers editor lightheart release date may \r\n most recently updated august language english c...


In [32]:
# more cleaning
def clean_text_round2(text):
    '''Second cleaning pass: drop curly quotes/ellipsis and neutralise newlines.

    Parameters
    ----------
    text : str
        Text that already went through the first cleaning pass.

    Returns
    -------
    str
        The text with the characters ‘ ’ “ ” … removed and every newline
        replaced by a single space.
    '''
    text = re.sub('[‘’“”…]', '', text)
    # A newline separates words, so it must become a space. Deleting it
    # outright glues the last word of one line to the first word of the next.
    text = re.sub('\n', ' ', text)
    return text

# Alias kept for the .apply(round2) call sites; a wrapping lambda adds nothing
round2 = clean_text_round2

In [33]:
# Apply the second cleaning pass on top of the first and preview the result
data_clean = pd.DataFrame({'transcripts': data_clean['transcripts'].apply(round2)})
data_clean

Unnamed: 0,transcripts
John,title the gospel of john for readers editor lightheart release date may \r most recently updated august language english copyri...
Luke,title the gospel of luke for readers editor lightheart release date september \r most recently updated august language english ...
Mark,title the gospel of mark for readers editor lightheart release date september \r most recently updated august language english ...
Matthew,title the gospel of matthew for readers editor lightheart release date may \r most recently updated august language english cop...


In [52]:
def clean_text_round3(text):
    '''Third cleaning pass: turn every carriage return into a single space.'''
    carriage_return = '\r'
    return text.replace(carriage_return, ' ')

# Name used by the .apply(round3) call sites
round3 = clean_text_round3

In [53]:
# Apply the third cleaning pass and preview the result
data_clean = pd.DataFrame({'transcripts': data_clean['transcripts'].apply(round3)})
data_clean

Unnamed: 0,transcripts
John,title the gospel of john for readers editor lightheart release date may most recently updated august language english copyrigh...
Luke,title the gospel of luke for readers editor lightheart release date september most recently updated august language english co...
Mark,title the gospel of mark for readers editor lightheart release date september most recently updated august language english co...
Matthew,title the gospel of matthew for readers editor lightheart release date may most recently updated august language english copyr...


In [54]:
# Re-display the original frame: the cleaning passes wrote to data_clean, not data_df
data_df

Unnamed: 0,transcripts
John,"Title: The Gospel of John for Readers Editor: Lightheart Release date: May 1, 2004 [eBook #5656]\r\n Most recently updated: August ..."
Luke,"Title: The Gospel of Luke for Readers Editor: Lightheart Release date: September 1, 2004 [eBook #6529]\r\n Most recently updated: A..."
Mark,"Title: The Gospel of Mark for Readers Editor: Lightheart Release date: September 1, 2004 [eBook #6530]\r\n Most recently updated: A..."
Matthew,"Title: The Gospel of Matthew for Readers Editor: Lightheart Release date: May 1, 2004 [eBook #6356]\r\n Most recently updated: Augu..."


In [55]:
# Pickle the corpus for later use (this is data_df, i.e. the text before the cleaning passes)
data_df.to_pickle("corpus.pkl")

In [56]:
# Document-term matrix creation.
# NOTE(review): the original author flagged that this step may be running into
# words that aren't actually names and pulling them out; this should be only a
# minor issue.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.transcripts)

# One row per gospel (same index as data_clean), one column per vocabulary term
data_dtm = pd.DataFrame(
    data_cv.toarray(),
    index=data_clean.index,
    columns=cv.get_feature_names_out(),
)
data_dtm

Unnamed: 0,abased,abasedand,abba,abel,abelto,abia,abide,abidein,abideth,abidethere,...,youyou,zabulon,zacchaeus,zacchaeuswanted,zacharias,zeal,zebedee,zebedees,zelotes,zorobabel
John,0,0,0,0,0,0,9,1,4,0,...,0,0,0,0,0,1,0,0,0,0
Luke,2,0,0,1,0,1,2,0,0,0,...,1,0,3,1,8,0,0,0,1,1
Mark,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,3,0,0,0
Matthew,0,1,0,0,1,0,1,0,0,0,...,0,1,0,0,1,0,3,1,0,0


In [59]:
# Pickle the document-term matrix for later use
data_dtm.to_pickle("dtm.pkl")

In [60]:
# Pickle the cleaned data and the fitted CountVectorizer for later use
data_clean.to_pickle('data_clean.pkl')
# Open the file in a with-block so the handle is flushed and closed right away;
# passing a bare open(...) to pickle.dump leaves the file object unclosed.
with open("cv.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)