# DSCI 614 Text Mining
# Project 3: Tax Efficient Withdrawal Strategies
# By Tony Nguyen

## --------------------------------------------------------------------------

## Extract the Text from  PDF Files

In [1]:
# Load the required library into memory
import PyPDF2
import re
# Specify file
mypdf = './tax-efficient-withdrawal-strategies.pdf'
# Creating a pdf file object
pdfFile = open(mypdf, 'rb')
  
# Creating a pdf reader object
pdfFileReader = PyPDF2.PdfReader(pdfFile)

# Get the number of pages in the pdf file
pageCount = len(pdfFileReader.pages)
# printing number of pages in pdf file
print(f' There are {pageCount} pages in the file :{mypdf}')
  
output = []    
for i in range(pageCount):
    # Get the i-th page contents from the pdf file
    pdfPage = pdfFileReader.pages[i]
    # Extract text from each page and append it to the list
    output.append(pdfPage.extract_text())
    
# Concatenate items in the list to a single string
alltexts = ' '.join(output)

# Print out  the first 300 chars from the texts
print("----" * 25)
print(alltexts[:300])
print("----" * 25)  

# Remove \n from the texts
alltexts = re.sub('\n', '', alltexts)
# Remove punctuation from the texts
alltexts = re.sub(r'[^\w\s]','',alltexts)

# Print out  the first 300 chars from the texts
print("----" * 25)
print(alltexts[:300])
print("----" * 25)  
# closing the pdf file object
pdfFile.close()

 There are 17 pages in the file :./tax-efficient-withdrawal-strategies.pdf
----------------------------------------------------------------------------------------------------
1
T. ROWE  PRICE INSIGHTS
ON RETIREMENT
KEY INSIGHTS
	■There are alternatives to the conventional strategy of drawing on a taxable 
account first, followed by tax-deferred, and then Roth accounts. 
	■Many people can take advantage of income in a low tax bracket or tax-free 
capital gains.
	■If plann
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
1T ROWE  PRICE INSIGHTSON RETIREMENTKEY INSIGHTS	There are alternatives to the conventional strategy of drawing on a taxable account first followed by taxdeferred and then Roth accounts 	Many people can take advantage of income in a low tax bracket or taxfree capital gains	If planning to leave an es
--------------------

## Extract all the tokens from the texts

In [2]:
import spacy
# Load the model
nlp = spacy.load("en_core_web_sm")
# Process the texts
doc = nlp(alltexts)
# Print out the first 50 tokens
for token in doc[:50]:
    print(f'Token text = {token.text}; Is the token lowercase? {token.is_lower}; Does the token consist of digits? {token.is_digit} ')

Token text = 1; Is the token lowercase? False; Does the token consist of digits? True 
Token text = T; Is the token lowercase? False; Does the token consist of digits? False 
Token text = ROWE; Is the token lowercase? False; Does the token consist of digits? False 
Token text =  ; Is the token lowercase? False; Does the token consist of digits? False 
Token text = PRICE; Is the token lowercase? False; Does the token consist of digits? False 
Token text = INSIGHTSON; Is the token lowercase? False; Does the token consist of digits? False 
Token text = RETIREMENTKEY; Is the token lowercase? False; Does the token consist of digits? False 
Token text = INSIGHTS; Is the token lowercase? False; Does the token consist of digits? False 
Token text = 	; Is the token lowercase? False; Does the token consist of digits? False 
Token text = There; Is the token lowercase? False; Does the token consist of digits? False 
Token text = are; Is the token lowercase? True; Does the token consist of digits? 

## Extract all lemmas from the texts

In [3]:
import spacy
# Load the model
nlp = spacy.load("en_core_web_sm")
# Process the texts
doc = nlp(alltexts)
# Print out the first 50 tokens
for token in doc[:50]:
    print(f'Token text = {token.text}; The lemma ={token.lemma_} ')

Token text = 1; The lemma =1 
Token text = T; The lemma =t 
Token text = ROWE; The lemma =ROWE 
Token text =  ; The lemma =  
Token text = PRICE; The lemma =price 
Token text = INSIGHTSON; The lemma =INSIGHTSON 
Token text = RETIREMENTKEY; The lemma =RETIREMENTKEY 
Token text = INSIGHTS; The lemma =INSIGHTS 
Token text = 	; The lemma =	 
Token text = There; The lemma =there 
Token text = are; The lemma =be 
Token text = alternatives; The lemma =alternative 
Token text = to; The lemma =to 
Token text = the; The lemma =the 
Token text = conventional; The lemma =conventional 
Token text = strategy; The lemma =strategy 
Token text = of; The lemma =of 
Token text = drawing; The lemma =draw 
Token text = on; The lemma =on 
Token text = a; The lemma =a 
Token text = taxable; The lemma =taxable 
Token text = account; The lemma =account 
Token text = first; The lemma =first 
Token text = followed; The lemma =follow 
Token text = by; The lemma =by 
Token text = taxdeferred; The lemma =taxdeferre

## Remove all the default stop words in SpaCy from the texts.

In [4]:
import spacy

nlp = spacy.load('en_core_web_sm')

spacy_stopwords = nlp.Defaults.stop_words

print(spacy_stopwords)

{'besides', 'more', 'never', 'wherever', 'using', 'upon', 'i', 'whenever', '‘ve', 'front', 'thence', 'twenty', 'afterwards', 'seemed', 'whose', 'about', 'former', 'all', 'been', 'myself', 'hereafter', '‘m', 'nevertheless', 'out', 'fifteen', 'perhaps', 'elsewhere', '‘ll', '’s', 'whereas', 'indeed', 'nowhere', 'alone', 'ourselves', 'am', 'onto', 'since', 'which', 'unless', 'yet', "'s", 'side', 'whence', 'moreover', 'many', '‘d', 'whereafter', 'beforehand', 'during', '’d', 'further', 'its', 'also', 'amongst', 'becomes', 'this', 'under', 'most', 'no', 'can', 'least', '’ve', 'next', 'her', 'else', 'done', 'those', 'some', 'hence', 'ten', 'anything', 'regarding', 'they', 'move', 'towards', 'various', 'would', 'an', 'show', "'ve", 'what', 'seem', 'why', 'either', 'meanwhile', 'did', 'whom', 'nothing', 'was', 'at', 'their', 'everywhere', 'without', 'being', 'behind', 'thereupon', 'whole', 'several', 'seems', 'none', 'put', 'could', 'and', 'sixty', 'both', 'his', 'across', 'somewhere', 'through

## Customize the stop words in SpaCy by:

### a. Adding "tax" and "account" to the stop words.

In [5]:
print(f'There are {len(nlp.Defaults.stop_words)} stop words in Spacy')
# Specify the user defined stop words
customized_stop_words = ['tax', 'account']
# Add the user specified stop words to the Spacy default stop words
for token in customized_stop_words:
    nlp.Defaults.stop_words.add(token)

# Set the tag of the customized stop words as stop word 
for token in customized_stop_words:
    nlp.vocab[token].is_stop = True
print(f'There are {len(nlp.Defaults.stop_words)} stop words in Spacy')

There are 326 stop words in Spacy
There are 328 stop words in Spacy


### b. Remove "full" from the default stop words.

In [6]:
print(f'There are {len(nlp.Defaults.stop_words)} stop words in Spacy')
# Remove the the specified words from the default stop words of Spacy
remove_stop_words = ['full']
# Remove the user specified stop words from the Spacy default stop words
for token in remove_stop_words:
    nlp.Defaults.stop_words.remove(token)

# Set the tag of the removed stop words as non-stop word 
for token in remove_stop_words:
    nlp.vocab[token].is_stop = False
print(f'There are {len(nlp.Defaults.stop_words)} stop words in Spacy')

There are 328 stop words in Spacy
There are 327 stop words in Spacy


### c. Remove all the customized default stop words from the texts.

In [7]:
doc = nlp(alltexts)

# Get the new stop words
spacy_stopwords = nlp.Defaults.stop_words

tokens_without_stopword= [token for token in doc if not token.text in spacy_stopwords]

print(tokens_without_stopword[:250])

[1, T, ROWE,  , PRICE, INSIGHTSON, RETIREMENTKEY, INSIGHTS, 	, There, alternatives, conventional, strategy, drawing, taxable, followed, taxdeferred, Roth, accounts, 	, Many, people, advantage, income, low, bracket, taxfree, capital, gains, 	, If, planning, leave, estate, heirs, consider, assets, ultimately, maximize, aftertax, value, How, Get, More, Out, Your, Retirement, Account, Withdrawals, These, approaches, extend, life, portfolio, preserve, assets, heirsMany, people, rely, largely, Social, Security, benefits, taxdeferred, accounts, individual, retirement, accounts, IRAs, 401k, plans, support, lifestyle, retirement, However, sizable, number, retirees, enter, retirement, assets, taxable, accounts, brokerage, accounts, Roth, accounts, Deciding, use, combination, accounts, fund, spending, decision, likely, driven, consequences, distributions, withdrawals, accounts, different, characteristics, Figure, 1,  , Appendix, 1A, commonly, recommended, approach, conventional, wisdom, withdraw,

## Perform the part of speech tagging for the texts.

In [8]:
import spacy
nlp = spacy.load("en_core_web_sm")
# Process the texts
doc = nlp(alltexts)

# Summarize the first 20 tokens
for token in doc[:20]:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

1 1 NUM CD nummod d False False
T t NOUN NN compound X True False
ROWE ROWE PROPN NNP compound XXXX True False
    SPACE _SP dep   False False
PRICE price NOUN NN compound XXXX True False
INSIGHTSON INSIGHTSON PROPN NNP compound XXXX True False
RETIREMENTKEY RETIREMENTKEY PROPN NNP compound XXXX True False
INSIGHTS INSIGHTS PROPN NNP nsubj XXXX True False
	 	 SPACE _SP dep 	 False False
There there PRON EX expl Xxxxx True True
are be VERB VBP ROOT xxx True True
alternatives alternative NOUN NNS attr xxxx True False
to to ADP IN prep xx True True
the the DET DT det xxx True True
conventional conventional ADJ JJ amod xxxx True False
strategy strategy NOUN NN pobj xxxx True False
of of ADP IN prep xx True True
drawing draw VERB VBG pcomp xxxx True False
on on ADP IN prep xx True True
a a DET DT det x True True


## Visualize the dependency parser of the texts.

In [9]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Many people can take advantage of income in a low tax bracket or taxfree capital gains")
# Visualize it by seeting style to be "dep" and jupyter to be True
displacy.render(doc, style="dep", jupyter = True)

## Perform the named entities recognition for the texts.

In [10]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(alltexts)

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

1 0 1 CARDINAL
ROWE 3 7 ORG
first 133 138 ORDINAL
Roth 172 176 PERSON
Social Security 556 571 ORG
401k 654 658 PRODUCT
Roth 833 837 PERSON
1 1062 1063 CARDINAL
Appendix 1A 1069 1080 PRODUCT
first 1184 1189 ORDINAL
Roth 1235 1239 PERSON
first 1318 1323 ORDINAL
Leaving Roth 1387 1399 PERSON
three 1728 1733 CARDINAL
Roger A 1897 1904 PERSON
Retirees With Relatively Modest Income 1972 2010 ORG
5   2011 2014 CARDINAL
6 2064 2065 CARDINAL
the SECURE Act9  Other Observations and Considerations 2133 2187 ORG
first 2234 2239 ORDINAL
two 2240 2243 CARDINAL
third 2475 2480 ORDINAL
three 2777 2782 CARDINAL
16 2862 2864 CARDINAL
Social Security 2970 2985 ORG
gains41 Generally 3365 3382 ORG
age 59½ 3397 3404 DATE
Roth 3409 3413 PERSON
at least 5 3440 3450 CARDINAL
longevity3 3648 3658 PERSON
January 1 2020 3693 3707 DATE
2025 3780 3784 DATE
2021 3882 3886 DATE
reflected4 3950 3960 PERSON
Dammon Robert M Chester S Spatt 4099 4130 PERSON
Harold H Zhang 4135 4149 PERSON
TaxDeferred Investing 2004 4205 

## Visualize the MONEY  and QUANTITY  in the texts.

In [11]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(alltexts)
# set MONEY to mark and color to use
options = {"ents": ['MONEY', 'QUANTITY'], "colors": {'MONEY': '#fc03db', 'QUANTITY': '#03fc88'}}
## Visualize it by seeting style to be "ent" and jupyter to be True and the corresponding options
displacy.render(doc, style="ent", jupyter = True, options=options)