In [1]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import PyPDF2
from io import StringIO
import os
import re
from datetime import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd

In [2]:
# Directory that holds the MPC statement PDFs (file names look like "MPC_07Sep2016.pdf").
# NOTE(review): absolute, machine-specific Windows path - adjust before running elsewhere.
folder = "D:\\Users\\figohjs\\Documents\\Story\\Data\\MPC"

### User-defined functions (UDFs)

In [3]:
#read and extract text from one pdf file using pdfminer
def convertPdfToTxt(path):
    """Extract the text of every page of a PDF file using pdfminer.

    Parameters
    ----------
    path : str
        Location of the PDF file to read.

    Returns
    -------
    str
        The text of all pages, in page order, as produced by pdfminer's
        ``TextConverter`` with default layout parameters.

    Raises
    ------
    Whatever ``open`` or pdfminer raises for a missing or malformed file.
    The file handle, the converter and the text buffer are released even
    when that happens.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # `with` guarantees the file is closed even if pdfminer rejects it
        # (e.g. a non-PDF file sitting in the data folder).
        with open(path, 'rb') as fp:
            # Empty page set + maxpages=0 means "process every page".
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before the `finally` clause closes it.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()

In [4]:
def getDate(x):
    """Parse the meeting date embedded in an MPC file name.

    Parameters
    ----------
    x : str
        File name of the form ``MPC_<ddMonYYYY>.pdf``, e.g.
        ``"MPC_07Sep2016.pdf"``.

    Returns
    -------
    datetime.datetime
        The date encoded in the file name (time component is midnight).

    Raises
    ------
    ValueError
        If the name does not follow the ``MPC_<date>.pdf`` pattern, or the
        date part cannot be parsed with the ``%d%b%Y`` format.
    """
    # The dot is escaped so that only a literal ".pdf" extension matches.
    match = re.search(r'MPC_(.*)\.pdf', x)
    if match is None:
        raise ValueError(
            "File name {!r} does not look like 'MPC_<ddMonYYYY>.pdf'".format(x)
        )
    return datetime.strptime(match.group(1), '%d%b%Y')

In [59]:
#between -1 (most extreme negative) and +1 (most extreme positive)
def giveSentimentScore(text):
    """Return the VADER compound sentiment score of ``text``.

    Parameters
    ----------
    text : str
        A word, sentence or whole document to score.

    Returns
    -------
    float
        The ``'compound'`` entry of ``SentimentIntensityAnalyzer.polarity_scores``.
    """
    # Building the analyzer is the expensive part and this function is called
    # once per word by buildFeatures, so create it once and reuse it.
    analyzer = getattr(giveSentimentScore, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        giveSentimentScore._analyzer = analyzer
    return analyzer.polarity_scores(text)['compound']

In [5]:
def cleanText(text, dateInfo):
    """Strip the embargo line and the closing signature block from a statement.

    Parameters
    ----------
    text : str
        Raw text extracted from one MPC statement PDF.
    dateInfo : datetime.datetime
        Meeting date of the statement; it is rendered as ``'%d %b %Y'``
        (e.g. ``07 Sep 2016``) to locate the signature block and to find
        date mentions in the body.

    Returns
    -------
    tuple(str, int)
        ``(cleaned, numDate)`` where ``cleaned`` is the text with the embargo
        line and signature block removed and every newline and date mention
        replaced by a single space, and ``numDate`` is the number of date
        mentions that were present after the two blocks were removed.

    Raises
    ------
    ValueError
        If the embargo line or the closing signature block cannot be found.
    """
    dateStr = dateInfo.strftime('%d %b %Y')
    datePattern = re.escape(dateStr)

    # Line containing "Embargo ", including its trailing newline.
    firstMatch = re.search(r'(.*Embargo .*\n)', text)
    if firstMatch is None:
        raise ValueError("Embargo line not found in text")

    # "Bank Negara Malaysia" followed by the date line and everything after it
    # up to the first apostrophe (or the end of the text).
    lastMatch = re.search(
        r"(\nBank Negara Malaysia\s*\n{}\n[^']*)".format(datePattern), text
    )
    if lastMatch is None:
        raise ValueError(
            "Closing 'Bank Negara Malaysia' / {} block not found in text".format(dateStr)
        )

    # The matched sentences are literal text, so escape them before reusing
    # them as a pattern; otherwise characters such as "(" or "?" are treated
    # as regex syntax and the sentence is not removed.
    boilerplate = re.escape(firstMatch.group(1)) + '|' + re.escape(lastMatch.group(1))
    cleaned = re.sub(boilerplate, '', text)

    # Count date mentions before they are blanked out below.
    numDate = len(re.findall(datePattern, cleaned))

    # Replace each newline and each date mention with a single space.
    cleaned = re.sub('\n' + '|' + datePattern, ' ', cleaned.strip())
    return (cleaned, numDate)

In [86]:
def buildFeatures(text, numDate):
    """Compute word-count and sentiment features for one cleaned statement.

    Parameters
    ----------
    text : str
        Cleaned statement text (as returned by ``cleanText``).
    numDate : int
        Number of date mentions that were removed from ``text``; each one is
        added back to the word count as a single word.

    Returns
    -------
    list
        ``[numWords, numPositive, numNegative, sentimentScore]`` where the
        positive/negative counts are the number of individual words whose
        sentiment score is above/below zero and ``sentimentScore`` is the
        score of the whole text.
    """
    # Stand-alone punctuation tokens that should not be counted as words.
    specialCharList = [':', '/']

    # split() without an argument collapses runs of whitespace, so the
    # consecutive spaces left behind by cleanText do not produce empty
    # tokens that would otherwise inflate the word count.
    wordList = [token for token in text.split() if token not in specialCharList]
    numWords = len(wordList) + numDate

    # Count words that VADER scores as positive / negative on their own.
    numPositive = 0
    numNegative = 0
    for word in wordList:
        score = giveSentimentScore(word)
        if score > 0:
            numPositive += 1
        elif score < 0:
            numNegative += 1

    # Overall sentiment of the full text.
    sentimentScore = giveSentimentScore(text)

    return [numWords, numPositive, numNegative, sentimentScore]

### Debug

In [6]:
#test if can convert metadata of all files into datetime
# for i in os.listdir(folder):
#     try:
#         date = re.search('MPC_(.*).pdf', i).group(1)
#         datetime.strptime(date, '%d%b%Y')
#     except Exception as e:
#         print(e)
#         print(i)

In [33]:
# #test if can get unwanted first, last sentence for all files
# for i in os.listdir(folder):
#     try:
#         date = re.search('MPC_(.*).pdf', i).group(1)
#         date = datetime.strptime(date, '%d%b%Y')
#         text = convertPdfToTxt(folder + '\\' + i)
#         firstSentence = re.search('(.*Embargo .*\n)', text).group(1)
# #         lastSentence = re.search("(\nBank Negara Malaysia\s*\n{}\n[^']*)".format(date.strftime('%d %b %Y')), text).group(1)
#     except Exception as e:
#         print(e)
#         print(i)

In [7]:
textResultList = []
numDateList = []
# Run convertPdfToTxt, getDate and cleanText over every file in the folder.
# textResultList collects (cleaned text, meeting date) pairs and numDateList
# the matching date-mention counts, in the same order.
for file in os.listdir(folder):
    # Skip the stray R history file that lives next to the PDFs.
    if file in ['.Rhistory']:
        continue
    try:
        # os.path.join instead of a hard-coded '\\' separator.
        pdfText = convertPdfToTxt(os.path.join(folder, file))
        date = getDate(file)
        textResult, noDate = cleanText(pdfText, date)
    except Exception as e:
        # Best-effort run: report the failing file and carry on with the rest.
        print(e)
        print(file)
    else:
        textResultList.append((textResult, date))
        numDateList.append(noDate)

No /Root object! - Is this really a PDF?
.Rhistory


In [13]:
# Save only the cleaned statement text together with its meeting date.
filename = "D:\\Users\\figohjs\\Documents\\Story\\Data\\2020-12-08_MPCtext.csv"
textRows = [[body, meetingDate] for body, meetingDate in textResultList]
df = pd.DataFrame(textRows, columns=['Text', 'MPCDate'])
df.to_csv(filename, index=False)

In [89]:
# Build one feature row per cleaned statement: the buildFeatures output
# followed by the statement's meeting date.
featuresList = []
# numDateList is filled in step with textResultList, so the two can be zipped.
for (statementText, mpcDate), dateCount in zip(textResultList, numDateList):
    try:
        features = buildFeatures(statementText, dateCount)
        # append date row-wise
        featuresList.append(features + [mpcDate])
    except Exception as e:
        # Best-effort run: report the problem and continue with the next text.
        print(e)

In [93]:
# Fail early with a clear message instead of a column-length mismatch error
# when the feature loop produced no rows.
assert featuresList, "featuresList is empty - check the output of the feature-building cell"
resultColumns = ['NumberOfWords', 'NumberPositive', 'NumberNegative', 'SentimentScore', 'MPCDate']
dfResult = pd.DataFrame(featuresList, columns=resultColumns)
# NOTE(review): "2020-110-21" in the file name looks like a typo for a date
# (compare "2020-12-08" used for the text export) - left unchanged, confirm.
filename = "D:\\Users\\figohjs\\Documents\\Story\\Data\\2020-110-21_MPCfinalResult.csv"
dfResult.to_csv(filename, index = False)

In [37]:
# Debug walkthrough on a single statement: pick one file and parse its date.
# The names `file` and `date` are reused by the cells that follow.
# file = os.listdir(folder)[1]
file = "MPC_07Sep2016.pdf"
date = getDate(file)
date

datetime.datetime(2016, 9, 7, 0, 0)

In [34]:
# Extract the raw text of the file chosen above and display it for inspection.
text = convertPdfToTxt(folder + '\\' + file)
text

'11/21/2020\n\nPRESS RELEASES\nMonetary Policy Statement\nRef No : 09/16/03 / 07 Sep 2016 /\nEmbargo : Not for publication or broadcast before 1500 hours on Wednesday 07 September 2016\n\n \n\nAt the Monetary Policy Committee (MPC) meeting today, Bank Negara Malaysia decided to\nmaintain the Overnight Policy Rate (OPR) at 3.00 percent.\n\nThe global economy continues to expand at a moderate pace. Growth across the advanced\neconomies has been modest. In Asia, economic activity has been supported by domestic\ndemand amid weaker export growth. While volatility in the international financial markets has\nsubsided, markets remain vulnerable to setbacks and changes in sentiments. Going forward,\ndownside risks to global growth remain high following uncertainty over the growth momentum\nand policy shifts in major economies, and unresolved issues post the EU referendum in the\nUnited Kingdom.\n\nFor Malaysia, growth moderated slightly in the second quarter of the year, following weaker net\ne

In [35]:
# Same pattern as in cleanText: the line containing "Embargo ", including its newline.
firstSentence = re.search('(.*Embargo .*\n)', text).group(1)
firstSentence

'Embargo : Not for publication or broadcast before 1500 hours on Wednesday 07 September 2016\n'

In [38]:
# Same pattern as in cleanText: "Bank Negara Malaysia", the date line, and
# everything after it up to the first apostrophe (or the end of the text).
lastSentence = re.search("(\nBank Negara Malaysia\s*\n{}\n[^']*)".format(date.strftime('%d %b %Y')), text).group(1)
lastSentence

'\nBank Negara Malaysia \n07 Sep 2016\n\n© Bank Negara Malaysia, 2016. All rights reserved.\n\n \n\n2/2\n\n\x0c'

In [55]:
# Step-by-step replay of the body of cleanText() on the sample statement.
# NOTE: this cell rebinds the name `cleanText` to a string, which shadows the
# cleanText() function; re-run the function's definition cell before calling
# the function again.
#remove first and last sentence from the text
cleanText = re.sub(firstSentence + '|' + lastSentence, '', text)
#num of date
dateStr = date.strftime('%d %b %Y')
numDate = len(re.findall(dateStr, cleanText))
#replace each newline and each date mention with a single space
cleanText = re.sub('\n' + '|' + dateStr, ' ', cleanText.strip())
cleanText

'11/21/2020  PRESS RELEASES Monetary Policy Statement Ref No : 09/16/03 /   /     At the Monetary Policy Committee (MPC) meeting today, Bank Negara Malaysia decided to maintain the Overnight Policy Rate (OPR) at 3.00 percent.  The global economy continues to expand at a moderate pace. Growth across the advanced economies has been modest. In Asia, economic activity has been supported by domestic demand amid weaker export growth. While volatility in the international financial markets has subsided, markets remain vulnerable to setbacks and changes in sentiments. Going forward, downside risks to global growth remain high following uncertainty over the growth momentum and policy shifts in major economies, and unresolved issues post the EU referendum in the United Kingdom.  For Malaysia, growth moderated slightly in the second quarter of the year, following weaker net exports and a drawdown in stocks. Domestic demand, however, remained the key driver of growth, with private consumption and 

In [None]:
#features
#number of words, sentiment scores, count of positive/negative words

In [56]:
# Tokenise the cleaned sample text on single spaces and count the tokens.
# Note: split(' ') yields empty strings for consecutive spaces; those are not
# in specialCharList, so they are included in wordList and in the count.
specialCharList = [':','/', ' ']
wordList = [i for i in cleanText.split(' ') if i not in specialCharList]
numWords = len(wordList)

In [57]:
# Display the token count computed in the previous cell.
numWords

433

In [60]:
# Compound sentiment score of the whole cleaned sample text.
sentimentScore = giveSentimentScore(cleanText) 

In [63]:
# Tally the sample's words whose individual sentiment score is above zero
# (positive) or below zero (negative); words scoring exactly zero are ignored.
numPositive = numNegative = 0
for word in wordList:
    score = giveSentimentScore(word)
    if score < 0:
        numNegative = numNegative + 1
    elif score > 0:
        numPositive = numPositive + 1

In [64]:
# Display the number of positively scored words in the sample.
numPositive

28

In [65]:
# Display the number of negatively scored words in the sample.
numNegative

14

In [69]:
# Scratch check: build a DataFrame from a list of row lists and display it.
pd.DataFrame([['1','a','c'], ['2','2','3']])

Unnamed: 0,0,1,2
0,1,a,c
1,2,2,3
