In [1]:
#First step was to install the Tika package. I originally encountered a PackagesNotFoundError, which I was able to fix 
#by adding a new channel called conda-forge. The command for that is:
#conda config --append channels conda-forge

In [2]:
import tika
import csv
import glob
import os
import nltk
import gensim
from tika import parser
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thomasm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#Create corpus (list) to hold transcripts
corpus = []

# Iterate over .pdf files and add to corpus
os.chdir('C:/Users/thomasm/data science/scraping/pdf/transcripts')
for file in list(glob.iglob('*.pdf')):
    # Use Tika to parse the PDF
    parsedPDF = parser.from_file(file)
    # Extract the text content from the parsed PDF
    pdf = parsedPDF["content"]
    # Convert double newlines into single newlines
    pdf = pdf.replace('\n\n', '\n')
    corpus.append(pdf)

In [6]:
from gensim.utils import tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import string

# set of all of stop words
stop_words = set(stopwords.words('english'))
# set of punctuation 
punctuation = set(string.punctuation)

# stemmer object
stemmer = PorterStemmer()

def clean(doc):
    # Iterates through each word and selects every word that is not a stop word
    stop_free = ' '.join([term for term in doc.lower().split() if term not in stop_words])
    
    # Iterates through every character and selects every character that is not punctuation
    punc_free = ''.join(term for term in stop_free if term not in punctuation)
    
    # Iterates through and stems each word 
    stemmed = ' '.join(stemmer.stem(word) for word in punc_free.split())
    
    return stemmed

# clean every document in the corpus
clean_corpus = [clean(doc).split() for doc in corpus]

In [7]:
clean_corpus[:10]

[['copyright',
  '©',
  '2019',
  'sp',
  'global',
  'market',
  'intellig',
  'divis',
  'sp',
  'global',
  'inc',
  'right',
  'reserv',
  'spglobalcommarketintellig',
  '1',
  'genworth',
  'financi',
  'inc',
  'nysegnw',
  'shareholderanalyst',
  'call',
  'thursday',
  'decemb',
  '13',
  '2018',
  '200',
  'pm',
  'gmt',
  'httpsmarketintelligencespglobalcom',
  'httpsmarketintelligencespglobalcom',
  'content',
  'copyright',
  '©',
  '2019',
  'sp',
  'global',
  'market',
  'intellig',
  'divis',
  'sp',
  'global',
  'inc',
  'right',
  'reserv',
  'spglobalcommarketintellig',
  '2',
  'tabl',
  'content',
  'call',
  'particip',
  '3',
  'present',
  '4',
  'httpsmarketintelligencespglobalcom',
  'httpsmarketintelligencespglobalcom',
  'genworth',
  'financi',
  'inc',
  'shareholderanalyst',
  'call',
  'dec',
  '13',
  '2018',
  'copyright',
  '©',
  '2019',
  'sp',
  'global',
  'market',
  'intellig',
  'divis',
  'sp',
  'global',
  'inc',
  'right',
  'reserv',
  's

In [8]:
all_words = []
for doc in clean_corpus:
    all_words += doc

# define "word_frequencies" as the frequencies of all of the words in "all_words"
word_frequencies = nltk.FreqDist(all_words)
# print the 20 most common words
print(word_frequencies.most_common(20))


# put it in a data frame
import pandas as pd

# Make data frame from dictionary
term_df = pd.DataFrame({'count':list(word_frequencies.values())}, index = word_frequencies.keys())
# sort the values so that the terms with the highest counts are at the top
term_df.sort_values('count', ascending = False, inplace = True)

[('busi', 165), ('sp', 164), ('global', 164), ('market', 135), ('2018', 130), ('inc', 104), ('divis', 101), ('invest', 93), ('growth', 93), ('us', 90), ('financi', 83), ('call', 83), ('year', 82), ('expect', 75), ('think', 73), ('earn', 72), ('well', 72), ('that', 72), ('rate', 72), ('2019', 68)]


In [15]:
term_df.head()

Unnamed: 0,count
compani,39
million,29
2017,27
2016,20
state,17


In [10]:
term_df.to_csv('freq.csv')