# Full Text of a Collection

Next we can look at getting the full text of an entire collection. Some of our collections are very large and slow to analyze, so with this tool I suggest you use one of our smaller full-text collections, such as:
 - darwin
 - florence
 - vma

## Getting the items

In [None]:
# Modules used by the cells in this notebook
import json, requests, math, re, string, nltk

# Fetch the NLTK data packages this notebook relies on
nltk.download("punkt") # Word tokenizer
nltk.download("stopwords") # Stop words
from nltk import word_tokenize

### Choose a collection and get its item count

In [None]:
# Collection to get data from
collection = 'darwin'

# Query the API for the collection information
collectionUrl = 'https://oc-index.library.ubc.ca/collections/'+collection
# Fail fast on a stalled connection or an HTTP error status, rather than
# hanging or hitting a confusing KeyError on an error payload further down
response = requests.get(collectionUrl, timeout=30)
response.raise_for_status()
apiResponse = response.json()

# Get count of items in collection
itemCount = apiResponse['data']['items']

# Get collection name
collectionTitle = apiResponse['data']['title']
# Last expression in the cell, so the notebook displays this summary string
'Collection: "' + collectionTitle + '" has ' + str(itemCount) + ' items'

### Loop through collection item list to get item ids

In [None]:
# Number of items requested per API page
perPage = 25
offset = 0
# Derive the page count from perPage so that changing the page size keeps
# the pagination correct (previously 25 was hard-coded here and in the step)
pages = math.ceil(itemCount / perPage)

# Loop through collection item pages to get all items
itemIds = []
for page in range(pages):
    collectionItemsUrl = 'https://oc-index.library.ubc.ca/collections/'+collection+'/items?limit='+str(perPage)+'&offset='+str(offset)
    # Advance by exactly one page of results for the next request
    offset += perPage
    # Get one page of items; stop on a stalled connection or HTTP error status
    response = requests.get(collectionItemsUrl, timeout=30)
    response.raise_for_status()
    apiResponse = response.json()
    collectionItems = apiResponse['data']
    # Add each item id to the itemIds list
    itemIds.extend(collectionItem['_id'] for collectionItem in collectionItems)
print(itemIds)

### Loop through item ids and get the item data

In [None]:
# Raw item records, and one cleaned full-text string per item (same order as itemIds)
items = []
fullTexts = []
# Matches runs of non-word characters and underscores. Compiled once, outside
# the loop, and written as a raw string so `\W` is not an invalid str escape.
pattern = re.compile(r'[\W_]+')
for itemId in itemIds:
    itemUrl = 'https://oc-index.library.ubc.ca/collections/'+collection+'/items/'+itemId
    # Stop on a stalled connection or HTTP error status instead of failing later
    response = requests.get(itemUrl, timeout=30)
    response.raise_for_status()
    apiResponse = response.json()
    item = apiResponse['data']
    items.append(item)
    if 'FullText' in item:
        fullText = item['FullText'][0]['value']
        # Lower case the text, then collapse everything that is not a
        # letter or digit into single spaces
        cleanFullText = pattern.sub(' ', fullText.lower())
        # Add to the full texts list
        fullTexts.append(cleanFullText)
    else:
        # Keep a placeholder so fullTexts stays aligned with items
        fullTexts.append('')
'Done'

### Character Count

In [None]:
# Total number of characters across every cleaned full text
characterLength = sum(len(cleanedText) for cleanedText in fullTexts)
characterLength

### Token count

In [None]:
# Tokenize each cleaned full text and gather all tokens into one flat list
tokens = []
for fullText in fullTexts:
    tokens.extend(word_tokenize(fullText))
# Total number of tokens in the collection
len(tokens)

### Unique token count

In [None]:
# Distinct tokens: set() collapses repeated tokens before counting
len(set(tokens))

### Average number of times a token is used

In [None]:
# Total tokens divided by distinct tokens; raises ZeroDivisionError if tokens is empty
len(tokens)/len(set(tokens))

### Search hit count

In [None]:
# Term looked up by the search cells below; the full texts were lower-cased
# before tokenizing, so use a lower-case term
search = "will"

In [None]:
# Wrap the token list in an nltk.Text so NLTK's text-analysis helpers can be used on it
text = nltk.Text(tokens)
# Number of tokens equal to the search term
text.count(search)

### Percentage of full text that the search takes up

In [None]:
# Percentage of all tokens in the collection that equal the search term.
# Computed on the tokenised `text` (whole collection, whole-word matches);
# the `fullText` string only holds a single item and would count substring
# hits against a character length. Yields 0.0 when there are no tokens.
(100.0 * text.count(search) / len(text)) if len(text) else 0.0

### Concordance search on the full text

In [None]:
# Print occurrences of the search term together with their surrounding context
text.concordance(search)

### Words used similarly to search

In [None]:
# Print words that appear in contexts similar to those of the search term
text.similar(search)

### Longest words in full text

In [None]:
# Distinct tokens in the text
v = set(text)
# Keep only the tokens longer than 15 characters
long_words = [token for token in v if len(token) > 15]
# Display them in sorted order
sorted(long_words)

### Collocations

In [None]:
# Print word pairs that occur together frequently in the text
text.collocations()

### Lexical dispersion of search

In [None]:
import numpy
# allow visuals to show up in this interface-
% matplotlib inline 
text.dispersion_plot([search])

### Frequency Distribution of words

In [None]:
from nltk import FreqDist
# Count how many times each token occurs in the text
fdist = FreqDist(text)
# Show the 50 most frequent tokens with their counts
fdist.most_common(50)

In [None]:
# Plot the frequencies of the 25 most common tokens
fdist.plot(25)