# Collection Analysis with Open Collections API

### Import and set up all the things

In [None]:
import json, requests, math, re, string, nltk

# allow matplotlib to run in-line
% matplotlib inline 

# Base URL of the Open Collections website; used later to build links to items
ocUrl = 'https://open.library.ubc.ca/'
# Base URL that every API request in this notebook is sent to
ocApiUrl = 'https://oc-index.library.ubc.ca' # API URL

### Set our API Key

You can get your own API key at https://open.library.ubc.ca/research

In [None]:
import os

# Use a personal key from the OC_API_KEY environment variable when one is set,
# so a key does not have to be edited into (and saved with) the notebook.
# An unset or empty variable falls back to the key bundled with this notebook.
apiKey = os.environ.get('OC_API_KEY') or 'ac40e6c2cb345593ed1691e0a8b601bba398e42d85f81f893c5ab709cec63c6c'

### Choose a collection

In [None]:
# Identifier of the collection to analyse; it is inserted into the API URLs
# and item links built in the cells that follow
collection = "darwin"

### Get collection info using the API

In [None]:
# Query the API for the collection information
collectionUrl = ocApiUrl+'/collections/'+collection+'?api_key='+apiKey
# A timeout keeps the notebook from hanging forever on a stalled connection
collectionResponse = requests.get(collectionUrl, timeout=30)
# Stop here with a clear HTTP error (e.g. a rejected API key) instead of
# failing later with a KeyError on a response that has no 'data'
collectionResponse.raise_for_status()
apiResponse = collectionResponse.json()

# Get count of items in collection
itemCount = apiResponse['data']['items']

# Get collection name
collectionTitle = apiResponse['data']['title']
# Bare expression: shown as the cell's output in the notebook
'Collection: "' + collectionTitle + '" has ' + str(itemCount) + ' items'

### GET and store the collection's items using the API

In [None]:
# Number of items requested per API call; the page count and the offset step
# are both derived from it, so this is the only value to change
perPage = 25
offset = 0
pages = math.ceil(itemCount / perPage)

# Loop through collection item pages to get all items
itemIds = []
for page in range(pages):
    offset = page * perPage
    collectionItemsUrl = ocApiUrl+'/collections/'+collection
    collectionItemsUrl += '/items?limit='+str(perPage)+'&offset='+str(offset)+'&api_key='+apiKey
    # Get one page of items; the timeout avoids hanging on a stalled connection
    pageResponse = requests.get(collectionItemsUrl, timeout=30)
    # Stop with a clear HTTP error rather than a KeyError on a missing 'data'
    pageResponse.raise_for_status()
    apiResponse = pageResponse.json()
    collectionItems = apiResponse['data']
    # Add each item id to the itemIds list
    itemIds.extend(collectionItem['_id'] for collectionItem in collectionItems)
print(itemIds)

### GET and store items and full text using the API

In [None]:
items = []
fullTexts = []
# Matches runs of anything that is not a letter or digit (underscore included).
# Compiled once, outside the loop, from a raw string so that \W reaches the
# regex engine instead of being read as a string escape.
pattern = re.compile(r'[\W_]+')
for itemId in itemIds:
    # Send the API key with this request too, like every other API call here
    itemUrl = ocApiUrl+'/collections/'+collection+'/items/'+itemId+'?api_key='+apiKey
    itemResponse = requests.get(itemUrl, timeout=30)
    # Stop with a clear HTTP error rather than a KeyError on a missing 'data'
    itemResponse.raise_for_status()
    apiResponse = itemResponse.json()
    item = apiResponse['data']
    easyItem = dict()
    easyItem['id'] = itemId
    easyItem['title'] = item['Title'][0]['value']
    if 'FullText' in item:
        fullText = item['FullText'][0]['value']
        # Lower case full text, then replace everything but words with a space
        cleanFullText = pattern.sub(' ', fullText.lower())
        # The cleaned text goes to the full texts list ...
        fullTexts.append(cleanFullText)
        # ... while the item itself keeps the text exactly as the API returned it
        easyItem['fullText'] = fullText
    else:
        # Items without full text still get an entry, so that fullTexts and
        # items stay the same length and in the same order
        easyItem['fullText'] = ''
        fullTexts.append('')

    items.append(easyItem)

print(fullTexts)

## Text Analysis with NLTK

Now that we have each item's full text, we are going to use the Natural Language Toolkit (NLTK) to perform some analysis on it.

NLTK is a Python Library for working with written language data. It is free and very well documented. Many areas we'll be covering are treated in more detail in the NLTK Book, available for free online from [here](http://www.nltk.org/book/).

> Note: NLTK provides tools for tasks ranging from very simple (counting words in a text) to very complex (writing and training parsers, etc.). Many advanced tasks are beyond the scope of this talk, but by the time we're done, you should understand Python and NLTK well enough to perform these tasks on your own!

Firstly, we will need to import NLTK.

In [None]:
import nltk # the top-level NLTK package (also imported in the first cell)
nltk.download("punkt") # Word tokenizer data (fetched at run time)
nltk.download("stopwords") # Stop word lists (not used by the cells shown in this notebook)
# word_tokenize is what the word count cell uses to split each full text into tokens
from nltk import word_tokenize

### Total character count

In [None]:
# Add up the length, in characters, of every cleaned full text
characterLength = sum(map(len, fullTexts))
print(characterLength)

### Total word count

*For our analysis, we want to break up the full text into words; this step is called tokenization.*

In [None]:
# Every token of every item, in collection order
totalTokens = []

# fullTexts and items were filled side by side, so the position of a text is
# also the position of the item it belongs to
for position, fullText in enumerate(fullTexts):
    tokenisedText = word_tokenize(fullText)
    totalTokens.extend(tokenisedText)
    # Keep the item's own tokens for the per-item statistics
    items[position]['words'] = tokenisedText
# Bare expression: the total word count is shown as the cell's output
len(totalTokens)

### Item with most words

In [None]:
# Number of words in each item, in the same order as items
wordCounts = [len(entry['words']) for entry in items]

# The largest count, and the position of the first item that reaches it
# (both stay 0 when there are no items)
highestWordCount = max(wordCounts, default=0)
highestId = wordCounts.index(highestWordCount) if wordCounts else 0

print(ocUrl+collection+'/items/'+items[highestId]['id'] + ' has the most words with ' + str(highestWordCount) + ' words')

### Total unique word count

In [None]:
len(set(totalTokens))

### Item with most unique words?

In [None]:
# Largest number of distinct words seen so far, and the position of its item
uniqueWordCount = 0
winner = 0

for key, item in enumerate(items):
    # Build the set once per item instead of once for the comparison and
    # again for the assignment
    itemUniqueCount = len(set(item['words']))
    if itemUniqueCount > uniqueWordCount:
        uniqueWordCount = itemUniqueCount
        winner = key

# Report the item found by this loop (winner), not the item with the most
# words overall from the previous cell
print(ocUrl+collection+'/items/'+items[winner]['id'] + ' has the most unique words with ' + str(uniqueWordCount) + ' unique words')

### Longest words

In [None]:
# Distinct tokens across the whole collection
v = set(totalTokens)
# Keep only the tokens that are longer than 13 characters
long_words = list(filter(lambda token: len(token) > 13, v))
# Bare expression: the alphabetically sorted list is shown as the cell's output
sorted(long_words)

### Collocations

In [None]:
# Collocations: sequences of words or terms that co-occur more often than would be expected by chance.
# Wrap the tokens of the whole collection in an nltk.Text object; the same
# object is reused for the frequency distribution in a later cell
text = nltk.Text(totalTokens)
# Ask NLTK for the collocations of that text
text.collocations()

### Frequency Distributions

In [None]:
from nltk import FreqDist
# Frequency distribution over the tokens of the whole collection
fdist = FreqDist(text)
# Bare expression: the 50 most common entries are shown as the cell's output
fdist.most_common(50)

In [None]:
# Now we can plot the frequency distribution
# NOTE(review): 30 is presumably the number of most frequent tokens to draw — confirm against the NLTK docs
fdist.plot(30)

### Hapaxes

In [None]:
# Bare expression: the result is shown as the cell's output
fdist.hapaxes() # words that occur only once