# Search and Analyze with Open Collections API

### Import and set up all the things (again)

In [None]:
import json, requests, math, re, string, nltk

# allow matplotlib to run in-line
% matplotlib inline 

nltk.download("punkt") # Word tokenizer
nltk.download("stopwords") # Stop words
from nltk import word_tokenize

ocUrl = 'https://open.library.ubc.ca/'
ocApiUrl = 'https://oc-index.library.ubc.ca' # APPY URL

### Set our API Key

You can get your own API key at https://open.library.ubc.ca/research

In [None]:
# API key appended to the search URL when the request is sent.
# SECURITY: a key committed inside a shared notebook is visible to every reader.
# Prefer exporting your own key as the OC_API_KEY environment variable; the
# literal below is kept only as a fallback so the notebook still runs unchanged
# when that variable is not set.
import os

apiKey = os.environ.get(
    'OC_API_KEY',
    'ac40e6c2cb345593ed1691e0a8b601bba398e42d85f81f893c5ab709cec63c6c'
)

### Search Query

In [None]:
# Text placed in the request's query_string clause when the search is built.
# The inner double quotes are part of the query itself (presumably requesting an
# exact-phrase match -- confirm against the API's query syntax).
query = '"Master of Journalism"'

### Repositories

__Available repositories__

* __oc__ - all repositories
* __dsp__ - only DSpace / cIRcle
* __cdm__ - only ContentDM
* __atm__ - only AtoM

In [None]:
# Repository code to search; it is copied into search['index'] when the request
# is built.
repo = 'dsp'

### Build the search

Visit https://open.library.ubc.ca/research for help building a query.

In [None]:
# Search request that is later POSTed to the API as JSON.
search = {
    # Presumably paging controls (offset of first hit / number of hits) --
    # confirm against the API documentation.
    'from': 0,
    'size': 10,
    'type': 'object',
    'body': {
        # Sort settings: highest _score first
        'sort': {'_score': {'order': 'desc'}},
        # Fields to return
        'fields': [
            'title',
            'ubc.transcript',
            'description',
            'ubc.internal.provenance.nick',
        ],
        # Query String
        'query': {'query_string': {'query': query}},
    },
    # Set the repo
    'index': repo,
}

# Serialise the request once and show it.  The previous
# json.JSONEncoder(search) call only constructed an encoder object (passing the
# dict where an option is expected, which Python 3 rejects with a TypeError) and
# never produced any JSON.
jsonSearch = json.dumps(search, indent=4, sort_keys=True)

print(jsonSearch)

### POST the search

In [None]:
# Send the search definition to the API; the key travels in the query string.
searchUrl = ocApiUrl+'/search?apiKey='+apiKey
# The timeout stops the cell from hanging indefinitely if the service does not
# answer.
httpResponse = requests.post(searchUrl, json=search, timeout=30)
# Fail loudly on an HTTP error status instead of trying to treat an error page
# as search results.
httpResponse.raise_for_status()
apiResponse = httpResponse.json()

# Pretty-print the decoded response for inspection.
print(json.dumps(apiResponse, indent=4, sort_keys=True))

### Get just the Items

In [None]:
# Drill down through the response envelope (data -> data -> hits -> hits) to the
# collection of matching items that the following cells iterate over.
apiItems = apiResponse['data']['data']['hits']['hits']
print(apiItems)

### Parse items and clean full text

In [None]:
# Matches runs of non-word characters and underscores.  Compiled once, outside
# the loop, and written as a raw string so that "\W" is not treated as an
# (invalid) string escape sequence.
pattern = re.compile(r'[\W_]+')

# One dict per hit: identifiers, metadata, cleaned transcript and its tokens.
items = []
for apiItem in apiItems:
    fields = apiItem['fields']

    # Clean Full Text: lower-case the transcript and replace every run of
    # non-word characters / underscores with a single space.
    cleanFullText = pattern.sub(' ', fields['ubc.transcript'][0].lower())

    items.append({
        'id': apiItem['_id'],
        # The first value of each returned field is used.
        'title': fields['title'][0],
        'description': fields['description'][0],
        'collection': fields['ubc.internal.provenance.nick'][0],
        'fullText': cleanFullText,
        'words': word_tokenize(cleanFullText),
    })

print(items)

### Item with most words?

In [None]:
# Find the item with the largest token count; on a tie the earliest item wins.
mostWords = 0
winner = 0
for key, item in enumerate(items):
    wordCount = len(item['words'])
    if not wordCount > mostWords:
        continue
    mostWords = wordCount
    winner = key

# Build the public link to the winning item, then report it.
winningItem = items[winner]
winningUrl = ocUrl+'collections/'+winningItem['collection']+'/items/'+winningItem['id']
print("Winner is "+winningUrl+" with "+str(mostWords)+ " words!")

### Item with most unique words?

In [None]:
# Find the item with the largest number of distinct tokens; on a tie the
# earliest item wins.
mostWords = 0
winner = 0
for key, item in enumerate(items):
    uniqueCount = len(set(item['words']))
    if not uniqueCount > mostWords:
        continue
    mostWords = uniqueCount
    winner = key

# Build the public link to the winning item, then report it.
winningItem = items[winner]
winningUrl = ocUrl+'collections/'+winningItem['collection']+'/items/'+winningItem['id']
print("Winner is "+winningUrl+" with "+str(mostWords)+ " unique words!")

### Richest in vocabulary

We can investigate the lexical richness of a text. For example, by dividing the total number of words by the number of unique words, we can see the average number of times each word is used.


In [None]:
# Lexical richness measure: total words / unique words, i.e. the average number
# of times each distinct word is used.  The LOWEST average marks the richest
# vocabulary, so the minimum is kept (the previous loop kept the maximum, which
# is the most repetitive text).
lrScore = None
winner = None
for key, item in enumerate(items):
    uniqueWords = set(item['words'])
    if not uniqueWords:
        # An item without tokens has no defined ratio (it would divide by zero).
        continue
    # float() keeps this a true division on Python 2 as well.
    ratio = len(item['words']) / float(len(uniqueWords))
    # Compare unrounded values; rounding happens only when printing, so a
    # slightly better later item is never hidden by an earlier rounded score.
    if lrScore is None or ratio < lrScore:
        lrScore = ratio
        winner = key

if winner is None:
    print("No item contains any words, so lexical richness cannot be compared")
else:
    print(ocUrl+'collections/'+items[winner]['collection']+'/items/'+items[winner]['id'] +
          " \nis the richest in vocabulary with each word being used an average of "+str(round(lrScore, 2))+ " times")

### Combining all the words

In [None]:
# Pool the tokens of every item into one flat list for the corpus-wide cells.
allWords = []
for item in items:
    allWords.extend(item['words'])

totalWords = len(allWords)
print(str(totalWords) + " words in total")
# Uncomment to inspect the pooled tokens:
# print(allWords)

### Searching within the full text

In [None]:
# Term to look up in the combined text.
# NOTE: this rebinds `search`, which earlier held the API request dict; re-run
# the "Build the search" cell before POSTing another request.
search = "truth"
# search = "the"
# Wrap the pooled tokens in an nltk.Text, which the following cells query.
text = nltk.Text(allWords)
# Number of occurrences of the term (the value of the cell's last expression).
text.count(search)

### Percentage of full text that the search takes up

In [None]:
# Occurrences of the search term as a percentage of all pooled tokens.
# The 100.0 literal makes the whole expression floating-point.
100.0*allWords.count(search)/len(allWords) 

### Concordance search on the full text

In [None]:
# Show the occurrences of the search term together with their surrounding words.
text.concordance(search)

### Lexical dispersion of search

In [None]:
# numpy is not referenced directly in this cell -- presumably it is imported
# because the plot depends on it; TODO confirm it is still needed.
import numpy
# Plot the positions at which the search term occurs across the combined text.
text.dispersion_plot([search])

### Words used similarly to our search

In [None]:
# search ='government'
# Lists other words that appear in contexts similar to those of the search term.
text.similar(search) # How does this work? Magic obviously!

### Collocations

In [None]:
# Print the collocations (words that frequently occur together) found in the
# combined text.
text.collocations()

### Bi-grams

In [None]:
# The wildcard import supplies BigramCollocationFinder here and
# TrigramCollocationFinder for the tri-gram cell, so this cell must run first.
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
# Only the first item's tokens are analysed.
finder = BigramCollocationFinder.from_words(items[0]['words'])
# Drop bigrams that occur fewer than 3 times.
finder.apply_freq_filter(3)
# Take the 10 best bigrams by PMI; sorted() then lists them in tuple order, so
# the displayed order no longer reflects the score ranking.
sorted(finder.nbest(bigram_measures.pmi, 10))

### Tri-grams 

In [None]:
# Tri-gram analysis of the first item's tokens.
# TrigramCollocationFinder is not imported here; it comes from the wildcard
# nltk.collocations import in the bi-gram cell.
tokens = items[0]['words']
finder = TrigramCollocationFinder.from_words(tokens)
trigram_measures = nltk.collocations.TrigramAssocMeasures()
# Permit 'and' to appear in the middle of a trigram, but not on either edge:
finder.apply_ngram_filter(lambda w1, w2, w3: 'and' in (w1, w3))
# score trigram based on frequency
# NOTE(review): `scored` is not read again anywhere in this notebook.
scored = finder.score_ngrams(trigram_measures.raw_freq)
# Take the 5 most frequent trigrams; sorted() lists them in tuple order rather
# than by frequency.
sorted(finder.nbest(trigram_measures.raw_freq, 5))