# Data Librarian Modul 2: **PyTerrier** Homework

In [1]:
import pyterrier as pt
import requests
import pandas as pd
import pickle
import nltk
import numpy as np
import os

from tools import littlehelper as lilhelp
from pathlib import Path

os.environ["JAVA_HOME"] = "./jre/Contents/Home/"

  from .autonotebook import tqdm as notebook_tqdm


# Load data from [bibsonomy.org](https://www.bibsonomy.org)

In [2]:
queries = ['makerspace', 'virtual reality', 'mixed reality', 'augmented reality']

df = lilhelp.df_from_bibsonomy_query(queries)

Query: 'makerspace'. Number of items retrieved: 187
Query: 'virtual reality'. Number of items retrieved: 1332
Query: 'mixed reality'. Number of items retrieved: 1071
Query: 'augmented reality'. Number of items retrieved: 1256


In [3]:
df.shape

(3846, 128)

In [8]:
df['search_string'].value_counts()

search_string
virtual reality      1332
augmented reality    1256
mixed reality        1071
makerspace            187
Name: count, dtype: int64

In [6]:
df.head(3)

Unnamed: 0,type,id,tags,intraHash,label,user,description,date,changeDate,count,...,citeulike-linkout-3,opac,status,oai-id,oai-set,unit,details,documenturl,review,journaltitle
0,Bookmark,https://www.bibsonomy.org/url/a10ba61e8b7f7ca6...,[Unsortierte_Lesezeichen],a10ba61e8b7f7ca6db55da2670090ef2,Attraktor - Der Makerspace in Hamburg,robingarcia,Der gemeinnützige Attraktor e. V. betreibt den...,2017-03-17 10:34:46,2017-04-13 09:59:00,1,...,,,,,,,,,,
1,Bookmark,https://www.bibsonomy.org/url/6093e51d381e6c5c...,"[educação, maker]",6093e51d381e6c5c2a7d5ee954f7c1ee,Why Makerspaces are Changing the World – Betab...,christianoavila,While in college I learned about engineering i...,2017-05-09 12:48:55,2017-05-09 12:48:55,1,...,,,,,,,,,,
2,Bookmark,https://www.bibsonomy.org/url/90a1ee85e2acf284...,"[thinking, design, makerspace, makers, learnin...",90a1ee85e2acf284765373ea8756d255,Design Thinking Process and UDL Planning Tool ...,yish,"If there is a makerspace in your school, it ma...",2017-12-25 16:06:35,2017-12-25 16:06:35,1,...,,,,,,,,,,


In [7]:
df.tail(3)

Unnamed: 0,type,id,tags,intraHash,label,user,description,date,changeDate,count,...,citeulike-linkout-3,opac,status,oai-id,oai-set,unit,details,documenturl,review,journaltitle
3843,Publication,https://www.bibsonomy.org/bibtex/2346a33fc433a...,[dblp],346a33fc433ac6690a79308a135079ee,TigerEye: augmented reality for Clemson Univer...,dblp,,2018-11-06 00:00:00,2018-11-07 14:39:27,1,...,,,,,,,,,,
3844,Publication,https://www.bibsonomy.org/bibtex/2d9c365efe0da...,[dblp],d9c365efe0daa197ca15113c2edb8cc0,A Projection-based Medical Augmented Reality S...,dblp,,2018-11-06 00:00:00,2018-11-07 13:34:32,1,...,,,,,,,,,,
3845,Publication,https://www.bibsonomy.org/bibtex/2b89044a3cc00...,[dblp],b89044a3cc004ed5913d45bf022d191a,REVEL: tactile feedback technology for augment...,dblp,,2018-11-06 00:00:00,2018-11-07 12:27:03,1,...,,,,,,,,,,


## Load cleaned and filtered `csv`

In [62]:
filepath = Path('../data/makerspace-bibsonomy-clean.json')

In [63]:
df = pd.read_json(filepath, dtype={'isbn': 'int32'})
df.head()

Unnamed: 0,search_string,pup_type,year,author_first,authors,author_count,title,abstract,booktitle,journal,tags,isbn,id
0,makerspace,incollection,2017,"Bronkar, Cherie",[Cherie Bronkar],1,How to start a library makerspace,You may have heard the term ``makerspace'' and...,The makerspace librarian's sourcebook,,[greenbib],9780838915042.0,https://www.bibsonomy.org/bibtex/2e033978aa497...
1,makerspace,article,2020,"Oliver, Kevin M.","[Kevin M. Oliver, Jennifer K. Houchins, Robert...",4,Informing Makerspace Outcomes Through a Lingui...,A growing body of research focuses on what out...,,International Journal of Science and Mathemati...,"[assessment, outcomes, makerspace, learningana...",,https://www.bibsonomy.org/bibtex/2fb48ac294725...
2,makerspace,standard,2017,,,0,The makerspace librarian's sourcebook,The Makerspace Librarian's Sourcebook is an es...,,,[greenbib],9780838915042.0,https://www.bibsonomy.org/bibtex/2789d019603b4...
3,makerspace,book,2020,"Denzer, Juan","[Juan Denzer, Sharona Ginsberg]",2,Terrific makerspace projects: A practical guid...,Step-by-step instructions to guide you through...,,,[greenbib],9781538131824.0,https://www.bibsonomy.org/bibtex/2309634b7a919...
4,makerspace,incollection,2017,"Ginsberg, Sharona",[Sharona Ginsberg],1,Sustainability: Keeping the library makerspace...,,The makerspace librarian's sourcebook,,[greenbib],9780838915042.0,https://www.bibsonomy.org/bibtex/21c020a4a195d...


In [64]:
df_prep = df.rename(columns={'title': 'text',
                             'id': 'docno'})
df_prep.head(2)

Unnamed: 0,search_string,pup_type,year,author_first,authors,author_count,text,abstract,booktitle,journal,tags,isbn,docno
0,makerspace,incollection,2017,"Bronkar, Cherie",[Cherie Bronkar],1,How to start a library makerspace,You may have heard the term ``makerspace'' and...,The makerspace librarian's sourcebook,,[greenbib],9780838915042.0,https://www.bibsonomy.org/bibtex/2e033978aa497...
1,makerspace,article,2020,"Oliver, Kevin M.","[Kevin M. Oliver, Jennifer K. Houchins, Robert...",4,Informing Makerspace Outcomes Through a Lingui...,A growing body of research focuses on what out...,,International Journal of Science and Mathemati...,"[assessment, outcomes, makerspace, learningana...",,https://www.bibsonomy.org/bibtex/2fb48ac294725...


In [90]:
makerspace_dict = df_prep.to_dict(orient='records')
makerspace_dict[0:1]

[{'search_string': 'makerspace',
  'pup_type': 'incollection',
  'year': 2017,
  'author_first': 'Bronkar, Cherie',
  'authors': ['Cherie Bronkar'],
  'author_count': 1,
  'text': 'How to start a library makerspace',
  'abstract': "You may have heard the term ``makerspace'' and wondered what it meant. Makerspaces are, simply put, places where people gather to make things. Although that may sound like a simplistic definition, the things that can be created in a makerspace vary a great deal. Makerspaces can be high tech, low tech, and everything in between. A makerspace's offerings revolve around the needs of the community it serves, but the one thing all have in common is that they bring people together to share ideas. Typically, the first thing that comes to mind when thinking about mak- erspaces is 3D printing, but when it comes to what’s going on in makerspaces around the world, that’s just the tip of the iceberg. Makers create things, ideas, and concepts. Makers work in metal, wood,

## Init PyTerrier

In [66]:
if not pt.started():
    pt.init()

## Create `pt.IterDictIndexer`

In [67]:
# Folder to store index
index_folder_mult = './makerspace_index_mult'

# Dict fields for index
fields = ['docno', 'text', 'abstract', 'tags', 'pup_type']

# Create indexer object for dictionary == IterDictIndexer
indexer_mult = pt.IterDictIndexer(index_folder_mult,
                                  meta={'docno': 200, 'text': 4096},
                                  overwrite=True)

# Create index by passing data -> Dict and fields
index_ref_mult = indexer_mult.index(makerspace_dict, fields=fields)

## Create Indexer

In [68]:
index_mult = pt.IndexFactory.of(index_ref_mult)

## Get index stats

In [69]:
print(index_mult.getCollectionStatistics().toString())

Number of documents: 3149
Number of terms: 6887
Number of postings: 81898
Number of fields: 5
Number of tokens: 106500
Field names: [docno, text, abstract, tags, pup_type]
Positions:   false



## Sort index

In [158]:
tf_dict = {}

for x in index_mult.getLexicon():
    tf_dict[x.getKey()] = x.getValue().frequency

In [159]:
for key, value in tf_dict.items():
    print(key, value)

0 71
00 3
000 1
001 15
002 2
003 1
005 2
006 1
008 1
0098 1
01 6
018 1
024 1
026 1
029 1
03 1
035 1
039 6
04 2
045 2
05 3
06 1
0i 1
1 78
10 19
101 3
102 2
103 9
105 3
10th 2
11 5
1136 1
115 1
119 4
12 20
120 1
125 2
128 3
13 6
130 4
14 2
142 1
142p 1
14th 4
15 11
150 1
16 3
160 1
165 2
17 2
170 1
18 1
183 1
19 2
1909 1
191 2
195 2
1960 1
1984 1
1988 1
1990 1
1991 1
1993 2
1995 2
1996 2
1997 2
1pp 1
1st 1
2 76
20 9
200 3
2001 3
2002 2
2004 1
2005 2
2006 17
2007 10
2008 8
2009 1
2011 1
2012 2
2013 4
2015 8
2016 5
2017 8
2018 6
2019 8
2020 5
2021 2
2025 2
21 4
210 1
22 3
23 11
230v 1
233 1
24 9
2456 8
25 6
26 6
267 1
27 4
28 4
29 3
293 3
2d 35
2x2 1
3 76
30 5
3000 2
306 1
31 1
313 1
32 6
328 1
33 4
34 3
35 2
36 10
360 10
360proto 2
37 2
376 1
38 2
39 3
3c 1
3d 220
3dtv 1
3dui 1
3pp 2
3rd 1
3visual 3
4 39
40 6
41 1
42 8
43 1
44 1
440 1
4422 1
45 4
453 2
46 3
47 1
48 2
49 2
4d 4
5 29
50 11
51 2
53 2
5300 1
54 8
55 2
56 5
57 3
594 2
5a 1
5g 1
5th 1
6 15
60 4
6071 2
61 2
612 1
619 1
627 1
64 

In [163]:
tf_dict_sorted = sorted(tf_dict.items(),
                        key=lambda x: x[1],
                        reverse=True)

tf_dict_sorted[:20]

[('dblp', 4809),
 ('realiti', 4214),
 ('http', 3173),
 ('www', 3165),
 ('org', 3152),
 ('bibsonomi', 3149),
 ('bibtex', 3149),
 ('null', 2618),
 ('virtual', 2251),
 ('inproceed', 1899),
 ('augment', 1722),
 ('mix', 1205),
 ('articl', 934),
 ('system', 748),
 ('interact', 700),
 ('vr', 658),
 ('user', 620),
 ('environ', 565),
 ('applic', 545),
 ('us', 463)]

## Create `tf` function

In [165]:
tf = pt.BatchRetrieve(index_mult, wmodel='Tf')

In [166]:
tf.search('makerspace')

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,20,https://www.bibsonomy.org/bibtex/21d2cad7623d1...,0,13.0,makerspace
1,1,11,https://www.bibsonomy.org/bibtex/21b40218abc28...,1,12.0,makerspace
2,1,0,https://www.bibsonomy.org/bibtex/2e033978aa497...,2,10.0,makerspace
3,1,5,https://www.bibsonomy.org/bibtex/2b80b466b358b...,3,9.0,makerspace
4,1,15,https://www.bibsonomy.org/bibtex/2c1f2498fcee3...,4,9.0,makerspace
...,...,...,...,...,...,...
112,1,134,https://www.bibsonomy.org/bibtex/258004bc801da...,112,1.0,makerspace
113,1,135,https://www.bibsonomy.org/bibtex/219d68fbf0700...,113,1.0,makerspace
114,1,141,https://www.bibsonomy.org/bibtex/23f763f36ee3a...,114,1.0,makerspace
115,1,145,https://www.bibsonomy.org/bibtex/248efc34280eb...,115,1.0,makerspace


In [172]:
tf.search('library makerspaces')

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,20,https://www.bibsonomy.org/bibtex/21d2cad7623d1...,0,24.0,library makerspaces
1,1,11,https://www.bibsonomy.org/bibtex/21b40218abc28...,1,21.0,library makerspaces
2,1,5,https://www.bibsonomy.org/bibtex/2b80b466b358b...,2,15.0,library makerspaces
3,1,15,https://www.bibsonomy.org/bibtex/2c1f2498fcee3...,3,15.0,library makerspaces
4,1,16,https://www.bibsonomy.org/bibtex/27b26f179035f...,4,14.0,library makerspaces
...,...,...,...,...,...,...
123,1,1074,https://www.bibsonomy.org/bibtex/2c6457055e200...,123,1.0,library makerspaces
124,1,1153,https://www.bibsonomy.org/bibtex/2e24c3575bfaf...,124,1.0,library makerspaces
125,1,1331,https://www.bibsonomy.org/bibtex/27f16523a702d...,125,1.0,library makerspaces
126,1,1380,https://www.bibsonomy.org/bibtex/2b3910f9325d8...,126,1.0,library makerspaces


## Create search engine with `tf_idf` model

In [169]:
tf_idf = pt.BatchRetrieve(index_mult, wmodel='TF_IDF')

In [171]:
result = tf_idf.search('makerspace')
result

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,5,https://www.bibsonomy.org/bibtex/2b80b466b358b...,0,4.507168,makerspace
1,1,0,https://www.bibsonomy.org/bibtex/2e033978aa497...,1,4.348573,makerspace
2,1,19,https://www.bibsonomy.org/bibtex/2a34c65255fec...,2,4.270598,makerspace
3,1,8,https://www.bibsonomy.org/bibtex/26307bb287f16...,3,4.164843,makerspace
4,1,68,https://www.bibsonomy.org/bibtex/28a2b925ea75f...,4,4.147915,makerspace
...,...,...,...,...,...,...
112,1,44,https://www.bibsonomy.org/bibtex/22c2e1fdec9c9...,112,2.126453,makerspace
113,1,35,https://www.bibsonomy.org/bibtex/26aad0c29af4f...,113,2.039099,makerspace
114,1,103,https://www.bibsonomy.org/bibtex/2ed52c9a11b48...,114,1.655178,makerspace
115,1,63,https://www.bibsonomy.org/bibtex/2648f20b0b93a...,115,1.190985,makerspace


## Function for doc info retrieval

In [144]:
def query_search_engine(query, data_dict, search_engine, limit=10):
    """Query a PyTerrier `pt.BatchRetrieve` and match the result
    to the dictionary that `pt.BatchRetrieve` uses to build its index.

    Args:
        query (`str`): String to query index.
        data_dict (`dict`): Dictionary containing all documents of the index.
        search_engine (`pt.BatchRetrieve`): Search engine
        limit (`int`): Number of query results
    """
    result = search_engine.search(query)
    x = 0
    for _, row in result.iterrows():
        if x == limit:
            break
        for entry in data_dict:
            if entry['docno'] == row['docno']:
                print(f"Title: \t\t {entry['text']}")
                print(f"Author(s): \t {entry['authors']}")
                print(f"Abstract: \t {entry['abstract']}")
                print(f"Year: \t\t {entry['year']}")
                print(f"URL: \t\t {entry['docno']}")
                print(f"Score: \t\t {row['score']:.2f}")
                print()
                x += 1      

In [167]:
query_search_engine(query='library makerspaces', 
                    data_dict=makerspace_dict,
                    search_engine=tf_idf,
                    limit=3)

Title: 		 Makerspaces: A practical guide for librarians
Author(s): 	 ['John J. Burke', 'Ellyssa Kroski']
Abstract: 	 This book is a guidebook jam-packed with resources, advice, and information to help you develop and fund your own makerspace from the ground up. Learn what other libraries are making, building, and doing in their makerspaces and how you can, too. Readers are introduced to makerspace equipment, new technologies, models for planning and assessing projects, and useful case studies that will equip them with the knowledge to implement their own library makerspaces. This expanded second edition features eighteen brand new library makerspace profiles providing advice and inspiration for how to create your own library makerspace, over twenty new images and figures illustrating maker tools and trends as well as library makerspaces in action and new lists of actual grant and funding sources for library makerspaces.
Year: 		 2018
URL: 		 https://www.bibsonomy.org/bibtex/2b80b466b35