Skip to content

Commit

Permalink
Add first draft
Browse files Browse the repository at this point in the history
  • Loading branch information
nichtich committed Sep 1, 2017
0 parents commit 9babc46
Show file tree
Hide file tree
Showing 8 changed files with 117 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
node_modules
build
1 change: 1 addition & 0 deletions .npmignore
@@ -0,0 +1 @@
# empty to just ignore .gitignore and include the 'build' folder
17 changes: 17 additions & 0 deletions README.md
@@ -0,0 +1,17 @@
# WikiCite data

This repository contains scripts to extract, transform, and analyze bibliographic data from Wikidata.

[![License](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Node](https://img.shields.io/badge/node-%3E=%20v6.4.0-brightgreen.svg)](http://nodejs.org)

The source code is based on and makes use of the modules [wikidata-filter](https://www.npmjs.com/package/wikidata-filter)
and [wikidata-sdk](https://www.npmjs.com/package/wikidata-sdk) by Maxime Lathuilière.

## Usage

bzcat latest-all.json.bz2 | ./bin/wikicite-extract > wikicite.ndjson

## License

[MIT](LICENSE.md)
3 changes: 3 additions & 0 deletions bin/wikicite-extract
@@ -0,0 +1,3 @@
#!/usr/bin/env node
// Command line launcher: loads index.js from the directory above this
// script's own directory, so it works from any working directory.
const { join } = require('path')
const entryPoint = join(__dirname, '..', 'index.js')
require(entryPoint)
14 changes: 14 additions & 0 deletions index.js
@@ -0,0 +1,14 @@
#!/usr/bin/env node

const split = require('split')
const filter = require('wikidata-filter/lib/filter')
const getClasses = require('./lib/get_classes')

// Resolve the classes below the root class Q732577 (presumably the Wikidata
// class for publications -- confirm), then stream stdin line by line through
// an item filter built from those classes and write what it emits to stdout.
getClasses('Q732577', function (classIds) {
  const buildItemFilter = require('./lib/item_filter')
  const keepItem = buildItemFilter(classIds)

  const lines = process.stdin.pipe(split())
  const matches = lines.pipe(filter(keepItem))
  matches.pipe(process.stdout)
})
15 changes: 15 additions & 0 deletions lib/get_classes.js
@@ -0,0 +1,15 @@
const wdk = require('wikidata-sdk')
const request = require('request')

module.exports = function (root, callback) {
const sparql = "SELECT ?type WHERE { ?type wdt:P279* wd:"+root+" }"
const url = wdk.sparqlQuery(sparql)

request(url, (error, response, body) => {
const results = JSON.parse(body).results
const classes = results.bindings.map( (value) => {
return value.type.value.substr(31)
})
callback(classes)
})
}
45 changes: 45 additions & 0 deletions lib/item_filter.js
@@ -0,0 +1,45 @@
const parseLine = require('wikidata-filter/lib/parse_line')
const wdk = require('wikidata-sdk')

module.exports = function (types) {

// store types in an object for fast lookup
const typeMap = types.reduce(function(map, qid) {
map[qid] = true;
return map;
}, {});

return (line) => {
const item = parseLine(line)
if (!item || item.type != 'item') return null
if (!item.claims.P31) return null

try { // https://github.com/maxlath/wikidata-sdk/issues/17
simplify(item, 'claims')
} catch (e) {
return null
}

if (!filterType(item.claims.P31, typeMap)) return null

simplify(item, 'labels')
simplify(item, 'descriptions')
simplify(item, 'aliases')
simplify(item, 'sitelinks')

return JSON.stringify(item) + '\n'
}
}

/**
 * Tell whether at least one of the given P31 values is an accepted type.
 *
 * @param {string[]} P31 - P31 values of an item (after simplification)
 * @param {Object} types - lookup object whose own keys are the accepted ids
 * @returns {boolean} true if any value is an own key of `types`
 */
const filterType = (P31, types) => {
  // Anything that is not an array has no matching value
  if (!Array.isArray(P31)) return false

  // Own-property check: the `in` operator would also match keys inherited
  // from Object.prototype (e.g. "constructor" or "toString").
  const hasOwn = Object.prototype.hasOwnProperty
  return P31.some((qid) => hasOwn.call(types, qid))
}

/**
 * Replace item[attr] in place with its simplified form, using the
 * wikidata-sdk simplify function of the same name. Does nothing if the
 * attribute is missing or falsy.
 *
 * @param {Object} item - entity to modify
 * @param {string} attr - attribute name, also the key into wdk.simplify
 */
const simplify = (item, attr) => {
  const current = item[attr]
  if (!current) return

  const simplifier = wdk.simplify[attr]
  item[attr] = simplifier(current)
}
20 changes: 20 additions & 0 deletions package.json
@@ -0,0 +1,20 @@
{
"name": "wikicite-data",
"version": "0.0.1",
"description": "Extract and transform bibliographic data from Wikidata",
"main": "index.js",
"dependencies": {
"request": "^2.0",
"wikidata-filter": "^2.0"
},
"devDependencies": {},
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [
"wikidata",
"wikicite"
],
"author": "Jakob Voss <voss@gbv.de>",
"license": "MIT"
}

0 comments on commit 9babc46

Please sign in to comment.