Skip to content

Commit

Permalink
Merge pull request #938 from eranroz/master
Browse files Browse the repository at this point in the history
Adding scrapers for Haaretz, TheMarker and Ynet
  • Loading branch information
adam3smith committed Aug 17, 2015
2 parents 26e564a + 1f6ccfe commit 911d25e
Show file tree
Hide file tree
Showing 3 changed files with 342 additions and 0 deletions.
147 changes: 147 additions & 0 deletions Haaretz.js
@@ -0,0 +1,147 @@
{
"translatorID": "d6f64d96-aa6f-4fd3-816f-bdef842c7088",
"label": "Haaretz",
"creator": "Eran Rosenthal",
"target": "^https?://www\\.haaretz\\.(co\\.il|com)/",
"minVersion": "3.0",
"maxVersion": "",
"priority": 100,
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsibv",
"lastUpdated": "2015-08-17 22:01:10"
}

/**
Copyright (c) 2015 Eran Rosenthal
This program is free software: you can redistribute it and/or
modify it under the terms of the GNU Affero General Public License
as published by the Free Software Foundation, either version 3 of
the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public
License along with this program. If not, see
<http://www.gnu.org/licenses/>.
*/

/**
 * Reports the item type for a Haaretz page.
 *
 * @param {Document} doc - page DOM to inspect
 * @param {string} url - page URL (not consulted here)
 * @returns {string|undefined} 'newspaperArticle' when the XPath
 *   '//header//h1' yields truthy text; otherwise undefined
 */
function detectWeb(doc, url) {
	var headline = ZU.xpathText(doc, '//header//h1');
	return headline ? 'newspaperArticle' : undefined;
}

/**
 * Builds a newspaperArticle item from a Haaretz article page and completes it.
 *
 * Title, abstract, authors, publication date and keyword tags are read from
 * the page. Publication title and language are chosen by whether the URL
 * contains 'haaretz.com' (English) or not (Hebrew).
 *
 * The date and the keyword tags are optional: when the page lacks the
 * corresponding element the field is simply left unset instead of aborting
 * the whole save.
 *
 * @param {Document} doc - page DOM to scrape
 * @param {string} url - page URL; stored on the item as-is
 */
function doWeb(doc, url) {
	var item = new Zotero.Item('newspaperArticle');
	item.title = ZU.xpathText(doc, '//header//h1');
	item.url = url;
	if (url.indexOf('haaretz.com') !== -1) {
		item.publicationTitle = 'Haaretz';
		item.language = 'en';
	} else {
		item.publicationTitle = 'הארץ';
		item.language = 'he';
	}

	// Prefer the lead paragraph in the header; fall back to the Open Graph description.
	var abstract = ZU.xpathText(doc, '//header/p');
	if (!abstract) abstract = ZU.xpathText(doc, '//meta[@property="og:description"]/@content');
	item.abstractNote = abstract;

	var authors = ZU.xpath(doc, '//address/a[@rel="author"]');
	for (var i = 0; i < authors.length; i++) {
		item.creators.push(ZU.cleanAuthor(authors[i].textContent, 'author'));
	}

	// xpathText may yield nothing; calling .split on that would throw,
	// so only derive the date when a datetime attribute was found.
	var published = ZU.xpathText(doc, '//time[@itemprop="datePublished"]/@datetime');
	if (published) {
		// Keep only the part before the 'T' separator.
		item.date = published.split('T')[0];
	}

	var keywordList = ZU.xpathText(doc, '//head/meta[@name="news_keywords"]/@content');
	var keywords = keywordList ? keywordList.split(',') : [];
	for (var k = 0; k < keywords.length; k++) {
		// Trim before the emptiness check so whitespace-only entries are dropped.
		var tag = keywords[k].trim();
		if (tag.length > 0) item.tags.push(tag);
	}
	item.complete();
}
/** BEGIN TEST CASES **/
var testCases = [
{
"type": "web",
"url": "http://www.haaretz.com/news/diplomacy-defense/1.671202",
"items": [
{
"itemType": "newspaperArticle",
"title": "Islamic Jihad: If Hunger Striker Dies, We'll Respond With Force Against Israel",
"creators": [
{
"firstName": "Jack",
"lastName": "Khoury",
"creatorType": "author"
},
{
"firstName": "Shirly",
"lastName": "Seidler",
"creatorType": "author"
},
{
"firstName": "Ido",
"lastName": "Efrati",
"creatorType": "author"
}
],
"date": "2015-08-14",
"abstractNote": "Islamic Jihad says it will no longer be committed to maintaining calm if Mohammed Allaan, who lost consciousness after 60-day hunger strike, dies.",
"language": "en",
"libraryCatalog": "Haaretz",
"publicationTitle": "Haaretz",
"shortTitle": "Islamic Jihad",
"url": "http://www.haaretz.com/news/diplomacy-defense/1.671202",
"attachments": [],
"tags": [
"Palestinian hunger strike"
],
"notes": [],
"seeAlso": []
}
]
},
{
"type": "web",
"url": "http://www.haaretz.co.il/news/politics/1.2708080",
"items": [
{
"itemType": "newspaperArticle",
"title": "פלסטיני דקר חייל ופצע אותו באורח קל בכביש 443 סמוך לבית חורון",
"creators": [
{
"firstName": "גילי",
"lastName": "כהן",
"creatorType": "author"
},
{
"firstName": "עמירה",
"lastName": "הס",
"creatorType": "author"
}
],
"date": "2015-08-15",
"abstractNote": "כוח צה\"ל שהיה במקום פתח באש לעבר הפלסטיני ופצע אותו באורח קל, והוא נעצר. החייל והדוקר פונו לבית החולים שערי צדק. בתחילת השבוע נדקר באזור צעיר ישראלי נוסף שנפצע בינוני",
"language": "he",
"libraryCatalog": "Haaretz",
"publicationTitle": "הארץ",
"url": "http://www.haaretz.co.il/news/politics/1.2708080",
"attachments": [],
"tags": [
"טרור",
"פיגוע",
"פלסטינים",
"צה\"ל"
],
"notes": [],
"seeAlso": []
}
]
}
]
/** END TEST CASES **/
105 changes: 105 additions & 0 deletions TheMarker.js
@@ -0,0 +1,105 @@
{
"translatorID": "b2d61bb5-5b21-41b7-9c83-1abcbf14639b",
"label": "TheMarker",
"creator": "Eran Rosenthal",
"target": "^https?://www\\.themarker\\.com/",
"minVersion": "3.0",
"maxVersion": "",
"priority": 100,
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsibv",
"lastUpdated": "2015-08-17 22:04:49"
}

/**
Copyright (c) 2015 Eran Rosenthal
This program is free software: you can redistribute it and/or
modify it under the terms of the GNU Affero General Public License
as published by the Free Software Foundation, either version 3 of
the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public
License along with this program. If not, see
<http://www.gnu.org/licenses/>.
*/

/**
 * Reports the item type for a TheMarker page.
 *
 * @param {Document} doc - page DOM to inspect
 * @param {string} url - page URL (not consulted here)
 * @returns {string|undefined} "magazineArticle" when the XPath
 *   '//header//h1' yields truthy text; otherwise undefined
 */
function detectWeb(doc, url) {
	var headline = ZU.xpathText(doc, '//header//h1');
	if (!headline) {
		return;
	}
	return "magazineArticle";
}

/**
 * Builds a magazineArticle item from a TheMarker article page, attaches the
 * page document as an attachment titled "TheMarker", and completes the item.
 *
 * The date and the keyword tags are optional: when the page lacks the
 * corresponding element the field is left unset instead of aborting the save.
 *
 * @param {Document} doc - page DOM to scrape; also attached to the item
 * @param {string} url - page URL; stored on the item as-is
 */
function doWeb(doc, url) {
	// Declared locally: an undeclared assignment would create a global
	// (and is a ReferenceError in strict mode).
	var newItem = new Zotero.Item("magazineArticle");
	newItem.url = url;
	newItem.publicationTitle = "TheMarker";

	newItem.title = ZU.xpathText(doc, '//header//h1');

	// Prefer the lead paragraph in the header; fall back to the Open Graph description.
	var abstract = ZU.xpathText(doc, '//header/p');
	if (!abstract) abstract = ZU.xpathText(doc, '//meta[@property="og:description"]/@content');
	newItem.abstractNote = abstract;

	// xpathText may yield nothing; calling .split on that would throw,
	// so only derive the date when a datetime attribute was found.
	var published = ZU.xpathText(doc, '//time[@itemprop="datePublished"]/@datetime');
	if (published) {
		// Keep only the part before the 'T' separator.
		newItem.date = published.split('T')[0];
	}

	var authors = ZU.xpath(doc, '//address/a[@rel="author"]');
	for (var i = 0; i < authors.length; i++) {
		newItem.creators.push(ZU.cleanAuthor(authors[i].textContent, 'author'));
	}

	var keywordList = ZU.xpathText(doc, '//head/meta[@name="news_keywords"]/@content');
	var keywords = keywordList ? keywordList.split(',') : [];
	for (var k = 0; k < keywords.length; k++) {
		// Trim before the emptiness check so whitespace-only entries are dropped.
		var tag = keywords[k].trim();
		if (tag.length > 0) newItem.tags.push(tag);
	}

	newItem.attachments = [{
		document: doc,
		title: "TheMarker"
	}];

	newItem.complete();
}

/** BEGIN TEST CASES **/
var testCases = [
{
"type": "web",
"url": "http://www.themarker.com/markerweek/thisweek/1.2707370",
"items": [
{
"itemType": "magazineArticle",
"title": "השופט גרוסקופף מציג: הבובות של נוחי דנקנר",
"creators": [
{
"firstName": "גיא",
"lastName": "רולניק",
"creatorType": "author"
}
],
"date": "2015-08-15",
"abstractNote": "כאשר במשק יש ריכוזי כוח כלכלי ופוליטי, לאיש אין עניין לצעוק שהמלך הוא עירום, ורוב האנשים יעדיפו לשכנע את עצמם שאלה בגדי המלך החדשים והיפים",
"libraryCatalog": "TheMarker",
"publicationTitle": "TheMarker",
"shortTitle": "השופט גרוסקופף מציג",
"url": "http://www.themarker.com/markerweek/thisweek/1.2707370",
"attachments": [
{
"title": "TheMarker"
}
],
"tags": [
"ריכוזיות"
],
"notes": [],
"seeAlso": []
}
]
}
]
/** END TEST CASES **/
90 changes: 90 additions & 0 deletions Ynet.js
@@ -0,0 +1,90 @@
{
"translatorID": "7f45c3f9-e387-4589-9679-225ddcf6f00e",
"label": "Ynet",
"creator": "Eran Rosenthal",
"target": "^https?://www\\.ynet\\.co\\.il/articles/",
"minVersion": "3.0",
"maxVersion": "",
"priority": 100,
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsibv",
"lastUpdated": "2015-08-15 17:25:20"
}

/**
Copyright (c) 2015 Eran Rosenthal
This program is free software: you can redistribute it and/or
modify it under the terms of the GNU Affero General Public License
as published by the Free Software Foundation, either version 3 of
the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public
License along with this program. If not, see
<http://www.gnu.org/licenses/>.
*/

/**
 * Reports the item type for a Ynet page.
 *
 * Neither argument is inspected: every page this function is called on is
 * reported as a newspaper article.
 *
 * @param {Document} doc - page DOM (unused)
 * @param {string} url - page URL (unused)
 * @returns {string} always 'newspaperArticle'
 */
function detectWeb(doc, url) {
	var itemType = 'newspaperArticle';
	return itemType;
}
/**
 * Builds a newspaperArticle item from a Ynet article page and completes it.
 *
 * The title comes from the og:title meta tag. The abstract comes from the
 * sub-title div, falling back to og:description. The author and date are set
 * only when the page provides them.
 *
 * @param {Document} doc - page DOM to scrape
 * @param {string} url - page URL; stored on the item as-is
 */
function doWeb(doc, url) {
	var article = new Zotero.Item('newspaperArticle');

	article.title = ZU.xpathText(doc, '//meta[@property="og:title"]/@content');
	article.publicationTitle = 'Ynet';
	article.url = url;
	article.language = 'he';
	article.abstractNote = ZU.xpathText(doc, '//div[@class="art_header_sub_title"]')
		|| ZU.xpathText(doc, '//meta[@property="og:description"]/@content');

	var byline = ZU.xpathText(doc, '//div[@class="art_header_footer"]//a');
	if (byline) {
		article.creators.push(Zotero.Utilities.cleanAuthor(byline, 'author'));
	}

	var publishedRaw = ZU.xpathText(doc, '//meta[@property="vr:published_time"]/@content');
	var parts = /([0-9]{2})\.([0-9]{2})\.([0-9]{2})$/.exec(publishedRaw);
	if (parts) {
		// The three trailing two-digit groups are rearranged (third, second, first)
		// and the third is prefixed with '20' to form a four-digit year.
		article.date = '20' + parts[3] + '-' + parts[2] + '-' + parts[1];
	}
	article.complete();
}
/** BEGIN TEST CASES **/
var testCases = [
{
"type": "web",
"url": "http://www.ynet.co.il/articles/0,7340,L-4690772,00.html",
"items": [
{
"itemType": "newspaperArticle",
"title": "תעלומת הקצין מארה\"ב, הסודות והמאהבת",
"creators": [
{
"firstName": "ירון",
"lastName": "דרוקמן",
"creatorType": "author"
}
],
"date": "2015-08-15",
"abstractNote": "הכותרות בישרו השבוע לפני 15 שנים על פרשת ריגול מהסרטים: אמריקני-יהודי שהתגייר, בא לישראל עם חומר סודי ביותר ומטריף את הממשל מדאגה. חברתו העידה שהסתובב עם פאה ושפם והתקשר אליה מטלפונים ציבוריים לסניף הדואר. בסוף מצאו אותו במצפה רמון, והאמת התבררה. בערך",
"language": "he",
"libraryCatalog": "Ynet",
"publicationTitle": "Ynet",
"url": "http://www.ynet.co.il/articles/0,7340,L-4690772,00.html",
"attachments": [],
"tags": [],
"notes": [],
"seeAlso": []
}
]
}
]
/** END TEST CASES **/

0 comments on commit 911d25e

Please sign in to comment.