Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #938 from eranroz/master
Adding scrapers for Haaretz, TheMarker and Ynet
- Loading branch information
Showing
3 changed files
with
342 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
{ | ||
"translatorID": "d6f64d96-aa6f-4fd3-816f-bdef842c7088", | ||
"label": "Haaretz", | ||
"creator": "Eran Rosenthal", | ||
"target": "^https?://www\\.haaretz\\.(co\\.il|com)/", | ||
"minVersion": "3.0", | ||
"maxVersion": "", | ||
"priority": 100, | ||
"inRepository": true, | ||
"translatorType": 4, | ||
"browserSupport": "gcsibv", | ||
"lastUpdated": "2015-08-17 22:01:10" | ||
} | ||
|
||
/** | ||
Copyright (c) 2015 Eran Rosenthal | ||
This program is free software: you can redistribute it and/or | ||
modify it under the terms of the GNU Affero General Public License | ||
as published by the Free Software Foundation, either version 3 of | ||
the License, or (at your option) any later version. | ||
This program is distributed in the hope that it will be useful, | ||
but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
Affero General Public License for more details. | ||
You should have received a copy of the GNU Affero General Public | ||
License along with this program. If not, see | ||
<http://www.gnu.org/licenses/>. | ||
*/ | ||
|
||
/**
 * Decide whether the current page can be saved by this translator.
 *
 * The page is treated as an article when an <h1> nested inside a <header>
 * element yields non-empty text.
 *
 * @param {Document} doc - The page being inspected.
 * @param {string} url - The page address (not consulted here).
 * @returns {string|undefined} 'newspaperArticle' when a headline is found,
 *     otherwise undefined.
 */
function detectWeb(doc, url) {
	var headline = ZU.xpathText(doc, '//header//h1');
	// Guard clause: without a headline there is nothing to report.
	if (!headline) {
		return;
	}
	return 'newspaperArticle';
}
|
||
/**
 * Scrape the current Haaretz page into a single newspaperArticle item.
 *
 * Title, abstract, authors, publication date and tags are read from the page
 * markup. The publication title and language are chosen from the URL:
 * addresses containing 'haaretz.com' are recorded as the English edition,
 * everything else as the Hebrew edition.
 *
 * The date and keyword nodes are optional: when either is missing the
 * corresponding field is simply left unset instead of aborting the save.
 *
 * @param {Document} doc - The article page.
 * @param {string} url - The address of the article page.
 */
function doWeb(doc, url) {
	var item = new Zotero.Item('newspaperArticle');
	item.title = ZU.xpathText(doc, '//header//h1');
	item.url = url;
	if (url.indexOf('haaretz.com') !== -1) {
		item.publicationTitle = 'Haaretz';
		item.language = 'en';
	} else {
		item.publicationTitle = 'הארץ';
		item.language = 'he';
	}

	// Prefer the visible sub-headline; fall back to the Open Graph description.
	var abstract = ZU.xpathText(doc, '//header/p');
	if (!abstract) abstract = ZU.xpathText(doc, '//meta[@property="og:description"]/@content');
	item.abstractNote = abstract;

	var authors = ZU.xpath(doc, '//address/a[@rel="author"]');
	for (var i = 0; i < authors.length; i++) {
		item.creators.push(ZU.cleanAuthor(authors[i].textContent, 'author'));
	}

	// The lookup yields nothing when the <time> node is absent, so check
	// before splitting; only the part before the 'T' separator is kept.
	var published = ZU.xpathText(doc, '//time[@itemprop="datePublished"]/@datetime');
	if (published) {
		item.date = published.split('T')[0];
	}

	// Same guard for the keyword list; trim before the emptiness test so
	// whitespace-only entries do not become empty tags.
	var keywordList = ZU.xpathText(doc, '//head/meta[@name="news_keywords"]/@content');
	var keywords = keywordList ? keywordList.split(',') : [];
	for (var k = 0; k < keywords.length; k++) {
		var tag = keywords[k].trim();
		if (tag.length > 0) item.tags.push(tag);
	}

	item.complete();
}
/** BEGIN TEST CASES **/ | ||
var testCases = [ | ||
{ | ||
"type": "web", | ||
"url": "http://www.haaretz.com/news/diplomacy-defense/1.671202", | ||
"items": [ | ||
{ | ||
"itemType": "newspaperArticle", | ||
"title": "Islamic Jihad: If Hunger Striker Dies, We'll Respond With Force Against Israel", | ||
"creators": [ | ||
{ | ||
"firstName": "Jack", | ||
"lastName": "Khoury", | ||
"creatorType": "author" | ||
}, | ||
{ | ||
"firstName": "Shirly", | ||
"lastName": "Seidler", | ||
"creatorType": "author" | ||
}, | ||
{ | ||
"firstName": "Ido", | ||
"lastName": "Efrati", | ||
"creatorType": "author" | ||
} | ||
], | ||
"date": "2015-08-14", | ||
"abstractNote": "Islamic Jihad says it will no longer be committed to maintaining calm if Mohammed Allaan, who lost consciousness after 60-day hunger strike, dies.", | ||
"language": "en", | ||
"libraryCatalog": "Haaretz", | ||
"publicationTitle": "Haaretz", | ||
"shortTitle": "Islamic Jihad", | ||
"url": "http://www.haaretz.com/news/diplomacy-defense/1.671202", | ||
"attachments": [], | ||
"tags": [ | ||
"Palestinian hunger strike" | ||
], | ||
"notes": [], | ||
"seeAlso": [] | ||
} | ||
] | ||
}, | ||
{ | ||
"type": "web", | ||
"url": "http://www.haaretz.co.il/news/politics/1.2708080", | ||
"items": [ | ||
{ | ||
"itemType": "newspaperArticle", | ||
"title": "פלסטיני דקר חייל ופצע אותו באורח קל בכביש 443 סמוך לבית חורון", | ||
"creators": [ | ||
{ | ||
"firstName": "גילי", | ||
"lastName": "כהן", | ||
"creatorType": "author" | ||
}, | ||
{ | ||
"firstName": "עמירה", | ||
"lastName": "הס", | ||
"creatorType": "author" | ||
} | ||
], | ||
"date": "2015-08-15", | ||
"abstractNote": "כוח צה\"ל שהיה במקום פתח באש לעבר הפלסטיני ופצע אותו באורח קל, והוא נעצר. החייל והדוקר פונו לבית החולים שערי צדק. בתחילת השבוע נדקר באזור צעיר ישראלי נוסף שנפצע בינוני", | ||
"language": "he", | ||
"libraryCatalog": "Haaretz", | ||
"publicationTitle": "הארץ", | ||
"url": "http://www.haaretz.co.il/news/politics/1.2708080", | ||
"attachments": [], | ||
"tags": [ | ||
"טרור", | ||
"פיגוע", | ||
"פלסטינים", | ||
"צה\"ל" | ||
], | ||
"notes": [], | ||
"seeAlso": [] | ||
} | ||
] | ||
} | ||
] | ||
/** END TEST CASES **/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
{ | ||
"translatorID": "b2d61bb5-5b21-41b7-9c83-1abcbf14639b", | ||
"label": "TheMarker", | ||
"creator": "Eran Rosenthal", | ||
"target": "^https?://www\\.themarker\\.com/", | ||
"minVersion": "3.0", | ||
"maxVersion": "", | ||
"priority": 100, | ||
"inRepository": true, | ||
"translatorType": 4, | ||
"browserSupport": "gcsibv", | ||
"lastUpdated": "2015-08-17 22:04:49" | ||
} | ||
|
||
/** | ||
Copyright (c) 2015 Eran Rosenthal | ||
This program is free software: you can redistribute it and/or | ||
modify it under the terms of the GNU Affero General Public License | ||
as published by the Free Software Foundation, either version 3 of | ||
the License, or (at your option) any later version. | ||
This program is distributed in the hope that it will be useful, | ||
but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
Affero General Public License for more details. | ||
You should have received a copy of the GNU Affero General Public | ||
License along with this program. If not, see | ||
<http://www.gnu.org/licenses/>. | ||
*/ | ||
|
||
/**
 * Decide whether the current page can be saved by this translator.
 *
 * The page is treated as an article when an <h1> nested inside a <header>
 * element yields non-empty text.
 *
 * @param {Document} doc - The page being inspected.
 * @param {string} url - The page address (not consulted here).
 * @returns {string|undefined} "magazineArticle" when a headline is found,
 *     otherwise undefined.
 */
function detectWeb(doc, url) {
	var headline = ZU.xpathText(doc, '//header//h1');
	// Guard clause: without a headline there is nothing to report.
	if (!headline) {
		return;
	}
	return "magazineArticle";
}
|
||
/**
 * Scrape the current TheMarker page into a single magazineArticle item.
 *
 * Title, abstract, publication date, authors and tags are read from the page
 * markup, and the page document itself is attached under the title
 * "TheMarker".
 *
 * The date and keyword nodes are optional: when either is missing the
 * corresponding field is simply left unset instead of aborting the save.
 *
 * @param {Document} doc - The article page.
 * @param {string} url - The address of the article page.
 */
function doWeb(doc, url) {
	// Declared locally; an undeclared assignment would create a global
	// (and throws outright in strict mode).
	var newItem = new Zotero.Item("magazineArticle");
	newItem.url = url;
	newItem.publicationTitle = "TheMarker";

	newItem.title = ZU.xpathText(doc, '//header//h1');

	// Prefer the visible sub-headline; fall back to the Open Graph description.
	var abstract = ZU.xpathText(doc, '//header/p');
	if (!abstract) abstract = ZU.xpathText(doc, '//meta[@property="og:description"]/@content');
	newItem.abstractNote = abstract;

	// The lookup yields nothing when the <time> node is absent, so check
	// before splitting; only the part before the 'T' separator is kept.
	var published = ZU.xpathText(doc, '//time[@itemprop="datePublished"]/@datetime');
	if (published) {
		newItem.date = published.split('T')[0];
	}

	var authors = ZU.xpath(doc, '//address/a[@rel="author"]');
	for (var i = 0; i < authors.length; i++) {
		newItem.creators.push(ZU.cleanAuthor(authors[i].textContent, 'author'));
	}

	// Same guard for the keyword list; trim before the emptiness test so
	// whitespace-only entries do not become empty tags.
	var keywordList = ZU.xpathText(doc, '//head/meta[@name="news_keywords"]/@content');
	var keywords = keywordList ? keywordList.split(',') : [];
	for (var k = 0; k < keywords.length; k++) {
		var tag = keywords[k].trim();
		if (tag.length > 0) newItem.tags.push(tag);
	}

	newItem.attachments = [{
		document: doc,
		title: "TheMarker"
	}];

	newItem.complete();
}
|
||
/** BEGIN TEST CASES **/ | ||
var testCases = [ | ||
{ | ||
"type": "web", | ||
"url": "http://www.themarker.com/markerweek/thisweek/1.2707370", | ||
"items": [ | ||
{ | ||
"itemType": "magazineArticle", | ||
"title": "השופט גרוסקופף מציג: הבובות של נוחי דנקנר", | ||
"creators": [ | ||
{ | ||
"firstName": "גיא", | ||
"lastName": "רולניק", | ||
"creatorType": "author" | ||
} | ||
], | ||
"date": "2015-08-15", | ||
"abstractNote": "כאשר במשק יש ריכוזי כוח כלכלי ופוליטי, לאיש אין עניין לצעוק שהמלך הוא עירום, ורוב האנשים יעדיפו לשכנע את עצמם שאלה בגדי המלך החדשים והיפים", | ||
"libraryCatalog": "TheMarker", | ||
"publicationTitle": "TheMarker", | ||
"shortTitle": "השופט גרוסקופף מציג", | ||
"url": "http://www.themarker.com/markerweek/thisweek/1.2707370", | ||
"attachments": [ | ||
{ | ||
"title": "TheMarker" | ||
} | ||
], | ||
"tags": [ | ||
"ריכוזיות" | ||
], | ||
"notes": [], | ||
"seeAlso": [] | ||
} | ||
] | ||
} | ||
] | ||
/** END TEST CASES **/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
{ | ||
"translatorID": "7f45c3f9-e387-4589-9679-225ddcf6f00e", | ||
"label": "Ynet", | ||
"creator": "Eran Rosenthal", | ||
"target": "^https?://www\\.ynet\\.co\\.il/articles/", | ||
"minVersion": "3.0", | ||
"maxVersion": "", | ||
"priority": 100, | ||
"inRepository": true, | ||
"translatorType": 4, | ||
"browserSupport": "gcsibv", | ||
"lastUpdated": "2015-08-15 17:25:20" | ||
} | ||
|
||
/** | ||
Copyright (c) 2015 Eran Rosenthal | ||
This program is free software: you can redistribute it and/or | ||
modify it under the terms of the GNU Affero General Public License | ||
as published by the Free Software Foundation, either version 3 of | ||
the License, or (at your option) any later version. | ||
This program is distributed in the hope that it will be useful, | ||
but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
Affero General Public License for more details. | ||
You should have received a copy of the GNU Affero General Public | ||
License along with this program. If not, see | ||
<http://www.gnu.org/licenses/>. | ||
*/ | ||
|
||
/**
 * Report the item type for the current page.
 *
 * Neither argument is inspected: every page handed to this function is
 * reported as a newspaper article.
 *
 * @param {Document} doc - The page being inspected (unused).
 * @param {string} url - The page address (unused).
 * @returns {string} Always 'newspaperArticle'.
 */
function detectWeb(doc, url) {
	var itemType = 'newspaperArticle';
	return itemType;
}
/**
 * Scrape the current Ynet page into a single newspaperArticle item.
 *
 * The title comes from the Open Graph title, the abstract from the article
 * sub-title (falling back to the Open Graph description), and the author from
 * the link text in the article header footer. The item language is fixed to
 * 'he'.
 *
 * @param {Document} doc - The article page.
 * @param {string} url - The address of the article page.
 */
function doWeb(doc, url) {
	var article = new Zotero.Item('newspaperArticle');

	article.title = ZU.xpathText(doc, '//meta[@property="og:title"]/@content');
	article.publicationTitle = 'Ynet';
	article.url = url;
	article.language = 'he';

	// Sub-title first; the Open Graph description is only looked up when the
	// sub-title is empty or missing.
	article.abstractNote = ZU.xpathText(doc, '//div[@class="art_header_sub_title"]')
		|| ZU.xpathText(doc, '//meta[@property="og:description"]/@content');

	var byline = ZU.xpathText(doc, '//div[@class="art_header_footer"]//a');
	if (byline) {
		article.creators.push(Zotero.Utilities.cleanAuthor(byline, 'author'));
	}

	// The published-time value must end in three dot-separated two-digit
	// groups, which are reassembled as 20<third>-<second>-<first>.
	// When the pattern does not match, the date is left unset.
	var rawDate = ZU.xpathText(doc, '//meta[@property="vr:published_time"]/@content');
	var parts = /([0-9]{2})\.([0-9]{2})\.([0-9]{2})$/.exec(rawDate);
	if (parts) {
		article.date = '20' + parts[3] + '-' + parts[2] + '-' + parts[1];
	}

	article.complete();
}
/** BEGIN TEST CASES **/ | ||
var testCases = [ | ||
{ | ||
"type": "web", | ||
"url": "http://www.ynet.co.il/articles/0,7340,L-4690772,00.html", | ||
"items": [ | ||
{ | ||
"itemType": "newspaperArticle", | ||
"title": "תעלומת הקצין מארה\"ב, הסודות והמאהבת", | ||
"creators": [ | ||
{ | ||
"firstName": "ירון", | ||
"lastName": "דרוקמן", | ||
"creatorType": "author" | ||
} | ||
], | ||
"date": "2015-08-15", | ||
"abstractNote": "הכותרות בישרו השבוע לפני 15 שנים על פרשת ריגול מהסרטים: אמריקני-יהודי שהתגייר, בא לישראל עם חומר סודי ביותר ומטריף את הממשל מדאגה. חברתו העידה שהסתובב עם פאה ושפם והתקשר אליה מטלפונים ציבוריים לסניף הדואר. בסוף מצאו אותו במצפה רמון, והאמת התבררה. בערך", | ||
"language": "he", | ||
"libraryCatalog": "Ynet", | ||
"publicationTitle": "Ynet", | ||
"url": "http://www.ynet.co.il/articles/0,7340,L-4690772,00.html", | ||
"attachments": [], | ||
"tags": [], | ||
"notes": [], | ||
"seeAlso": [] | ||
} | ||
] | ||
} | ||
] | ||
/** END TEST CASES **/ |