New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding scrapers for Haaretz, TheMarker and Ynet #938

Merged
merged 1 commit into from Aug 17, 2015
Jump to file or symbol
Failed to load files and symbols.
+342 −0
Diff settings

Always

Just for now

View
@@ -0,0 +1,147 @@
{
"translatorID": "d6f64d96-aa6f-4fd3-816f-bdef842c7088",
"label": "Haaretz",
"creator": "Eran Rosenthal",
"target": "^https?://www\\.haaretz\\.(co\\.il|com)/",
"minVersion": "3.0",
"maxVersion": "",
"priority": 100,
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsibv",
"lastUpdated": "2015-08-17 22:01:10"
}
/**
Copyright (c) 2015 Eran Rosenthal
This program is free software: you can redistribute it and/or
modify it under the terms of the GNU Affero General Public License
as published by the Free Software Foundation, either version 3 of
the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public
License along with this program. If not, see
<http://www.gnu.org/licenses/>.
*/
/**
 * Tells the caller which item type this page should be treated as.
 *
 * @param doc  Page document that is queried through ZU.xpathText.
 * @param url  Page address; not consulted by this check.
 * @returns 'newspaperArticle' when the '//header//h1' lookup yields a
 *          truthy value, otherwise undefined.
 */
function detectWeb(doc, url) {
	var headline = ZU.xpathText(doc, '//header//h1');
	return headline ? 'newspaperArticle' : undefined;
}
/**
 * Builds a 'newspaperArticle' item from a Haaretz article page and hands it
 * off with item.complete().
 *
 * Title, abstract, authors, publication date and keyword tags are read from
 * the page through XPath lookups. The date and the tags are optional: when
 * the corresponding markup is absent the field is simply left unset instead
 * of aborting the whole item.
 *
 * @param doc  Page document that the XPath lookups run against.
 * @param url  Page address; stored on the item and used to pick the edition.
 */
function doWeb(doc, url) {
	var item = new Zotero.Item('newspaperArticle');
	item.title = ZU.xpathText(doc, '//header//h1');
	item.url = url;

	// URLs containing haaretz.com are the English edition; any other URL is
	// treated as the Hebrew edition.
	if (url.indexOf('haaretz.com') !== -1) {
		item.publicationTitle = 'Haaretz';
		item.language = 'en';
	} else {
		item.publicationTitle = 'הארץ';
		item.language = 'he';
	}

	// Prefer the lead paragraph in the header, fall back to the Open Graph
	// description.
	var abstract = ZU.xpathText(doc, '//header/p');
	if (!abstract) {
		abstract = ZU.xpathText(doc, '//meta[@property="og:description"]/@content');
	}
	item.abstractNote = abstract;

	var authors = ZU.xpath(doc, '//address/a[@rel="author"]');
	for (var i = 0; i < authors.length; i++) {
		item.creators.push(ZU.cleanAuthor(authors[i].textContent, 'author'));
	}

	// xpathText yields a falsy value when nothing matches (the abstract
	// fallback above relies on the same behaviour), so check before
	// splitting. Only the part before 'T' of the datetime is kept.
	var published = ZU.xpathText(doc, '//time[@itemprop="datePublished"]/@datetime');
	if (published) {
		item.date = published.split('T')[0];
	}

	// Keywords arrive as one comma-separated string. Trim first, then drop
	// empties, so whitespace-only entries never become blank tags.
	var keywordList = ZU.xpathText(doc, '//head/meta[@name="news_keywords"]/@content');
	if (keywordList) {
		var keywords = keywordList.split(',');
		for (var k = 0; k < keywords.length; k++) {
			var tag = keywords[k].trim();
			if (tag) {
				item.tags.push(tag);
			}
		}
	}

	item.complete();
}
/** BEGIN TEST CASES **/
var testCases = [
{
"type": "web",
"url": "http://www.haaretz.com/news/diplomacy-defense/1.671202",
"items": [
{
"itemType": "newspaperArticle",
"title": "Islamic Jihad: If Hunger Striker Dies, We'll Respond With Force Against Israel",
"creators": [
{
"firstName": "Jack",
"lastName": "Khoury",
"creatorType": "author"
},
{
"firstName": "Shirly",
"lastName": "Seidler",
"creatorType": "author"
},
{
"firstName": "Ido",
"lastName": "Efrati",
"creatorType": "author"
}
],
"date": "2015-08-14",
"abstractNote": "Islamic Jihad says it will no longer be committed to maintaining calm if Mohammed Allaan, who lost consciousness after 60-day hunger strike, dies.",
"language": "en",
"libraryCatalog": "Haaretz",
"publicationTitle": "Haaretz",
"shortTitle": "Islamic Jihad",
"url": "http://www.haaretz.com/news/diplomacy-defense/1.671202",
"attachments": [],
"tags": [
"Palestinian hunger strike"
],
"notes": [],
"seeAlso": []
}
]
},
{
"type": "web",
"url": "http://www.haaretz.co.il/news/politics/1.2708080",
"items": [
{
"itemType": "newspaperArticle",
"title": "פלסטיני דקר חייל ופצע אותו באורח קל בכביש 443 סמוך לבית חורון",
"creators": [
{
"firstName": "גילי",
"lastName": "כהן",
"creatorType": "author"
},
{
"firstName": "עמירה",
"lastName": "הס",
"creatorType": "author"
}
],
"date": "2015-08-15",
"abstractNote": "כוח צה\"ל שהיה במקום פתח באש לעבר הפלסטיני ופצע אותו באורח קל, והוא נעצר. החייל והדוקר פונו לבית החולים שערי צדק. בתחילת השבוע נדקר באזור צעיר ישראלי נוסף שנפצע בינוני",
"language": "he",
"libraryCatalog": "Haaretz",
"publicationTitle": "הארץ",
"url": "http://www.haaretz.co.il/news/politics/1.2708080",
"attachments": [],
"tags": [
"טרור",
"פיגוע",
"פלסטינים",
"צה\"ל"
],
"notes": [],
"seeAlso": []
}
]
}
]
/** END TEST CASES **/
View
@@ -0,0 +1,105 @@
{
"translatorID": "b2d61bb5-5b21-41b7-9c83-1abcbf14639b",
"label": "TheMarker",
"creator": "Eran Rosenthal",
"target": "^https?://www\\.themarker\\.com/",
"minVersion": "3.0",
"maxVersion": "",
"priority": 100,
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsibv",
"lastUpdated": "2015-08-17 22:04:49"
}
/**
Copyright (c) 2015 Eran Rosenthal
This program is free software: you can redistribute it and/or
modify it under the terms of the GNU Affero General Public License
as published by the Free Software Foundation, either version 3 of
the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public
License along with this program. If not, see
<http://www.gnu.org/licenses/>.
*/
/**
 * Decides whether the current page is offered as a magazine article.
 *
 * @param doc  Page document that is queried through ZU.xpathText.
 * @param url  Page address; not consulted by this check.
 * @returns "magazineArticle" when the '//header//h1' lookup yields a
 *          truthy value, otherwise undefined.
 */
function detectWeb(doc, url) {
	var heading = ZU.xpathText(doc, '//header//h1');
	if (!heading) {
		return undefined;
	}
	return "magazineArticle";
}
/**
 * Builds a "magazineArticle" item from a TheMarker article page, attaches
 * the page document to it, and hands it off with newItem.complete().
 *
 * Title, abstract, date, authors and keyword tags are read from the page
 * through XPath lookups. The date and the tags are optional: when the
 * corresponding markup is absent the field is left unset instead of
 * aborting the whole item.
 *
 * @param doc  Page document that the XPath lookups run against; also stored
 *             as the item's single attachment.
 * @param url  Page address; stored on the item.
 */
function doWeb(doc, url) {
	// Declared with var so the item does not leak into the global scope.
	var newItem = new Zotero.Item("magazineArticle");
	newItem.url = url;
	newItem.publicationTitle = "TheMarker";
	newItem.title = ZU.xpathText(doc, '//header//h1');

	// Prefer the lead paragraph in the header, fall back to the Open Graph
	// description.
	var abstract = ZU.xpathText(doc, '//header/p');
	if (!abstract) {
		abstract = ZU.xpathText(doc, '//meta[@property="og:description"]/@content');
	}
	newItem.abstractNote = abstract;

	// xpathText yields a falsy value when nothing matches (the abstract
	// fallback above relies on the same behaviour), so check before
	// splitting. Only the part before 'T' of the datetime is kept.
	var published = ZU.xpathText(doc, '//time[@itemprop="datePublished"]/@datetime');
	if (published) {
		newItem.date = published.split('T')[0];
	}

	var authors = ZU.xpath(doc, '//address/a[@rel="author"]');
	for (var i = 0; i < authors.length; i++) {
		newItem.creators.push(ZU.cleanAuthor(authors[i].textContent, 'author'));
	}

	// Keywords arrive as one comma-separated string. Trim first, then drop
	// empties, so whitespace-only entries never become blank tags.
	var keywordList = ZU.xpathText(doc, '//head/meta[@name="news_keywords"]/@content');
	if (keywordList) {
		var keywords = keywordList.split(',');
		for (var k = 0; k < keywords.length; k++) {
			var tag = keywords[k].trim();
			if (tag) {
				newItem.tags.push(tag);
			}
		}
	}

	newItem.attachments = [{
		document: doc,
		title: "TheMarker"
	}];
	newItem.complete();
}
/** BEGIN TEST CASES **/
var testCases = [
{
"type": "web",
"url": "http://www.themarker.com/markerweek/thisweek/1.2707370",
"items": [
{
"itemType": "magazineArticle",
"title": "השופט גרוסקופף מציג: הבובות של נוחי דנקנר",
"creators": [
{
"firstName": "גיא",
"lastName": "רולניק",
"creatorType": "author"
}
],
"date": "2015-08-15",
"abstractNote": "כאשר במשק יש ריכוזי כוח כלכלי ופוליטי, לאיש אין עניין לצעוק שהמלך הוא עירום, ורוב האנשים יעדיפו לשכנע את עצמם שאלה בגדי המלך החדשים והיפים",
"libraryCatalog": "TheMarker",
"publicationTitle": "TheMarker",
"shortTitle": "השופט גרוסקופף מציג",
"url": "http://www.themarker.com/markerweek/thisweek/1.2707370",
"attachments": [
{
"title": "TheMarker"
}
],
"tags": [
"ריכוזיות"
],
"notes": [],
"seeAlso": []
}
]
}
]
/** END TEST CASES **/
View
90 Ynet.js
@@ -0,0 +1,90 @@
{
"translatorID": "7f45c3f9-e387-4589-9679-225ddcf6f00e",
"label": "Ynet",
"creator": "Eran Rosenthal",
"target": "^https?://www\\.ynet\\.co\\.il/articles/",
"minVersion": "3.0",
"maxVersion": "",
"priority": 100,
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsibv",
"lastUpdated": "2015-08-15 17:25:20"
}
/**
Copyright (c) 2015 Eran Rosenthal
This program is free software: you can redistribute it and/or
modify it under the terms of the GNU Affero General Public License
as published by the Free Software Foundation, either version 3 of
the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public
License along with this program. If not, see
<http://www.gnu.org/licenses/>.
*/
/**
 * Reports the item type for the current page.
 *
 * The page itself is not inspected: every call yields the same type.
 *
 * @param doc  Page document; unused.
 * @param url  Page address; unused.
 * @returns Always 'newspaperArticle'.
 */
function detectWeb(doc, url) {
	var itemType = 'newspaperArticle';
	return itemType;
}
/**
 * Builds a 'newspaperArticle' item from a Ynet article page and hands it
 * off with item.complete().
 *
 * @param doc  Page document that the XPath lookups run against.
 * @param url  Page address; stored on the item.
 */
function doWeb(doc, url) {
	var item = new Zotero.Item('newspaperArticle');

	item.title = ZU.xpathText(doc, '//meta[@property="og:title"]/@content');
	item.publicationTitle = 'Ynet';
	item.url = url;
	item.language = 'he';

	// Sub-title shown under the headline, or the Open Graph description when
	// the sub-title lookup comes back empty.
	item.abstractNote = ZU.xpathText(doc, '//div[@class="art_header_sub_title"]')
		|| ZU.xpathText(doc, '//meta[@property="og:description"]/@content');

	var byline = ZU.xpathText(doc, '//div[@class="art_header_footer"]//a');
	if (byline) {
		item.creators.push(Zotero.Utilities.cleanAuthor(byline, 'author'));
	}

	// The published-time value is expected to end with three dot-separated
	// two-digit groups. They are reassembled in reverse order with a '20'
	// prefix on the last group, i.e. read as day.month.year -> 20YY-MM-DD.
	var publishedText = ZU.xpathText(doc, '//meta[@property="vr:published_time"]/@content');
	var parts = /([0-9]{2})\.([0-9]{2})\.([0-9]{2})$/.exec(publishedText);
	if (parts) {
		var year = '20' + parts[3];
		item.date = year + '-' + parts[2] + '-' + parts[1];
	}

	item.complete();
}
/** BEGIN TEST CASES **/
var testCases = [
{
"type": "web",
"url": "http://www.ynet.co.il/articles/0,7340,L-4690772,00.html",
"items": [
{
"itemType": "newspaperArticle",
"title": "תעלומת הקצין מארה\"ב, הסודות והמאהבת",
"creators": [
{
"firstName": "ירון",
"lastName": "דרוקמן",
"creatorType": "author"
}
],
"date": "2015-08-15",
"abstractNote": "הכותרות בישרו השבוע לפני 15 שנים על פרשת ריגול מהסרטים: אמריקני-יהודי שהתגייר, בא לישראל עם חומר סודי ביותר ומטריף את הממשל מדאגה. חברתו העידה שהסתובב עם פאה ושפם והתקשר אליה מטלפונים ציבוריים לסניף הדואר. בסוף מצאו אותו במצפה רמון, והאמת התבררה. בערך",
"language": "he",
"libraryCatalog": "Ynet",
"publicationTitle": "Ynet",
"url": "http://www.ynet.co.il/articles/0,7340,L-4690772,00.html",
"attachments": [],
"tags": [],
"notes": [],
"seeAlso": []
}
]
}
]
/** END TEST CASES **/
ProTip! Use n and p to navigate between commits in a pull request.