Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Update bot filtering for webrequests.
Rename is_crawler to isSpider to be more coherent with data tagging. Update the spider-matching function with a better regexp and WikimediaBot removal. Add a function matching WikimediaBot and its associated UDF. Update and add tests. Bug: T108598 Change-Id: I3b468050b613c1e97d87b782cbfd90c9fdc433b8
- Loading branch information
Showing
10 changed files
with
323 additions
and
22 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
test_description, isSpider, isWikimediaBot, user_agent | ||
is spider - Google, true,false,MediaWikiCrawler-Google/2.0 (+wikidata-external@google.com) | ||
is spider - goo.ne.jp, true,false,goo wikipedia (http://help.goo.ne.jp/contact/) | ||
is spider - bing bot, true, false,Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) | ||
is spider - dash, true, false,- | ||
is spider - google bot, true, false,Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html) | ||
is spider - yahoo bot, true, false,Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp) | ||
is spider - peachy bot, true, false,Peachy MediaWiki Bot API Version 2.0 (alpha 8) | ||
is spider - google bot safari, true, false,Mozilla/5.0 (iPhone; CPU iPhone OS 8_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML; like Gecko) Version/8.0 Mobile/12F70 Safari/600.1.4 (compatible; Googlebot/2.1; +http://www.google.com/bot.html) | ||
is spider - baidu bot, true, false,Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html) | ||
is spider - yandex bot, true, false,Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots) | ||
is spider - wikiwix bot, true, false,wikiwix-bot-3.0 | ||
is spider - java 8 unknown bot, true, false,Java/1.8.0_51 | ||
is spider - bing bot safari, true, false,Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML; like Gecko) Version/7.0 Mobile/11A465 Safari/9537.53 (compatible; bingbot/2.0; http://www.bing.com/bingbot.htm) | ||
is spider - apple dictionary bot, true, false,AppleDictionaryService/229 | ||
is spider - php wikibot, true, false,php wikibot classes | ||
is spider - MS Search bot, true, false,Mozilla/4.0 (compatible; MSIE 4.01; Windows NT; MS Search 6.0 Robot) | ||
is spider - Python unknown bot, true, false,python-requests/2.7.0 CPython/3.4.2 Linux/3.16.0-4-amd64 | ||
is spider - searchmetrics bot, true, false,Mozilla/5.0 (compatible; SearchmetricsBot; http://www.searchmetrics.com/en/searchmetrics-bot/) | ||
is spider - facebook external hit, true, false,facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php) | ||
is spider - apple dictionary bot, true, false,AppleDictionaryService/229.1 | ||
is spider - cliqzbot, true, false,Mozilla/5.0 (compatible; Cliqzbot/1.0 +http://cliqz.com/company/cliqzbot) | ||
is spider - apple bot, true, false,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/600.2.5 (KHTML; like Gecko) Version/8.0.2 Safari/600.2.5 (Applebot/0.1; +http://www.apple.com/go/applebot) | ||
is spider - java 8 unknown bot, true, false,Java/1.8.0_25 | ||
is spider - DotNetWikiBot, true, false,DotNetWikiBot/2.101 (Microsoft Windows NT 6.2.9200.0; .NET CLR 4.0.30319.34209) | ||
is spider - Pywikibot, true, false,wymowa (commons:commons; User:Alkamid) Pywikibot/2.0b3 (g3) requests/2.7.0 Python/3.4.0.final.0 | ||
is spider - msn media bot, true, false,msnbot-media/1.1 (+http://search.msn.com/msnbot.htm) | ||
is spider - youdaobot, true, false,Mozilla/5.0 (compatible; YoudaoBot/1.0; http://www.youdao.com/help/webmaster/spider/; ) | ||
is spider - java 8 unknown bot, true, false,Java/1.8.0_40 | ||
is spider - java 6 unknown bot, true, false,Java/1.6.0_20 | ||
is spider - java 8 unknown bot, true, false,Java/1.8.0_45 | ||
is spider - Python unknown bot, true, false,Python-urllib/2.7 | ||
is spider - java 7 unknown bot, true, false,Java/1.7.0_67 | ||
is spider - mail.ru_bot, true, false,Mozilla/5.0 (compatible; Linux x86_64; Mail.RU_Bot/Img/2.0; +http://go.mail.ru/help/robots) | ||
is spider - java 7 unknown bot, true, false,Java/1.7.0_79 | ||
is spider - RBot, true, false,RBot/0.3 (underdog@wolfhome.com) | ||
is spider - Pywikipediabot, true, false,pywikipedia-git-wdlabel.py/r581 Pywikipediabot/1.0 | ||
is spider - mail.ru_bot, true, false,Mozilla/5.0 (compatible; Linux x86_64; Mail.RU_Bot/2.0; +http://go.mail.ru/help/robots) | ||
is spider - sogou bot, true, false,Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07) | ||
is spider - java 7 unknown bot, true, false,Java/1.7.0_65 | ||
is spider - taxonbot, true, false,TaxonBot@de.wikipedia <animalia@gmx.net> – MediaWiki Tcl Bot Framework 0.5 | ||
is spider - apple dictionary bot, true, false,AppleDictionaryService/208 | ||
is spider - ClueBot, true, false,ClueBot/1.1 | ||
is spider - Unknown bot, true, false,Mozilla/5.0 (MyMemory Bot http://mymemory.traslated.net/doc/) | ||
is spider - baidu image bot, true, false,Baiduspider-image+(+http://www.baidu.com/search/spider.htm) | ||
is spider - Pywikipediabot, true, false,pywikipedia-addzumra.py/rg11224 Pywikipediabot/1.0 Unknown | ||
is spider - yeti bot, true, false,Mozilla/5.0 (compatible; Yeti/1.1; +http://help.naver.com/robots/) | ||
is spider - Pywikipediabot, true, false,pwb/rg3113 Pywikipediabot/2.0 | ||
is spider - exabot, true, false,Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot) | ||
is spider - Python unknown bot, true, false,Python-urllib/2.6 | ||
is spider - yacybot, true, false,yacybot (/global; amd64 Linux 3.13.0-63-generic; java 1.7.0_80; Europe/en) http://yacy.net/bot.html | ||
is spider - Pywikibot, true, false,maj_articles_recents (wikipedia:fr; User:Z%C3%A9roBot) Pywikibot/2.0b3 (g4795) httplib2/0.7.2 Python/2.7.3.final.0 | ||
is spider - Pywikibot, true, false,CategorieAutoriPer (wikisource:it; User:CandalBot) Pywikibot/2.0b3 (g5671) requests/2.0.0 Python/2.7.3.final.0 | ||
is spider - curl bot, true, false,curl/7.35.0 | ||
Is Not spider - internet explorer 11, false,false,Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko | ||
Is Not spider - iphone, false,false,Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53 | ||
Is Not spider - opera, false,false,Opera/9.80 (Android; Opera Mini/7.6.35843/35.5858; U; en) Presto/2.8.119 Version/11.10 | ||
Is Not spider - WikimediaBot, false,true,Whatever UA info containing WikimediaBot should match. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
35 changes: 35 additions & 0 deletions
35
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsSpiderUDF.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
/** | ||
* Copyright (C) 2014 Wikimedia Foundation | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.wikimedia.analytics.refinery.hive; | ||
|
||
import org.apache.hadoop.hive.ql.exec.UDF; | ||
import org.wikimedia.analytics.refinery.core.Webrequest; | ||
|
||
/** | ||
* A hive UDF to identify spiders, | ||
* which ua-parser misses (for obvious reasons) | ||
*/ | ||
public class IsSpiderUDF extends UDF { | ||
public boolean evaluate( | ||
String user_agent | ||
) { | ||
Webrequest webrequest_inst = Webrequest.getInstance(); | ||
return webrequest_inst.isSpider( | ||
user_agent | ||
); | ||
} | ||
} |
34 changes: 34 additions & 0 deletions
34
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsWikimediaBotUDF.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
/** | ||
* Copyright (C) 2014 Wikimedia Foundation | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.wikimedia.analytics.refinery.hive; | ||
|
||
import org.apache.hadoop.hive.ql.exec.UDF; | ||
import org.wikimedia.analytics.refinery.core.Webrequest; | ||
|
||
/** | ||
* A hive UDF to identify WikimediaBot. | ||
*/ | ||
public class IsWikimediaBotUDF extends UDF { | ||
public boolean evaluate( | ||
String user_agent | ||
) { | ||
Webrequest webrequest_inst = Webrequest.getInstance(); | ||
return webrequest_inst.isWikimediaBot( | ||
user_agent | ||
); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
50 changes: 50 additions & 0 deletions
50
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsSpiderUDF.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
/** | ||
* Copyright (C) 2014 Wikimedia Foundation | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.wikimedia.analytics.refinery.hive; | ||
|
||
import junitparams.FileParameters; | ||
import junitparams.JUnitParamsRunner; | ||
import junitparams.mappers.CsvWithHeaderMapper; | ||
import org.junit.Test; | ||
import org.junit.runner.RunWith; | ||
|
||
import static org.junit.Assert.assertEquals; | ||
|
||
@RunWith(JUnitParamsRunner.class) | ||
public class TestIsSpiderUDF { | ||
|
||
@Test | ||
@FileParameters( | ||
value = "../refinery-core/src/test/resources/isSpider_test_data.csv", | ||
mapper = CsvWithHeaderMapper.class | ||
) | ||
public void testIsCrawler( | ||
String test_description, | ||
boolean isSpider, | ||
boolean isWikimediaBot, | ||
String user_agent | ||
) { | ||
IsSpiderUDF udf = new IsSpiderUDF(); | ||
|
||
assertEquals( | ||
test_description, | ||
isSpider, | ||
udf.evaluate( | ||
user_agent | ||
) | ||
); | ||
} | ||
} |
Oops, something went wrong.