Skip to content

Commit

Permalink
Update bot filtering for webrequests.
Browse files Browse the repository at this point in the history
Rename is_crawler to isSpider to be more coherent with data tagging.
Update spiders matching function with better regexp and WikimediaBot removal.
Add a function matching WikimediaBot and associated UDF.
Update and add tests.

Bug: T108598
Change-Id: I3b468050b613c1e97d87b782cbfd90c9fdc433b8
  • Loading branch information
jobar authored and ottomata committed Sep 11, 2015
1 parent 346bd5d commit 41d57df
Show file tree
Hide file tree
Showing 10 changed files with 323 additions and 22 deletions.
Expand Up @@ -49,12 +49,23 @@ public static Webrequest getInstance(){
public static final String REFERER_EXTERNAL = "external";

/*
* Now back to the good part.
* Wikimedia-specific crawlers
* Spiders identification pattern (obviously not perfect...)
* to be used in addition to ua-parser device_family field
* being identified as Spider.
*/
// Names of Wikimedia-specific crawlers. The pattern is unanchored, so when it is
// applied with Matcher.find() it matches these names anywhere in the user agent.
private static final Pattern crawlerPattern = Pattern.compile(
"(goo wikipedia|MediaWikiCrawler-Google|wikiwix-bot).*"
);
// Case-insensitive ((?i)) pattern anchored with ^...$, made of two alternatives:
//   1. ".*(bot|spider|WordPress|...).*"      - the token may appear anywhere in the user agent;
//   2. "(goo wikipedia|Java/|curl|...).*"    - the token must be at the very start of the user agent.
// Because (?i) is set, the [Ff] and [Yy] character classes are redundant but harmless.
private static final Pattern spiderPattern = Pattern.compile("(?i)^(" +
".*(bot|spider|WordPress|AppEngine|AppleDictionaryService|Python-urllib|python-requests|" +
"Google-HTTP-Java-Client|[Ff]acebook|[Yy]ahoo|RockPeaks).*" +
"|(goo wikipedia|MediaWikiCrawler-Google|wikiwix-bot|Java/|curl|PHP/|Faraday|HTTPC|Ruby|\\.NET|" +
"Python|Apache|Scrapy|PycURL|libwww|Zend|wget|nodemw|WinHttpRaw|Twisted|com\\.eusoft|Lagotto|" +
"Peggo|Recuweb|check_http|Magnus|MLD|Jakarta|find-link|J\\. River|projectplan9|ADmantX|" +
"httpunit|LWP|iNaturalist|WikiDemo|FSResearchIt|livedoor|Microsoft Monitoring|MediaWiki).*" +
")$");

/*
* WikimediaBot identification pattern: the literal token "WikimediaBot"
* delimited by word boundaries. No case-insensitive flag is set, so the
* match is case-sensitive.
*/
private static final Pattern wikimediaBotPattern = Pattern.compile("\\bWikimediaBot\\b");

/**
* Pattern for automatically-added subdomains that indicate zero,
Expand All @@ -74,13 +85,34 @@ public static Webrequest getInstance(){
);

/**
 * Tells whether a user agent should be counted as a spider.
 * <p>
 * A user agent that is exactly {@code "-"} is always reported as a spider.
 * Any other user agent is a spider when it matches the spider pattern and
 * does not follow the WikimediaBot convention.
 *
 * @param userAgent the user agent associated with the request; a null value
 *                  is not handled here and fails inside the regex matcher
 * @return true if the user agent is considered a spider, false otherwise
 */
public boolean isSpider(String userAgent) {
    // A bare dash is flagged without consulting the patterns.
    if ("-".equals(userAgent)) {
        return true;
    }
    // Agents following the WikimediaBot convention are never spiders, even
    // though the "bot" token of the spider pattern would match them.
    if (wikimediaBotPattern.matcher(userAgent).find()) {
        return false;
    }
    return spiderPattern.matcher(userAgent).find();
}
/**
 * Kept for backward compatibility; simply delegates to
 * {@link #isSpider(String)}.
 *
 * @param userAgent the user agent associated with the request.
 * @return the value returned by {@link #isSpider(String)} for this user agent
 * @deprecated use {@link #isSpider(String)} instead.
 */
@Deprecated
public boolean isCrawler(String userAgent) {
    // Single return: a second statement after a return is unreachable and
    // does not compile in Java.
    return isSpider(userAgent);
}

/**
 * Tells whether a user agent follows the WikimediaBot convention, i.e.
 * contains a match for the WikimediaBot pattern.
 *
 * @param userAgent the user agent associated with the request; a null value
 *                  is not handled here and fails inside the regex matcher
 * @return true if the WikimediaBot pattern is found in the user agent
 */
public boolean isWikimediaBot(String userAgent) {
    final boolean followsConvention = wikimediaBotPattern.matcher(userAgent).find();
    return followsConvention;
}

/**
Expand Down
Expand Up @@ -13,27 +13,73 @@
@RunWith(JUnitParamsRunner.class)
public class TestWebrequest {

/**
 * Checks the deprecated isCrawler entry point against the spider test data,
 * comparing its result with the expected spider flag of each row.
 */
@Deprecated
@Test
@FileParameters(
    value = "src/test/resources/isSpider_test_data.csv",
    mapper = CsvWithHeaderMapper.class
)
public void testIsCrawler(
    String test_description,
    boolean is_crawler,
    // Unused here; presumably required so the parameters line up with the CSV columns.
    boolean is_WikimediaBot,
    String user_agent
) {
    Webrequest webrequest_inst = Webrequest.getInstance();
    assertEquals(
        test_description,
        is_crawler,
        webrequest_inst.isCrawler(
            user_agent
        )
    );
}

@Test
@FileParameters(
value = "src/test/resources/isSpider_test_data.csv",
mapper = CsvWithHeaderMapper.class
)

public void testisCrawler(
public void testIsSpider(
String test_description,
boolean is_crawler,
boolean isSpider,
boolean isWikimediaBot,
String user_agent
) {
Webrequest webrequest_inst = Webrequest.getInstance();
assertEquals(
test_description,
is_crawler,
webrequest_inst.isCrawler(
isSpider,
webrequest_inst.isSpider(
user_agent
)
);
}

/**
 * Checks Webrequest.isWikimediaBot against the expected WikimediaBot flag
 * of each row of the spider test data.
 */
@Test
@FileParameters(
    value = "src/test/resources/isSpider_test_data.csv",
    mapper = CsvWithHeaderMapper.class
)
public void testIsWikimediabot(
    String test_description,
    boolean isSpider,
    boolean isWikimediaBot,
    String user_agent
) {
    // The isSpider column is not consulted by this test.
    boolean actual = Webrequest.getInstance().isWikimediaBot(user_agent);
    assertEquals(test_description, isWikimediaBot, actual);
}

@Test
@FileParameters(
value = "src/test/resources/x_analytics_test_data.csv",
Expand Down
7 changes: 0 additions & 7 deletions refinery-core/src/test/resources/isCrawler_test_data.csv

This file was deleted.

58 changes: 58 additions & 0 deletions refinery-core/src/test/resources/isSpider_test_data.csv
@@ -0,0 +1,58 @@
test_description, isSpider, isWikimediaBot, user_agent
is spider - Google, true,false,MediaWikiCrawler-Google/2.0 (+wikidata-external@google.com)
is spider – goo.ne.jp, true,false,goo wikipedia (http://help.goo.ne.jp/contact/)
is spider - bing bot, true, false,Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)
is spider - dash, true, false,-
is spider - google bot, true, false,Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
is spider - yahoo bot, true, false,Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)
is spider - peachy bot, true, false,Peachy MediaWiki Bot API Version 2.0 (alpha 8)
is spider - google bot safari, true, false,Mozilla/5.0 (iPhone; CPU iPhone OS 8_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML; like Gecko) Version/8.0 Mobile/12F70 Safari/600.1.4 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
is spider - baidu bot, true, false,Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)
is spider - yandex bot, true, false,Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)
is spider - wikiwix bot, true, false,wikiwix-bot-3.0
is spider - java 8 unknown bot, true, false,Java/1.8.0_51
is spider - bing bot safari, true, false,Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML; like Gecko) Version/7.0 Mobile/11A465 Safari/9537.53 (compatible; bingbot/2.0; http://www.bing.com/bingbot.htm)
is spider - apple dictionary bot, true, false,AppleDictionaryService/229
is spider - php wikibot, true, false,php wikibot classes
is spider - MS Search bot, true, false,Mozilla/4.0 (compatible; MSIE 4.01; Windows NT; MS Search 6.0 Robot)
is spider - Python unknown bot, true, false,python-requests/2.7.0 CPython/3.4.2 Linux/3.16.0-4-amd64
is spider - searchmetrics bot, true, false,Mozilla/5.0 (compatible; SearchmetricsBot; http://www.searchmetrics.com/en/searchmetrics-bot/)
is spider - facebook external hit, true, false,facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)
is spider - apple dictionary bot, true, false,AppleDictionaryService/229.1
is spider - cliqzbot, true, false,Mozilla/5.0 (compatible; Cliqzbot/1.0 +http://cliqz.com/company/cliqzbot)
is spider - apple bot, true, false,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/600.2.5 (KHTML; like Gecko) Version/8.0.2 Safari/600.2.5 (Applebot/0.1; +http://www.apple.com/go/applebot)
is spider - java 8 unknown bot, true, false,Java/1.8.0_25
is spider - DotNetWikiBot, true, false,DotNetWikiBot/2.101 (Microsoft Windows NT 6.2.9200.0; .NET CLR 4.0.30319.34209)
is spider - Pywikibot, true, false,wymowa (commons:commons; User:Alkamid) Pywikibot/2.0b3 (g3) requests/2.7.0 Python/3.4.0.final.0
is spider - msn media bot, true, false,msnbot-media/1.1 (+http://search.msn.com/msnbot.htm)
is spider - youdaobot, true, false,Mozilla/5.0 (compatible; YoudaoBot/1.0; http://www.youdao.com/help/webmaster/spider/; )
is spider - java 8 unknown bot, true, false,Java/1.8.0_40
is spider - java 6 unknown bot, true, false,Java/1.6.0_20
is spider - java 8 unknown bot, true, false,Java/1.8.0_45
is spider - Python unknown bot, true, false,Python-urllib/2.7
is spider - java 7 unknown bot, true, false,Java/1.7.0_67
is spider - mail.ru_bot, true, false,Mozilla/5.0 (compatible; Linux x86_64; Mail.RU_Bot/Img/2.0; +http://go.mail.ru/help/robots)
is spider - java 7 unknown bot, true, false,Java/1.7.0_79
is spider - RBot, true, false,RBot/0.3 (underdog@wolfhome.com)
is spider - Pywikipediabot, true, false,pywikipedia-git-wdlabel.py/r581 Pywikipediabot/1.0
is spider - mail.ru_bot, true, false,Mozilla/5.0 (compatible; Linux x86_64; Mail.RU_Bot/2.0; +http://go.mail.ru/help/robots)
is spider - sogou bot, true, false,Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)
is spider - java 7 unknown bot, true, false,Java/1.7.0_65
is spider - taxonbot, true, false,TaxonBot@de.wikipedia <animalia@gmx.net> – MediaWiki Tcl Bot Framework 0.5
is spider - apple dictionary bot, true, false,AppleDictionaryService/208
is spider - ClueBot, true, false,ClueBot/1.1
is spider - Unknown bot, true, false,Mozilla/5.0 (MyMemory Bot http://mymemory.traslated.net/doc/)
is spider - baidu image bot, true, false,Baiduspider-image+(+http://www.baidu.com/search/spider.htm)
is spider - Pywikipediabot, true, false,pywikipedia-addzumra.py/rg11224 Pywikipediabot/1.0 Unknown
is spider - yeti bot, true, false,Mozilla/5.0 (compatible; Yeti/1.1; +http://help.naver.com/robots/)
is spider - Pywikipediabot, true, false,pwb/rg3113 Pywikipediabot/2.0
is spider - exabot, true, false,Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)
is spider - Python unknown bot, true, false,Python-urllib/2.6
is spider - yacybot, true, false,yacybot (/global; amd64 Linux 3.13.0-63-generic; java 1.7.0_80; Europe/en) http://yacy.net/bot.html
is spider - Pywikibot, true, false,maj_articles_recents (wikipedia:fr; User:Z%C3%A9roBot) Pywikibot/2.0b3 (g4795) httplib2/0.7.2 Python/2.7.3.final.0
is spider - Pywikibot, true, false,CategorieAutoriPer (wikisource:it; User:CandalBot) Pywikibot/2.0b3 (g5671) requests/2.0.0 Python/2.7.3.final.0
is spider - curl bot, true, false,curl/7.35.0
Is Not spider - firefox, false,false,Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko
Is Not spider - iphone, false,false,Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53
Is Not spider - opera, false,false,Opera/9.80 (Android; Opera Mini/7.6.35843/35.5858; U; en) Presto/2.8.119 Version/11.10
Is Not spider - WikimediaBot, false,true,Whatever UA info containing WikimediaBot should match.
Expand Up @@ -23,13 +23,14 @@
* A hive UDF to identify Wikimedia-specific crawlers,
* which ua-parser misses (for obvious reasons)
*/
@Deprecated
public class IsCrawlerUDF extends UDF {
public boolean evaluate(
String user_agent
) {
Webrequest webrequest_inst = Webrequest.getInstance();
return webrequest_inst.isCrawler(
user_agent
user_agent
);
}
}
@@ -0,0 +1,35 @@
/**
* Copyright (C) 2014 Wikimedia Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.wikimedia.analytics.refinery.hive;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.wikimedia.analytics.refinery.core.Webrequest;

/**
 * A Hive UDF that classifies a user agent as spider or not by delegating
 * to Webrequest.isSpider.
 */
public class IsSpiderUDF extends UDF {
    /**
     * @param user_agent the user agent string to classify
     * @return the value returned by Webrequest.isSpider for this user agent
     */
    public boolean evaluate(String user_agent) {
        return Webrequest.getInstance().isSpider(user_agent);
    }
}
@@ -0,0 +1,34 @@
/**
* Copyright (C) 2014 Wikimedia Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.wikimedia.analytics.refinery.hive;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.wikimedia.analytics.refinery.core.Webrequest;

/**
 * A Hive UDF that tells whether a user agent follows the WikimediaBot
 * convention by delegating to Webrequest.isWikimediaBot.
 */
public class IsWikimediaBotUDF extends UDF {
    /**
     * @param user_agent the user agent string to check
     * @return the value returned by Webrequest.isWikimediaBot for this user agent
     */
    public boolean evaluate(String user_agent) {
        return Webrequest.getInstance().isWikimediaBot(user_agent);
    }
}
Expand Up @@ -23,17 +23,19 @@
import junitparams.JUnitParamsRunner;
import junitparams.mappers.CsvWithHeaderMapper;

@Deprecated
@RunWith(JUnitParamsRunner.class)
public class TestIsCrawlerUDF {

@Test
@FileParameters(
value = "../refinery-core/src/test/resources/isCrawler_test_data.csv",
value = "../refinery-core/src/test/resources/isSpider_test_data.csv",
mapper = CsvWithHeaderMapper.class
)
public void testIsCrawler(
String test_description,
boolean is_crawler,
boolean is_wikimediaBot,
String user_agent
) {
IsCrawlerUDF udf = new IsCrawlerUDF();
Expand Down
@@ -0,0 +1,50 @@
/**
* Copyright (C) 2014 Wikimedia Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.wikimedia.analytics.refinery.hive;

import junitparams.FileParameters;
import junitparams.JUnitParamsRunner;
import junitparams.mappers.CsvWithHeaderMapper;
import org.junit.Test;
import org.junit.runner.RunWith;

import static org.junit.Assert.assertEquals;

/**
 * Parameterized test for IsSpiderUDF, driven by the spider test data CSV
 * of refinery-core.
 */
@RunWith(JUnitParamsRunner.class)
public class TestIsSpiderUDF {

    /**
     * Checks that the UDF returns the expected spider flag for each row.
     * Named after the UDF under test so test reports are not misleading.
     */
    @Test
    @FileParameters(
        value = "../refinery-core/src/test/resources/isSpider_test_data.csv",
        mapper = CsvWithHeaderMapper.class
    )
    public void testIsSpider(
        String test_description,
        boolean isSpider,
        // Unused here; presumably required so the parameters line up with the CSV columns.
        boolean isWikimediaBot,
        String user_agent
    ) {
        IsSpiderUDF udf = new IsSpiderUDF();

        assertEquals(
            test_description,
            isSpider,
            udf.evaluate(
                user_agent
            )
        );
    }
}

0 comments on commit 41d57df

Please sign in to comment.