Skip to content

Commit

Permalink
Patch pageview definition
Browse files Browse the repository at this point in the history
Modify maven pom.xml files not to build uber jars and to take advantage of properties for versions.
Include xx.mobile.xxx.org and xx.wap.xxx.org.
Update referer classification to output a string instead of a map.
Add getProject function and UDF to identify pageview requests.
Correct little bugs in test.

Change-Id: Id3b14d954d1396a8e8667d6865a854ad1167d830
  • Loading branch information
jobar committed Apr 24, 2015
1 parent cd88e45 commit cc0b6ed
Show file tree
Hide file tree
Showing 23 changed files with 359 additions and 290 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ release.properties
*.iml
*.ipr
*.iws
out/
9 changes: 8 additions & 1 deletion changelog.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
## v0.0.9-SNAPSHOT
## v0.0.10-SNAPSHOT
* Maven now builds non-uber jars by having hadoop and hive in provided scope.
It also takes advantage of properties to propagate version numbers.
* The PageviewDefinition class now has a function to extract the project from a URI host.
Bugs in the handling of mobile URIs have been fixed.
* Referer classification now outputs a string instead of a map.

## v0.0.9
* Generic functions used in multiple classes now live in a single "utilities" class.
* Pageview and LegacyPageview have been renamed to PageviewDefinition and
LegacyPageviewDefinition, respectively. These also should now use the
Expand Down
50 changes: 7 additions & 43 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -121,24 +121,6 @@
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.5.0-cdh5.3.1</version>
</dependency>

<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.5.0-cdh5.3.1</version>
</dependency>

<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>0.13.1-cdh5.3.1</version>
</dependency>

<dependency>
<groupId>ua_parser</groupId>
<artifactId>ua-parser</artifactId>
Expand Down Expand Up @@ -175,32 +157,10 @@
<version>2.0.29</version>
</dependency>


<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.10.0</version>
</dependency>

<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.10</artifactId>
<version>1.2.0-cdh5.3.1</version>
<scope>provided</scope>
</dependency>

<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.10</artifactId>
<version>1.2.0-cdh5.3.1</version>
<scope>provided</scope>
</dependency>

<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.10</artifactId>
<version>1.2.0-cdh5.3.1</version>
<scope>provided</scope>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.5.2</version>
</dependency>

</dependencies>
Expand Down Expand Up @@ -310,6 +270,10 @@
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<skip.tests>false</skip.tests>
<java.version>1.7</java.version>
<hadoop.version>2.5.0-cdh5.3.1</hadoop.version>
<hive.version>0.13.1-cdh5.3.1</hive.version>
<scala.version>2.10.4</scala.version>
<spark.version>1.2.0-cdh5.3.1</spark.version>
</properties>

</project>
16 changes: 12 additions & 4 deletions refinery-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,18 @@
<packaging>jar</packaging>

<dependencies>
<dependency>
<!--<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
</dependency>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
<scope>provided</scope>
</dependency>-->

<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
<scope>provided</scope>
</dependency>

<dependency>
Expand Down Expand Up @@ -66,6 +70,10 @@
<artifactId>json-simple</artifactId>
</dependency>

<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
</dependencies>

<build>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,19 +48,20 @@ public static PageviewDefinition getInstance(){
* Now back to the good part.
*/
private final Pattern uriHostWikimediaDomainPattern = Pattern.compile(
"(commons|meta|incubator|species)\\." // any of these domain names
+ "((m|mobile|wap|zero)\\.)?" // followed by an optional mobile or zero qualifier
+ "wikimedia\\.org$" // ending with wikimedia.org
"(commons|meta|incubator|species|outreach)\\." // any of these domain names
+ "((m|mobile|wap|zero)\\.)?" // followed by an optional mobile or zero qualifier
+ "wikimedia\\.org$" // ending with wikimedia.org
);

private final Pattern uriHostProjectDomainPattern = Pattern.compile(
"(?<!www)\\." // not starting with "www"
"(?<!(www\\.|test))" // not starting with "www." or "test"
+ "(wik(ibooks|" // match project domains ending in .org
+ "inews|ipedia|iquote|isource|tionary|iversity|ivoyage))\\.org$"
);

private final Pattern uriHostOtherProjectsPattern = Pattern.compile(
"(wikidata|mediawiki)\\.org$"
"(?<!test)" // not starting with "test"
+ "(wikidata|mediawiki|wikimediafoundation)\\.org$" // match project domains ending in .org
);

private final Pattern uriPathPattern = Pattern.compile(
Expand Down Expand Up @@ -96,6 +97,15 @@ public static PageviewDefinition getInstance(){
"304"
));

/**
 * Host name labels that carry no project information (mobile/zero
 * qualifiers, "www" and "download").
 * getProjectFromHost checks the first label of a host against this set
 * and, when it matches, leaves that label out of the project identifier.
 */
private final HashSet<String> uriPortionsToRemove = new HashSet<String>(Arrays.asList(
"m",
"mobile",
"wap",
"zero",
"www",
"download"
));

/**
* All API request uriPaths will contain this
*/
Expand Down Expand Up @@ -193,4 +203,41 @@ public boolean isPageview(
&& !Utilities.patternIsFound(uriQueryUnwantedActions, uriQuery)
);
}
}

/**
 * Identifies a project from a pageview uriHost.
 * <p>
 * The host is lowercased and split on dots. The project name is the label
 * right before the top-level domain. The first label is kept as a prefix
 * unless the host has only two labels or the first label is listed in
 * {@code uriPortionsToRemove}. Labels in between (for instance a mobile
 * qualifier) are dropped. For example {@code en.wikipedia.org} and
 * {@code en.m.wikipedia.org} both give {@code en.wikipedia}, while
 * {@code www.wikidata.org} gives {@code wikidata}.
 * <p>
 * NOTE: Provides correct result only if used with is_pageview = true;
 * the host is not validated against the pageview host patterns here.
 *
 * @param uriHost The url's host
 * @return The project identifier in format [xxx.]xxxx (en.wikipedia or wikisource for instance),
 *         or "-" when uriHost is null or does not have between 2 and 5 dot-separated labels
 */
public String getProjectFromHost(String uriHost) {
    if (uriHost == null) {
        return "-";
    }

    // Locale.ROOT keeps the lowercasing independent of the JVM default locale
    // (under a Turkish locale "I" would otherwise become a dotless "ı").
    String[] uriParts = uriHost.toLowerCase(java.util.Locale.ROOT).split("\\.");

    // Supported shapes: wikixxx.org up to xx.[qualifier].[qualifier].wikixxx.org
    if (uriParts.length < 2 || uriParts.length > 5) {
        return "-";
    }

    // The project name always sits right before the top-level domain.
    String project = uriParts[uriParts.length - 2];

    // Bare wikixxx.org, or a first label that should be removed: no prefix.
    if (uriParts.length == 2 || uriPortionsToRemove.contains(uriParts[0])) {
        return project;
    }

    return uriParts[0] + "." + project;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@ public static Webrequest getInstance(){
return instance;
}

/*
 * Constant string results for referer classification,
 * as returned by classifyReferer.
 */
// Returned for a null, empty or "-" referer, and for one that does not
// parse as an http(s) URL with a dotted host.
public static final String REFERER_UNKNOWN = "unknown";
// Returned when the referer host is a known Wikimedia project domain under .org.
public static final String REFERER_INTERNAL = "internal";
// Returned for every other well-formed referer.
public static final String REFERER_EXTERNAL = "external";

/*
* Now back to the good part.
* Wikimedia-specific crawlers
Expand All @@ -54,7 +61,7 @@ public static Webrequest getInstance(){
* or some similar portal-based interface to MW.
*/
private static final Pattern uriHostPattern = Pattern.compile(
"\\.(m|zero)\\."
"(^(m|zero|wap|mobile)\\.)|(\\.(m|zero|wap|mobile)\\.)"
);

/**
Expand Down Expand Up @@ -130,55 +137,40 @@ public String getAccessMethod(String uriHost, String userAgent) {
return accessMethod;
}

/**
 * Classification for referers
 * <p>
 * <ul>
 * <li>A referer from a WMF domain translates into "internal".</li>
 * <li>A referer from a non-WMF domain translates into "external".</li>
 * <li>An empty or invalid referer translates into "unknown".</li>
 * </ul>
 */
public enum RefererClassification {
UNKNOWN,
INTERNAL,
EXTERNAL
}

/**
* Classifies a referer
*
* @param url The referer url to classify
* @return RefererClassification
*/
public static RefererClassification classify(String url) {
public String classifyReferer(String url) {
if (url == null || url.isEmpty() || url.equals("-")) {
return RefererClassification.UNKNOWN;
return REFERER_UNKNOWN;
}

String[] urlParts = StringUtils.splitPreserveAllTokens(url, '/');
if (urlParts == null || urlParts.length <3) {
return RefererClassification.UNKNOWN;
return REFERER_UNKNOWN;
}

if (!urlParts[0].equals("http:") && !urlParts[0].equals("https:")) {
return RefererClassification.UNKNOWN;
return REFERER_UNKNOWN;
}

if (!urlParts[1].isEmpty()) {
return RefererClassification.UNKNOWN;
return REFERER_UNKNOWN;
}

String[] domainParts = StringUtils.splitPreserveAllTokens(urlParts[2], '.');

if (domainParts == null || domainParts.length <2) {
return RefererClassification.UNKNOWN;
return REFERER_UNKNOWN;
}

if (domainParts[domainParts.length-1].equals("org")) {
switch (domainParts[domainParts.length-2]) {
case "":
return RefererClassification.UNKNOWN;
return REFERER_UNKNOWN;
case "mediawiki":
case "wikibooks":
case "wikidata":
Expand All @@ -191,9 +183,12 @@ public static RefererClassification classify(String url) {
case "wikiversity":
case "wikivoyage":
case "wiktionary":
return RefererClassification.INTERNAL;
return REFERER_INTERNAL;
}
}
return RefererClassification.EXTERNAL;
return REFERER_EXTERNAL;
}



}
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ public class TestLegacyPageviewDefinition {
)
public void testIsLegacyPageview(
String test_description,
String project,
boolean is_pageview,
boolean is_legacy_pageview,
boolean is_app_pageview,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ public class TestPageview {
)
public void testIsPageview(
String test_description,
String project,
boolean is_pageview,
boolean is_legacy_pageview,
boolean is_app_pageview,
Expand Down Expand Up @@ -66,6 +67,7 @@ public void testIsPageview(
)
public void testIsAppPageview(
String test_description,
String project,
boolean is_pageview,
boolean is_legacy_pageview,
boolean is_app_pageview,
Expand All @@ -83,11 +85,39 @@ public void testIsAppPageview(
test_description,
is_app_pageview,
PageviewDefinitionInstance.isAppPageview(
uri_path,
uri_query,
content_type,
user_agent
uri_path,
uri_query,
content_type,
user_agent
)
);
}

/**
 * Checks, for every row of the pageview test data file, that the project
 * extracted from the row's uri_host equals the row's expected project.
 * Only test_description, project and uri_host are used; the remaining
 * parameters are declared to match the columns of the data file.
 */
@Test
@FileParameters(
    value = "src/test/resources/pageview_test_data.csv",
    mapper = CsvWithHeaderMapper.class
)
public void testGetProjectFromHost(
    String test_description,
    String project,
    boolean is_pageview,
    boolean is_legacy_pageview,
    boolean is_app_pageview,
    String ip_address,
    String x_forwarded_for,
    String uri_host,
    String uri_path,
    String uri_query,
    String http_status,
    String content_type,
    String user_agent
) {
    String actualProject = PageviewDefinition.getInstance().getProjectFromHost(uri_host);
    assertEquals(test_description, project, actualProject);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ public void testMatchingOfMostPopularUA(String uaString, String jsonMapResult) t


// decode expected output and turn it into an object
System.out.println(jsonMapResult);
Object obj = jsonParser.parse(jsonMapResult);
JSONObject expected_ua = (JSONObject) obj;

Expand Down
Loading

0 comments on commit cc0b6ed

Please sign in to comment.