In [83]:
import pandas as pd
from wmfdata import hive
from wmfdata.charting import set_mpl_style
from wmfdata.utils import pd_display_all

In [84]:
set_mpl_style()

# Extracting KaiOS data from webrequest

In [74]:
hive.run("""
CREATE EXTERNAL TABLE `neilpquinn.kaios_wp_webrequest`(
  `hostname` string COMMENT 'Source node hostname', 
  `sequence` bigint COMMENT 'Per host sequence number', 
  `dt` string COMMENT 'Timestame at cache in ISO 8601', 
  `time_firstbyte` double COMMENT 'Time to first byte', 
  `ip` string COMMENT 'IP of packet at cache', 
  `cache_status` string COMMENT 'Cache status', 
  `http_status` string COMMENT 'HTTP status of response', 
  `response_size` bigint COMMENT 'Response size', 
  `http_method` string COMMENT 'HTTP method of request', 
  `uri_host` string COMMENT 'Host of request', 
  `uri_path` string COMMENT 'Path of request', 
  `uri_query` string COMMENT 'Query of request', 
  `content_type` string COMMENT 'Content-Type header of response', 
  `referer` string COMMENT 'Referer header of request', 
  `x_forwarded_for` string COMMENT 'X-Forwarded-For header of request', 
  `user_agent` string COMMENT 'User-Agent header of request', 
  `accept_language` string COMMENT 'Accept-Language header of request', 
  `x_analytics` string COMMENT 'X-Analytics header of response', 
  `range` string COMMENT 'Range header of response', 
  `is_pageview` boolean COMMENT 'Indicates if this record was marked as a pageview during refinement', 
  `record_version` string COMMENT 'Keeps track of changes in the table content definition - https://wikitech.wikimedia.org/wiki/Analytics/Data/Webrequest', 
  `client_ip` string COMMENT 'Client IP computed during refinement using ip and x_forwarded_for', 
  `geocoded_data` map<string,string> COMMENT 'Geocoded map with continent, country_code, country, city, subdivision, postal_code, latitude, longitude, timezone keys and associated values', 
  `x_cache` string COMMENT 'X-Cache header of response', 
  `user_agent_map` map<string,string> COMMENT 'User-agent map with browser_family, browser_major, device_family, os_family, os_major, os_minor and wmf_app_version keys and associated values', 
  `x_analytics_map` map<string,string> COMMENT 'X_analytics map view of the x_analytics field', 
  `ts` timestamp COMMENT 'Unix timestamp in milliseconds extracted from dt', 
  `access_method` string COMMENT 'Method used to access the site (mobile app|mobile web|desktop)', 
  `agent_type` string COMMENT 'Categorise the agent making the webrequest as either user or spider (automatas to be added).', 
  `is_zero` boolean COMMENT 'Indicates if the webrequest is accessed through a zero provider', 
  `referer_class` string COMMENT 'Indicates if a referer is internal, external or unknown.', 
  `normalized_host` struct<project_class:string,project:string,qualifiers:array<string>,tld:string,project_family:string> COMMENT 'struct containing project_family (such as wikipedia or wikidata for instance), project (such as en or commons), qualifiers (a list of in-between values, such as m and/or zero) and tld (org most often)', 
  `pageview_info` map<string,string> COMMENT 'map containing project, language_variant and page_title values only when is_pageview = TRUE.', 
  `page_id` int COMMENT 'MediaWiki page_id for this page title. For redirects this could be the page_id of the redirect or the page_id of the target. This may not always be set, even if the page is actually a pageview.', 
  `namespace_id` int COMMENT 'MediaWiki namespace_id for this page title. This may not always be set, even if the page is actually a pageview.', 
  `tags` array<string> COMMENT 'List containing tags qualifying the request, ex: [portal, wikidata]. Will be used to split webrequest into smaller subsets.', 
  `isp_data` map<string,string> COMMENT 'Internet Service Provider data in a map with keys isp, organization, autonomous_system_organization and autonomous_system_number', 
  `accept` string COMMENT 'Accept header of request', 
  `tls` string COMMENT 'TLS information of request', 
  `tls_map` map<string,string> COMMENT 'Map view of TLS information (keys are vers, keyx, auth and ciph)',
  `webrequest_source` string COMMENT 'Source cluster' 
)
PARTITIONED BY ( 
  `year` int COMMENT 'Unpadded year of request', 
  `month` int COMMENT 'Unpadded month of request', 
  `day` int COMMENT 'Unpadded day of request' 
)
ROW FORMAT SERDE 
  'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' 
STORED AS INPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
""")

In [None]:
for day in range(1, 31):
  hive.run(
    """
    INSERT INTO TABLE neilpquinn.kaios_wp_webrequest
    PARTITION (year = 2019, month = 10, day = {day})
    SELECT
      hostname,
      sequence,
      dt,
      time_firstbyte,
      ip,
      cache_status,
      http_status,
      response_size,
      http_method,
      uri_host,
      uri_path,
      uri_query,
      content_type,
      referer,
      x_forwarded_for,
      user_agent,
      accept_language,
      x_analytics,
      range,
      is_pageview,
      record_version,
      client_ip,
      geocoded_data,
      x_cache,
      user_agent_map,
      x_analytics_map,
      ts,
      access_method,
      agent_type,
      is_zero,
      referer_class,
      normalized_host,
      pageview_info,
      page_id,
      namespace_id,
      tags,
      isp_data,
      accept,
      tls,
      tls_map,
      webrequest_source
    FROM wmf.webrequest
    WHERE
      LOWER(user_agent) LIKE "%kaios%" AND
      normalized_host.project_family = "wikipedia" AND
      year = 2019 AND month = 10 AND day = {day}
    """.format(day=day)
  )

# Geographic location

In [99]:
# Adapted from https://github.com/wikimedia/analytics-refinery/blob/master/oozie/unique_devices/per_project_family/monthly/unique_devices_per_project_family_monthly.hql
unique_devices_by_country = hive.run([
"ADD JAR hdfs:///wmf/refinery/current/artifacts/refinery-hive.jar",
"""
CREATE TEMPORARY FUNCTION is_redirect_to_pageview
AS 'org.wikimedia.analytics.refinery.hive.IsRedirectToPageviewUDF'
""",
"""
WITH last_access_dates AS (
  SELECT
    geocoded_data['country'] AS country,
    unix_timestamp(x_analytics_map['WMF-Last-Access-Global'], 'dd-MMM-yyyy') AS last_access_global,
    x_analytics_map['nocookies'] AS nocookies,
    ip,
    user_agent,
    accept_language
  FROM neilpquinn.kaios_wp_webrequest
  WHERE
    x_analytics_map IS NOT NULL AND
    agent_type = 'user' AND
    (
      is_pageview OR
      is_redirect_to_pageview(uri_host, uri_path, uri_query, http_status, content_type, user_agent, x_analytics)
    ) AND
    webrequest_source = 'text' AND
     -- mandatory partition predicate
    year > 0
),
fresh_sessions_aggregated AS (
  SELECT
    country,
    COUNT(1) AS uniques_offset
  FROM (
    SELECT
      HASH(ip, user_agent, accept_language) AS id,
      country,
      SUM(CASE WHEN (nocookies IS NOT NULL) THEN 1 ELSE 0 END)
    FROM
      last_access_dates
    GROUP BY
      hash(ip, user_agent, accept_language),
      country
    -- Only keeping clients having done 1 event without cookies
    HAVING SUM(CASE WHEN (nocookies IS NOT NULL) THEN 1 ELSE 0 END) = 1
  ) fresh_sessions
  GROUP BY country
)
SELECT
  COALESCE(la.country, fresh.country) AS country,
  SUM(
    CASE
      -- last-access-global not set and client accept cookies --> first visit, count
      WHEN (
        la.last_access_global IS NULL AND 
        la.nocookies is NULL
      ) THEN 1
      -- last-access-global set and its date is before month start --> First visit today, count
      WHEN (
        la.last_access_global IS NOT NULL AND
        la.last_access_global < unix_timestamp('2019-10-01', 'yyyy-MM-dd')
      ) THEN 1
      -- Other cases, don't count
      ELSE 0
    END
  ) + COALESCE(fresh.uniques_offset, 0) AS uniques_estimate
FROM last_access_dates AS la
FULL OUTER JOIN fresh_sessions_aggregated AS fresh
ON la.country = fresh.country
GROUP BY
  COALESCE(la.country, fresh.country),
  COALESCE(fresh.uniques_offset, 0)
"""])

In [106]:
unique_devices_by_country.sort_values("uniques_estimate", ascending=False).head(10)

Unnamed: 0,country,uniques_estimate
64,India,6775679
161,United States,14775
107,Nigeria,2289
27,Canada,1283
157,Uganda,1131
166,Vietnam,606
139,South Africa,576
126,Russia,560
160,United Kingdom,548
111,Pakistan,541


In [108]:
pageviews_by_country = hive.run("""
SELECT
  geocoded_data["country"] AS country,
  COUNT(1) as pageviews
FROM neilpquinn.kaios_wp_webrequest
WHERE
  agent_type = 'user' AND
  is_pageview AND
  webrequest_source = 'text' AND
  -- mandatory partition predicate
  year > 0
GROUP BY geocoded_data["country"]
""")

In [111]:
pageviews_by_country.sort_values("pageviews", ascending=False).head(10)

Unnamed: 0,country,pageviews
75,India,19442435
37,United States,137757
41,South Africa,125846
165,Nigeria,25880
71,Ireland,12148
13,Canada,11377
150,Uganda,10718
67,Rwanda,5457
93,Cameroon,4123
47,United Kingdom,3917


In [114]:
indian_pageviews_by_state = hive.run("""
SELECT
  geocoded_data["subdivision"] AS state,
  COUNT(1) as pageviews
FROM neilpquinn.kaios_wp_webrequest
WHERE
  geocoded_data["country"] = "India" AND
  agent_type = 'user' AND
  is_pageview AND
  webrequest_source = 'text' AND
  -- mandatory partition predicate
  year > 0
GROUP BY geocoded_data["subdivision"]
ORDER BY pageviews DESC
LIMIT 100000
""")

In [116]:
indian_pageviews_by_state.head(10)

Unnamed: 0,state,pageviews
0,Uttar Pradesh,2396848
1,Madhya Pradesh,2213992
2,Maharashtra,2176419
3,Bihar,1912137
4,West Bengal,1750605
5,Telangana,1355029
6,National Capital Territory of Delhi,1294277
7,Rajasthan,1229904
8,Karnataka,914367
9,Tamil Nadu,890136


In [125]:
overall_indian_pageviews = hive.run("""
SELECT
  subdivision AS state,
  SUM(view_count) AS total_pageviews
FROM wmf.pageview_hourly
WHERE
  year = 2019 AND month = 10 AND day < 31 AND
  access_method = "mobile web" AND
  country = "India"
GROUP BY subdivision
ORDER BY total_pageviews DESC
LIMIT 100000
""")

In [132]:
(
  indian_pageviews_by_state
  .merge(overall_indian_pageviews, on="state")
  .rename({"pageviews": "kaios_pageviews", "total_pageviews": "pageviews"}, axis=1)
  .assign(kaios_view_prop=lambda df: df["kaios_pageviews"] / df["pageviews"])
  .sort_values("state")
  [["state", "kaios_view_prop"]]
)

Unnamed: 0,state,kaios_view_prop
34,Andaman and Nicobar,0.000136
24,Andhra Pradesh,0.000521
31,Arunachal Pradesh,0.000326
12,Assam,0.035307
3,Bihar,0.075069
22,Chandigarh,0.00686
20,Chhattisgarh,0.024812
35,Dadra and Nagar Haveli,0.007692
33,Daman and Diu,0.000694
30,Goa,9.8e-05


# Wikipedia editions

In [None]:
hive.run("""
SELECT
  language_name as language,
  wiki_lang_part,
  pageviews
FROM (
  SELECT
    normalized_host.project as wiki_lang_part,
    COUNT(1) as pageviews
  FROM neilpquinn.kaios_wp_webrequest
  WHERE
    geocoded_data["country"] = "India" AND
    agent_type = 'user' AND
    is_pageview AND
    webrequest_source = 'text' AND
    -- mandatory partition predicate
    year > 0
  GROUP BY normalized_host.project 
) pv
LEFT JOIN canonical_data.wikis
ON domain_name = CONCAT(wiki_lang_part, ".wikipedia.org")
""")

In [255]:
_144[["language", "pageviews"]].sort_values("pageviews", ascending=False).head(10)

Unnamed: 0,language,pageviews
196,English,12527951
141,Hindi,5584717
38,Marathi,245482
277,Bangla,222975
264,Tamil,152967
209,Telugu,139351
72,Maithili,67463
197,Spanish,43769
189,Russian,34061
137,Kannada,33628


# Referrers

In [134]:
hive.run("""
SELECT
  referer_class,
  COUNT(1) as pageviews
FROM neilpquinn.kaios_wp_webrequest
WHERE
  geocoded_data["country"] = "India" AND
  agent_type = 'user' AND
  is_pageview AND
  webrequest_source = 'text' AND
  -- mandatory partition predicate
  year > 0
GROUP BY referer_class
""")

Unnamed: 0,referer_class,pageviews
0,internal,4213360
1,none,388196
2,external,200023
3,external (search engine),14640856


In [235]:
external_referrers = hive.run("""
SELECT
  PARSE_URL(referer, "HOST") AS referring_domain,
  COUNT(*) as pageviews
FROM neilpquinn.kaios_wp_webrequest
WHERE
  geocoded_data["country"] = "India" AND
  agent_type = 'user' AND
  is_pageview AND
  webrequest_source = 'text' AND
  referer_class LIKE "%external%" AND
  -- mandatory partition predicate
  year > 0
GROUP BY PARSE_URL(referer, "HOST")
HAVING pageviews > 1000
ORDER BY pageviews DESC
LIMIT 1000000
""")

In [236]:
external_referrers

Unnamed: 0,referring_domain,pageviews
0,www.google.com,14587390
1,googleweblight.com,186152
2,www.google.co.in,36562
3,www.bing.com,6426
4,r.search.yahoo.com,4675
5,duckduckgo.com,3898
6,m.youtube.com,3136
7,support.google.com,1181
8,www.netfind.com,1094


In [273]:
internal_referrers = hive.run("""
SELECT
  PARSE_URL(referer, "HOST") AS referring_domain,
  COUNT(*) as pageviews
FROM neilpquinn.kaios_wp_webrequest
WHERE
  geocoded_data["country"] = "India" AND
  agent_type = 'user' AND`
  is_pageview AND
  webrequest_source = 'text' AND
  referer_class = "internal" AND
  -- mandatory partition predicate
  year > 0
GROUP BY PARSE_URL(referer, "HOST")
HAVING pageviews > 1000
ORDER BY pageviews DESC
LIMIT 1000000
""")

In [275]:
internal_referrers.head()

Unnamed: 0,referring_domain,pageviews
0,en.m.wikipedia.org,2178723
1,hi.m.wikipedia.org,930978
2,www.wikipedia.org,755820
3,mr.m.wikipedia.org,50671
4,bn.m.wikipedia.org,48827


In [282]:
inter_wp_referrers = hive.run("""
SELECT
  PARSE_URL(referer, "HOST") AS referring_domain,
  uri_host AS destination_domain,
  COUNT(1) as pageviews
FROM neilpquinn.kaios_wp_webrequest
WHERE
  referer_class = "internal" AND
  PARSE_URL(referer, "HOST") LIKE "%.wikipedia.org" AND
  PARSE_URL(referer, "HOST") != uri_host AND
  geocoded_data["country"] = "India" AND
  agent_type = 'user' AND
  is_pageview AND
  webrequest_source = 'text' AND
  -- mandatory partition predicate
  year > 0
GROUP BY
  PARSE_URL(referer, "HOST"),
  uri_host
HAVING pageviews > 1000
ORDER BY pageviews DESC
LIMIT 1000000
""")

In [283]:
inter_wp_referrers

Unnamed: 0,referring_domain,destination_domain,pageviews
0,www.wikipedia.org,en.m.wikipedia.org,369211
1,www.wikipedia.org,hi.m.wikipedia.org,171802
2,www.wikipedia.org,es.m.wikipedia.org,23115
3,www.wikipedia.org,ru.m.wikipedia.org,18322
4,www.wikipedia.org,de.m.wikipedia.org,16907
5,www.wikipedia.org,ja.m.wikipedia.org,16068
6,www.wikipedia.org,fr.m.wikipedia.org,13636
7,hi.m.wikipedia.org,hi.wikipedia.org,12952
8,www.wikipedia.org,mr.m.wikipedia.org,12881
9,www.wikipedia.org,pl.m.wikipedia.org,12454


# User agents

In [245]:
user_agents_r = hive.run("""
SELECT
  user_agent,
  COUNT(*) as pageviews
FROM neilpquinn.kaios_wp_webrequest
WHERE
  agent_type = 'user' AND
  is_pageview AND
  webrequest_source = 'text' AND
  -- mandatory partition predicate
  year > 0
GROUP BY user_agent
HAVING pageviews > 1000
ORDER BY pageviews DESC
LIMIT 1000000
""")

In [246]:
user_agents = user_agents_r.copy()

likely_device_names = (
  user_agents
  .replace("\\$", "", regex=True)
  .replace(" LYF_F211S_[\\w-]*", "LYF/F211S", regex=True)
  .replace([
    "Mozilla/5.0\\s*",
    "Mobile;*\\s*",
    "Android;*\\s*",
    "rv:\\s*\\d{1,2}\\.\\d{1};*\\s*",
    "Firefox/\\d{1,2}\\.\\d{1}\\s*",
    "Gecko/\\d{1,2}\\.\\d{1}\\s*",
    "\\(",
    ";*\\s*\\)",
    "(KaiOS|KAIOS)/\\d{1,2}\\.\\d{1}\\.*\\d*\\.*\\d*",
    "/LYF[_-][\\w-]*"
  ], "", regex=True)
  .replace("_", " ", regex=True)
  ["user_agent"]
)

user_agents.insert(1, "likely_device_name", likely_device_names)

user_agents.head()

Unnamed: 0,user_agent,likely_device_name,pageviews
0,Mozilla/5.0 (Mobile; LYF/F220B/LYF-F220B-003-0...,LYF/F220B,10558958
1,Mozilla/5.0 (Mobile; LYF/F120B/LYF-F120B-001-0...,LYF/F120B,1934786
2,Mozilla/5.0 (Mobile; LYF/F90M/LYF_F90M_000-03-...,LYF/F90M,1400049
3,Mozilla/5.0 (Mobile; LYF/F271i/LYF_F271i-000-0...,LYF/F271i,981820
4,Mozilla/5.0 (Mobile; LYF/F221S/LYF_F221S_000-0...,LYF/F221S,454797


# Top pages

In [267]:
top_pages = hive.run("""
SELECT
  CONCAT(pageview_info["project"], ".org") AS site,
  pageview_info["page_title"] AS page,
  COUNT(*) AS pageviews
FROM neilpquinn.kaios_wp_webrequest
WHERE
  geocoded_data["country"] = "India" AND
  agent_type = 'user' AND
  is_pageview AND
  webrequest_source = 'text' AND
  -- mandatory partition predicate
  year > 0
GROUP BY pageview_info["project"], pageview_info["page_title"]
HAVING pageviews > 1000
ORDER BY pageviews DESC
LIMIT 1000000
""")

In [272]:
top_pages.head(10)

Unnamed: 0,site,page,pageviews
0,en.wikipedia.org,TikTok,710443
1,en.wikipedia.org,Special:Search,303028
2,en.wikipedia.org,Main_Page,279367
3,en.wikipedia.org,Gadar:_Ek_Prem_Katha,178909
4,en.wikipedia.org,Joseph_Plateau,147573
5,hi.wikipedia.org,कुमकुम_भाग्य,145918
6,hi.wikipedia.org,Special:Search,124303
7,hi.wikipedia.org,मुखपृष्ठ,113249
8,en.wikipedia.org,Karan_Arjun,108926
9,en.wikipedia.org,File:Tiktok_logo.svg,107125
