In [1]:
from os import environ
import json
from pyspark.sql import SparkSession
from pathlib import Path
import pandas as pd
import yaml

environ[
    'PYSPARK_PYTHON'] = "/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/venv/bin/python"
session = SparkSession.builder \
    .master("yarn") \
    .appName("obscene_queries") \
    .config("spark.executor.instances", 3) \
    .getOrCreate()

In [2]:
sc = session.sparkContext
sc

In [3]:
# TODO: For final evaluation, use the full corpus.
# corpus_dir = Path("/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/focused/corpus")
# queries_dir = corpus_dir / "queries-2023-02-14"
corpus_dir = Path(
    "/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/focused/sample-corpus")
queries_dir = corpus_dir / "queries-2023-02-16"
queries_dir

PosixPath('/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/focused/sample-corpus/queries-2023-02-16')

In [4]:
figures_dir = Path("figures")
figures_dir.mkdir(parents=True, exist_ok=True)

In [5]:
def adapt_query(query: dict) -> tuple:
    query_text = query["url_query"].lower()
    query_text = query_text.replace('+', ' ').replace('-', ' ').replace('|', ' ').replace('_', ' ')
    query_text = query_text.replace('[^\w\s]','')
    return query["service"],   query["language"], query_text

In [6]:
# Get all queries to construct a  set of query terms
queries = sc.textFile(f"file://{queries_dir}") \
    .map(lambda line: json.loads(line)) \
    .filter(lambda query: query["url_query"] is not None) \
    .keyBy(lambda query: query["id"]) \
    .mapValues(adapt_query)

In [7]:
# Use list of obscene words (https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words)
with open("obscene_words.json", "r") as file:
    obscene_words = json.load(file)

In [9]:
def check_obscenity(row: tuple, obscene_words: dict) -> tuple:
    language = row[1][1]
    words = obscene_words[language] if language in obscene_words else obscene_words["en"] 
    obscene = False
    tokens = row[1][2].split()
    if any(token in words for token in tokens):
        obscene = True
    return *row, obscene

In [10]:
# Check all queries for obscenity in their respective language (and english as fallback)
checked_queries = queries \
    .map(lambda row: check_obscenity(row, obscene_words)) \
    .countByValue()

In [11]:
df = pd.DataFrame([
    {
        "service": service,
        "language": language,
        "query": query,
        "obscene": obscene,
        "count": count
    }
    for (_, (service, language, query), obscene), count in checked_queries.items()
])

In [12]:
yaml_path = "../data/selected-services.yaml"
with open(yaml_path, "r") as file:
    yaml_list = yaml.safe_load(file)
    
category_dict = {}
for elem in yaml_list:
    category_dict[elem["name"]] = elem["category"]

In [13]:
# TODO: Standardize naming
df.loc[df["service"] == "search", "service"] = "search-ch"
df.loc[df["service"] == "mail.ru", "service"] = "mail"
df["category"] = df.apply(lambda row: category_dict[row["service"]], axis=1)

In [14]:
non_obscene_service = df.loc[~df["obscene"]].groupby("service")["count"].sum()
obscene_service = df.loc[df["obscene"]].groupby("service")["count"].sum()

service_df = pd.concat([obscene_service.rename("obscene"), non_obscene_service.rename("non-obscene")], join="outer", axis=1)
service_df = service_df.reset_index()
service_df.fillna(0, inplace=True)

In [15]:
non_obscene_category = df.loc[~df["obscene"]].groupby("category")["count"].sum()
obscene_category = df.loc[df["obscene"]].groupby("category")["count"].sum()
category_df = pd.concat([obscene_category.rename("obscene"), non_obscene_category.rename("non-obscene")], join="outer", axis=1)
category_df = category_df.reset_index()
category_df.fillna(0, inplace=True)

In [17]:
service_df["share"] = service_df["obscene"] / (service_df["obscene"] + service_df["non-obscene"])
category_df["share"] = category_df["obscene"] / (category_df["obscene"] + category_df["non-obscene"])
category_df.sort_values("share", ascending=False, inplace=True)
service_df.sort_values("share", ascending=False, inplace=True)

In [156]:
category_df.to_csv("./figures/obsenity_categories.csv")
service_df.to_csv("./figures/obsenity_services.csv")

In [20]:
category_df.head()

Unnamed: 0,category,obscene,non-obscene,share
9,pornography,1637.0,6947,0.190704
13,torrent,10.0,258,0.037313
5,forum,5.0,169,0.028736
6,gaming,1.0,71,0.013889
2,database,5.0,379,0.013021


In [21]:
service_df.head()

Unnamed: 0,service,obscene,non-obscene,share
48,pornez,1.0,0.0,1.0
73,xxxhdvideo,6.0,4.0,0.6
17,bongacams,38.0,32.0,0.542857
72,xvideos2,1.0,1.0,0.5
10,ashemaletube,21.0,22.0,0.488372
