# Find pornographic queries in non-pornographic services
This notebook uses the metric nTIC proposed by Suronov et al. (http://www.isa.ru/arxiv/2013/2013_SPECOM.pdf) to evaluate whether a query is likely to be pornographic

## Preparation

In [54]:
# TODO: Find different source for non-sexual queries. Using the queries from non-pornographic services likely leads to bias as they contain porn queries themselves

from os import environ
import pandas as pd
import math
from pathlib import Path
from json import loads
import numpy as np

In [4]:
from pyspark.sql import SparkSession

environ[
    'PYSPARK_PYTHON'] = "/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/venv/bin/python"
session = SparkSession.builder \
    .master("yarn") \
    .appName("pornographic_queries") \
    .config("spark.executor.instances", 3) \
    .getOrCreate()



In [5]:
sc = session.sparkContext
sc

In [6]:
# TODO: For final evaluation, use the full corpus.
# corpus_dir = Path("/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/focused/corpus")
# queries_dir = corpus_dir / "queries-2023-02-14"
corpus_dir = Path(
    "/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/focused/sample-corpus")
queries_dir = corpus_dir / "queries-2023-02-16"
queries_dir

PosixPath('/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/focused/sample-corpus/queries-2023-02-16')

In [7]:
figures_dir = Path("figures")
figures_dir.mkdir(parents=True, exist_ok=True)

## Load and prepare queries

In [8]:
def adapt_query(query: dict) -> tuple:
    query_text = query["url_query"].lower()
    query_text = query_text.replace('+', ' ').replace('-', ' ').replace('|', ' ').replace('_', ' ')
    query_text = query_text.replace('[^\w\s]','')
    return query["service"],   query["language"], query_text

In [9]:
# Get all queries to construct a  set of query terms
queries = sc.textFile(f"file://{queries_dir}") \
    .map(lambda line: loads(line)) \
    .filter(lambda query: query["url_query"] is not None) \
    .keyBy(lambda query: query["id"]) \
    .mapValues(adapt_query) \
    .countByValue()

In [10]:
df = pd.DataFrame([
    {
        "service": service,
        "language": language,
        "query": query,
    }
    for (_, (service, language, query)), _ in queries.items()
])

In [12]:
import yaml
yaml_path = "../data/selected-services.yaml"
with open(yaml_path, "r") as file:
    yaml_list = yaml.safe_load(file)
    
porn_services = [elem["name"] for elem in yaml_list if elem["category"] == "pornography"]

In [67]:
# Creating a dataframes of queries from pornographic/other services
df_porn = df.loc[df["service"].isin(porn_services)]
df_porn = df_porn.loc[df_porn["service"] != "livejasmin"]
df_porn = df_porn.loc[df_porn["query"] != ""]

df_non_porn = df.loc[~df["service"].isin(porn_services)]
df_non_porn = df_non_porn.loc[df_non_porn["query"] != ""]

## Calculate TIC values for queries from non-pornographic services
- Generate a set of terms in all queries
- Calculate IDF-values for each term and a corpus of (non-)pornographic queries
- Caclulate nTIC values for all non-pornographic queries (Higher values indicates higher similarity to pornographic queries)

### Set of terms

In [14]:
def get_term_set(df: pd.DataFrame):
    query_lists = [str(query).split() for query in list(df["query"])]
    query_terms = [term for query_list in query_lists for term in query_list]
    return set(query_terms)

In [68]:
def count_queries_with_term(query_lst, term):
    count = 0
    for query in query_lst:
        if term in query:
            count += 1
    return count

# Get all queries from both df's
query_lst_porn = list(df_porn["query"])
query_lst_non_porn = list(df_non_porn["query"])

# Get all unique terms used in both dataframes
term_set = get_term_set(df_porn).union(get_term_set(df_non_porn))
term_rdd = sc.parallelize(list(term_set))

### IDF values

In [69]:
# For each term, count how often it occurs in the two query lists
counts_rdd_porn = term_rdd.map(lambda term: (term, count_queries_with_term(query_lst_porn, term), len(query_lst_porn)))
counts_rdd_non_porn = term_rdd.map(lambda term: (term, count_queries_with_term(query_lst_non_porn, term), len(query_lst_porn)))

In [70]:
# Calculate the IDF values based on the count results
idf_porn_rdd = counts_rdd_porn.map(lambda x: (x[0], math.log(x[2]/(x[1]+1), x[2])))
idf_non_porn_rdd = counts_rdd_non_porn.map(lambda x: (x[0], math.log(x[2]/(x[1]+1), x[2])))

In [71]:
idf_porn = {x[0]:x[1] for x in idf_porn_rdd.collect()}
idf_non_porn = {x[0]:x[1] for x in idf_non_porn_rdd.collect()}

### nTIC values

In [60]:
def ltf(term: str, query: str):
    # Create a space split array and a weighting array
    query_arr = np.array(str(query).split())
    weight_arr = np.array([1/np.log(x+2) for x in range(query_arr.shape[0])])
    # Get positional weights for each occurrence of the term in the query
    matched = np.where(query_arr==term, weight_arr, np.zeros_like(weight_arr))
    # Return lTF value
    return math.log(1 + np.sum(matched), 1 + np.sum(weight_arr))

def ltf_idf(term: str, query: str, idf_dict: dict):
    return ltf(term=term, query=query)*idf_dict[term]

def term_TIC(term: str, query: str, idf_porn: dict, idf_non_porn: dict):
    delta_info = idf_non_porn[term] - idf_porn[term]
    delta_info_pos = np.heaviside(delta_info, 0) * delta_info
    return ltf(term=term, query=query)*delta_info_pos

def nTIC(query: str, idf_porn: dict, idf_non_porn: dict):
    tic = 0
    query_info = 0
    for term in str(query).split():
        tic += term_TIC(term=term, query=query, idf_porn=idf_porn, idf_non_porn=idf_non_porn)
        query_info += ltf_idf(term=term, query=query, idf_dict=idf_non_porn)
    try:
        nTIC = tic/query_info
    except:
        nTIC = 0
    return nTIC

In [94]:
non_porn_rdd = sc.parallelize(df_non_porn.to_dict("records"))
nTIC_rdd = non_porn_rdd.map(lambda row: {**row, **{"nTIC": nTIC(row["query"], idf_porn, idf_non_porn)}})

In [101]:
df_ntic = pd.DataFrame(nTIC_rdd.collect())

In [107]:
df_ntic.sort_values("nTIC", ascending=False, inplace=True)

## Analysis

In [109]:
rel = df_ntic.loc[df_ntic["nTIC"]>0.07]

In [112]:
df_ntic.loc[df_ntic["query"].str.contains("sex")]

Unnamed: 0,service,language,query,nTIC
148070,yahoo,fr,amateur sex porn,0.273191
77994,google,,"hardcore animal sex,dog porn",0.101485
103272,bing,en,"bisexual, gay, and lesbian association",0.070101
59333,yahoo,,asian phone sex,0.059598
59338,yahoo,en,asian anal sex gallery,0.045571
...,...,...,...,...
148384,wikimedia,en,sexuality,0.000000
148444,all-free-download,,sexy,0.000000
150232,bing,en,only sexi grannies in gonjog,0.000000
151249,yahoo,lb,women sex,0.000000
