In [1]:
from pyspark.context import SparkContext
from pyspark.sql.functions import concat_ws, collect_list, countDistinct, count, col, expr, size, asc, broadcast
from pyspark.ml.feature import StringIndexer

In [2]:
# Read the tab-delimited query log; the first line of the file is used as the header.
# NOTE(review): `spark` is not created in this notebook — presumably the session
# object is injected by the PySpark kernel; confirm before running elsewhere.
print("Load the data")
users_queries_search_main_df = (
    spark.read
    .options(header="true", delimiter="\t")
    .csv("user-ct-test-collection-01.txt")
)

Load the data


In [3]:
print("Define global variables")

# Number of rows printed by the `show(n, ...)` previews.
n = 2

# Thresholds used by the count filters further down; both are compared with a
# strict ">".
min_num_of_queries = 2
min_num_of_queries_pair = 1

# Query strings that are excluded before any counting takes place.
empty_queries = ['-', 'null']

# Consecutive entries are the edges of the confidence bands reported at the end.
confidences = [0.6, 0.8, 0.9, 1]

# Only referenced from commented-out code in this notebook.
top_num_of_results = 10

Define global variables


In [4]:
# Preview the raw log and record how many rows it has before any filtering.
users_queries_search_main_df.show(n=n, truncate=False)

total_num_of_rows = users_queries_search_main_df.count()
print("total number of rows, before filtering is: " + repr(total_num_of_rows))

+------+---------------------------+-------------------+--------+--------+
|AnonID|Query                      |QueryTime          |ItemRank|ClickURL|
+------+---------------------------+-------------------+--------+--------+
|142   |rentdirect.com             |2006-03-01 07:17:12|null    |null    |
|142   |www.prescriptionfortime.com|2006-03-12 12:31:06|null    |null    |
+------+---------------------------+-------------------+--------+--------+
only showing top 2 rows

total number of rows, before filtering is: 3558411


In [5]:
# One row per distinct (AnonID, Query) pair, without the placeholder queries.
# NOTE(review): under SQL three-valued logic a NULL Query also fails this
# predicate and is dropped — confirm that is intended.
users_queries_search_filtered = (
    users_queries_search_main_df
    .select('AnonID', 'Query')
    .drop_duplicates(subset=['AnonID', 'Query'])
    .filter(~col('Query').isin(empty_queries))
)

In [6]:
# Preview the de-duplicated (AnonID, Query) pairs.
users_queries_search_filtered.show(n=n, truncate=False)

+------+----------------------------+
|AnonID|Query                       |
+------+----------------------------+
|1337  |michael keaton date of birth|
|2334  |disneychanne.com            |
+------+----------------------------+
only showing top 2 rows



In [7]:
# Row count once duplicates and placeholder queries have been removed.
num_of_rows = users_queries_search_filtered.count()

print("number of rows, after filtering is: " + repr(num_of_rows))

number of rows, after filtering is: 1659839


In [8]:
# Count the rows per query and keep only the queries whose count is strictly
# greater than min_num_of_queries.
queries_count_df = (
    users_queries_search_filtered
    .groupBy('Query')
    .agg(count("*").alias("count"))
    .filter("count > " + repr(min_num_of_queries))
)

In [None]:
# Preview the per-query counts that passed the threshold.
queries_count_df.show(n=n, truncate=False)

+------------------+-----+
|Query             |count|
+------------------+-----+
|game house        |3    |
|www.capitalone.com|112  |
+------------------+-----+
only showing top 2 rows



In [None]:
# Build the Query -> numeric index dictionary.
#
# zipWithIndex numbers rows in partition order, and the row order produced by the
# preceding groupBy is not guaranteed to be stable. Because this DataFrame is lazy
# and is re-evaluated by every later action and join, an unsorted input could hand
# out different indexes on different evaluations. Sorting by Query (unique here:
# one row per groupBy key) makes the mapping reproducible, and cache() avoids
# recomputing it for each of the downstream joins.
queries_dict_df = (
    queries_count_df
    .orderBy('Query')
    .rdd
    .zipWithIndex()
    .map(lambda line: (line[0][0], line[1]))  # (Row, idx) -> (query text, idx)
    .toDF()
    .select(col('_1').alias('Query'), col('_2').alias('index'))
    .cache()
)

In [None]:
# Preview the query dictionary (Query text and its numeric index).
queries_dict_df.show(n=n, truncate=False)

In [None]:
# Number of entries in the query dictionary.
num_of_queries = queries_dict_df.count()

print("number of queries, after filtering is: " + repr(num_of_queries))

In [None]:
# Replace every query string by its dictionary index. The join is an inner join,
# so (AnonID, Query) rows whose query is not in queries_dict_df are dropped.
users_queries_indexed_df = users_queries_search_filtered.join(
    queries_dict_df,
    queries_dict_df.Query == users_queries_search_filtered.Query,
).select('AnonID', 'index')

# Sort only the preview. The DataFrame itself is reused by the joins and
# aggregations below, where row order is irrelevant; baking a global sort into it
# would only add a shuffle every time it is re-evaluated.
users_queries_indexed_df.orderBy("AnonID", ascending=True).show(20, truncate=False)

In [None]:
# Second copy of the (AnonID, index) table with suffixed column names, so both
# sides can be told apart by name in the self-join.
users_queries_indexed_df2 = (
    users_queries_indexed_df
    .withColumnRenamed('AnonID', 'AnonID2')
    .withColumnRenamed('index', 'index2')
)

In [None]:
# Pair up the queries issued by the same user.
# The join has an explicit condition, so it is an inner join rather than a
# cartesian product; `how='inner'` states that directly and returns the same rows
# as the previous `how='cross'` + condition.
# `index > index2` keeps each unordered pair exactly once and drops self-pairs.
# NOTE(review): only the orientation with the larger index first is kept, and the
# later confidence divides by the count of query1 alone, so the rule in the
# opposite direction is never evaluated — confirm this is intended.
users_queries_self_joined_df = users_queries_indexed_df.join(
    users_queries_indexed_df2,
    on=[col('AnonID') == col('AnonID2'), col('index') > col('index2')],
    how='inner',
)

In [None]:
# Keep one user column plus the two query indexes, renamed to user/query1/query2.
users_queries_self_joined_df = (
    users_queries_self_joined_df
    .select('AnonID', 'index', 'index2')
    .toDF('user', 'query1', 'query2')
)
users_queries_self_joined_df.show(n=20, truncate=False)

In [None]:
# Total number of (user, query1, query2) rows produced by the self-join.
num_of_pairwise_queries = users_queries_self_joined_df.count()

print("number of pairwise queries is: " + repr(num_of_pairwise_queries))

In [None]:
# Count the rows per (query1, query2) pair and keep the pairs whose count is
# strictly greater than min_num_of_queries_pair.
queries_pair_count_df = (
    users_queries_self_joined_df
    .groupBy('query1', 'query2')
    .agg(count("*").alias("count_2_queries"))
    .filter("count_2_queries > " + repr(min_num_of_queries_pair))
)

In [None]:
# Preview the pair counts that passed the threshold.
queries_pair_count_df.show(n=20, truncate=False)

In [None]:
# Per-query counts keyed by dictionary index instead of by query text.
# The join is a full outer join on the query text, so a row without a partner on
# the other side would surface with a NULL index or a NULL count.
queries_count_indexed_df = (
    queries_count_df
    .join(
        queries_dict_df,
        queries_dict_df.Query == queries_count_df.Query,
        how='full',
    )
    .select('index', 'count')
)

In [None]:
# Preview the (index, count) table.
queries_count_indexed_df.show(n=20, truncate=False)

In [None]:
# Attach the individual count of query1 to every query pair, giving the columns
# query1, query2, count_2_queries, index, count.
# The `index` column duplicates query1 but is kept on purpose: a later cell reads
# the joined rows by position, so the column layout must not change.
result_df = queries_pair_count_df.join(
    queries_count_indexed_df,
    on=[col('query1') == col('index')],
    how='inner',
)

In [None]:
# Preview the pairs together with the count of query1.
result_df.show(n=20, truncate=False)

In [None]:
# Number of candidate pairs before any confidence threshold is applied.
num_of_results = result_df.count()

print("number of results is: " + repr(num_of_results))

In [None]:
# Pairs whose confidence (count_2_queries / count) exceeds the lowest configured
# threshold. The bound is taken from `confidences` rather than repeated as a
# literal, so this cell stays in step with the config cell.
final_result_df = result_df.filter('count_2_queries / count > ' + repr(confidences[0]))

In [None]:
# Preview the pairs above the lowest confidence threshold.
final_result_df.show(n=20, truncate=False)

In [None]:
# Split the candidate pairs into confidence bands (conf, next_conf], where the
# confidence is count_2_queries / count and the band edges are consecutive
# entries of `confidences`. For each band the DataFrame and its row count are
# stored, and the first 20 rows are printed.
results_df = []
results_count = []
for i in range(len(confidences) - 1):
    conf = confidences[i]
    next_conf = confidences[i + 1]
    current_result_df = result_df.filter('count_2_queries / count > ' + repr(conf) + ' and count_2_queries / count <= ' + repr(next_conf))
    # DataFrames are immutable: orderBy returns a new DataFrame, so it has to be
    # rebound — otherwise the sort is silently discarded and head(20) below
    # prints rows in arbitrary order.
    current_result_df = current_result_df.orderBy(['count_2_queries', 'count'], ascending=False)
    results_df.append(current_result_df)
    confidence_count = current_result_df.count()
    results_count.append(confidence_count)
    print(current_result_df.head(20))
    print("\nNumber of rules with confidence between " + repr(conf) + " and " + repr(next_conf) + " is: " + repr(confidence_count) + "\n\n")

In [None]:
joined_results_df = []
results_dict_list = []
for i in range(len(confidences) - 1):
    current_result_df = results_df[i]
    current_result_joined1_df = queries_dict_df.join(broadcast(current_result_df),queries_dict_df.index == current_result_df.query1)
    current_result_joined2_df = queries_dict_df.join(broadcast(current_result_df),queries_dict_df.index == current_result_df.query2)
    current_result_joined_df = current_result_joined1_df.join(current_result_joined2_df, (current_result_joined1_df.query1 == current_result_joined2_df.query1) & (current_result_joined1_df.query2 == current_result_joined2_df.query2), how='inner')
    joined_results_df.append(current_result_joined_df)
    results_dict_list.append(current_result_joined_df.rdd.map(lambda line: (line[0], line[7])).filter(lambda line: line[0] != line[1]).collectAsMap())

In [None]:
for i in range(len(confidences) - 1):
    conf = confidences[i]
    next_conf = confidences[i + 1]
    str_rule = 'rule'
    if(results_count[i] != 1):
        str_rule += 's'
    print("\n" + repr(results_count[i]) + " " + str_rule + " with confidence between " + repr(conf) + " and " + repr(next_conf) + ":\n\n")
    for k, v in results_dict_list[i].items():
        print(k + " ==> " + v + "\n")