In [1]:
from pyspark.context import SparkContext
from pyspark.sql.functions import count, col, length, asc, udf, lit, floor, pow
from pyspark.sql.types import IntegerType, StringType, FloatType, BooleanType
from pyspark.sql import Row
import math
from itertools import combinations
from operator import add

In [2]:
print("Load the data")
users_queries_search_main_df = spark.read.option("header", "true") \
    .option("delimiter", "\t") \
    .csv("user-ct-test-collection-01.txt")

Load the data


In [3]:
print("Define global variables")
n1 = 2 # min num of rows to display
n2 = 20 # max num of rows to display
confidences = [0.6, 0.8, 0.9, 1]
min_num_of_chars_in_query = 2
min_num_of_queries = 6
min_num_of_queries_pair =  math.floor(min_num_of_queries * confidences[0])
levenshtein_distance_threshhold = 10
display_rules_num_of_records_threshhold = 200
stop_websites = 'google|gmail'
stop_queries = ['...', 'null', 'http', 'http;', 'htp', 'thttp', 'ww', 'www', 'www.', 'com', '.com', 'goole', 'goog', 'googl', 'gm..', 'g mail.com', 'g mail']

Define global variables


In [4]:
users_queries_df = users_queries_search_main_df.select('AnonID', 'Query')\
                    .drop_duplicates(subset=['AnonID', 'Query'])\
                    .filter((col('Query').rlike(stop_websites) == False) & (length(col("Query")) >= min_num_of_chars_in_query))\
                    .select(col('AnonID').alias('user'), col('Query').alias('query'))

stop_queries_df = spark.createDataFrame(stop_queries, StringType()).toDF("query")
unwanted_queries_df = users_queries_df.join(stop_queries_df, on='query' , how = 'inner').select('user', 'query')
users_queries_df = users_queries_df.subtract(unwanted_queries_df)

print("repartition users_queries_df by user column")
users_queries_df.repartition('user')

queries_count_df = users_queries_df.groupBy('query').agg(count("*").alias("count_query"))\
                    .filter("count_query >= " + repr(min_num_of_queries))

print("repartition queries_count_df by query column")
queries_count_df.repartition('query')

users_queries_count_df = users_queries_df.join(queries_count_df, on='query', how='inner')
num_of_users_queries_count = users_queries_count_df.count()
print("number of users queries count is: " + repr(num_of_users_queries_count))
users_queries_count_df.show(n1, truncate=False)

repartition users_queries_df by user column
repartition queries_count_df by query column
number of users queries count is: 309541
+-----+-------+-----------+
|query|user   |count_query|
+-----+-------+-----------+
|ako  |2706422|41         |
|ako  |9640439|41         |
+-----+-------+-----------+
only showing top 2 rows



In [5]:
users_pair_queries_count_rdd = users_queries_count_df.rdd.map(lambda line: (line[1], [line[0]]))\
                        .reduceByKey(add)\
                        .map(lambda line: tuple(combinations(line[1], 2)))\
                        .flatMap(lambda line: [(x, 1) for x in line])\
                        .reduceByKey(add)\
                        .filter(lambda line: line[1] >= min_num_of_queries_pair)

num_of_queries_pairs = users_pair_queries_count_rdd.count()
print('num of pair count queries = ' + repr(num_of_queries_pairs))
print(users_pair_queries_count_rdd.take(n2))

num of pair count queries = 69882
[(('love quotes', 'weather'), 3), (('yahoo games', 'old navy'), 3), (('freecycle', 'mapquest'), 3), (('msnbc.com', 'dogpile'), 3), (('lowes', 'air tran'), 3), (('mohegan sun', 'bankofamerica'), 3), (('remax', 'american express'), 4), (('macys', 'geico'), 3), (('dodge', 'ebay'), 7), (('newsday', 'qvc'), 3), (('target', 'nelly'), 3), (('bikes', 'ebay'), 4), (('www yahoo.com', 'www.my space.com'), 4), (('yahoo.com', 'www.vh1.com'), 7), (('nba.com', 'nba'), 7), (('msn.com', 'sbcglobal.net'), 4), (('avon', 'sears'), 7), (('ebay', 'ellen degeneres'), 3), (('dmv', 'kohls'), 3), (('food network', 'cingular wireless'), 4)]


In [6]:
queries_count_filtered_df = users_queries_count_df.select('query', 'count_query')\
                    .drop_duplicates(subset=['query', 'count_query'])\

num_of_queries_count_filtered = queries_count_filtered_df.count()
print("number of queries count filtered is: " + repr(num_of_queries_count_filtered))
queries_count_filtered_df.show(n1, truncate=False)

number of queries count filtered is: 15391
+-----+-----------+
|query|count_query|
+-----+-----------+
|ako  |41         |
|anime|26         |
+-----+-----------+
only showing top 2 rows



In [7]:
users_pair_queries_count_df = sqlContext.createDataFrame(users_pair_queries_count_rdd.map(lambda line: Row(query=line[0][0], query2=line[0][1], count_2_queries=line[1])))
num_of_pair_queries_count = users_pair_queries_count_df.count()
print('num of pair queries results = ' + repr(num_of_pair_queries_count))

num of pair queries results = 69882


In [8]:
print("Inner join query count to results data frame")
results_df = users_pair_queries_count_df.join(queries_count_filtered_df, on = 'query', how = 'inner')\
                                        .filter('count_2_queries / count_query >= ' + repr(confidences[0]))\
                                        .select('query', 'query2', 'count_2_queries', 'count_query')
num_of_results = results_df.count()
print('num of results = ' + repr(num_of_results))
results_df.show(n1, truncate=False)

Inner join query count to results data frame
num of results = 224
+-----------------+-------------+---------------+-----------+
|query            |query2       |count_2_queries|count_query|
+-----------------+-------------+---------------+-----------+
|craigslist boston|craigslist   |5              |7          |
|www.yaho.com     |www.yahoo.com|5              |7          |
+-----------------+-------------+---------------+-----------+
only showing top 2 rows



In [9]:
def levenshtein(s, t):
        ''' From Wikipedia article; Iterative with two matrix rows. '''
        if s == t: return 0
        elif len(s) == 0: return len(t)
        elif len(t) == 0: return len(s)
        v0 = [None] * (len(t) + 1)
        v1 = [None] * (len(t) + 1)
        for i in range(len(v0)):
            v0[i] = i
        for i in range(len(s)):
            v1[0] = i + 1
            for j in range(len(t)):
                cost = 0 if s[i] == t[j] else 1
                v1[j + 1] = min(v1[j] + 1, v0[j + 1] + 1, v0[j] + cost)
            for j in range(len(v0)):
                v0[j] = v1[j]
                
        return v1[len(t)]
    
def levenshtein_threshhold(distance_threshhold, confidence):
    return distance_threshhold * (1- pow(confidence, 2))

def conf(count_2_queries, count_query):
    return count_2_queries / count_query

def conf_level(conf):
    factor = math.floor(conf * 10)
    l = len(confidences)
    conf_level = 0
    i = 0
    while(i < l and conf >= confidences[i]):
        conf_level += math.floor(factor / math.floor(confidences[i] * 10))
        i += 1
    return conf_level

In [10]:
print("Add confidence column to result data frame")
func_conf_udf = udf(conf, FloatType())
results_df = results_df.withColumn('conf',func_conf_udf(results_df['count_2_queries'], results_df['count_query']))

print("Add confidence_level column to result data frame")
func_conf_level_udf = udf(conf_level, IntegerType())
results_df = results_df.withColumn('conf_level',func_conf_level_udf(results_df['conf']))

print("Add similarity column to result data frame")
func_levenshtein_udf = udf(levenshtein, IntegerType())
results_df = results_df.withColumn('similarity',func_levenshtein_udf(results_df['query'], results_df['query2']))

print("Filter queries with high similarity, using levenshtein algorithm")
results_df = results_df.filter(results_df['similarity'] >= levenshtein_threshhold(levenshtein_distance_threshhold, lit(results_df['conf'])))

print("Sort results")
results_df = results_df.orderBy(['conf_level', 'conf', 'similarity', 'count_2_queries', 'count_query'], ascending=False)

print("repartition results_df by conf_level column")
results_df.repartition('conf_level')

print("Count number of results")
num_of_results = results_df.count()

Add confidence column to result data frame
Add confidence_level column to result data frame
Add similarity column to result data frame
Filter queries with high similarity, using levenshtein algorithm
Sort results
repartition results_df by conf_level column
Count number of results


In [11]:
folder_name = 'related_searches'
print('Save results data frame as ' + folder_name)
results_df.coalesce(1).write\
            .partitionBy('conf_level')\
            .format("com.databricks.spark.csv")\
            .option("header", "true")\
            .mode("overwrite")\
            .save(folder_name)

Save results data frame as related_searches


In [12]:
# Display the dash character ('-') for a given length times
def displayMultipleCharacters(character, title, length):
    print(character * length)

# Display title, with dash decoration
def displayTitle(title):
    length = len(title)
    print("\n")
    displayMultipleCharacters('=', title, length)
    print(title)
    displayMultipleCharacters('=', title, length)
    print("\n")
    
# Display title, with dash decoration
def displaySubTitle(title):
    length = len(title)
    print(title)
    displayMultipleCharacters('-', title, length)

def display_results_list(results_df):
    result_list = [list(row) for row in results_df.collect()]
    previous_conf_level = 4
    count_current_conf_results = 0

    first_confidence = result_list[0][5] 
    if(first_confidence < 1):
        title = 'rules with confidence between ' + repr(first_confidence) + ' and 1'
    else:
        title = 'rules with confidence 1'
        
    displayTitle(title)
    
    for i in range(num_of_results):
        item = result_list[i]
        current_conf_level = item[5]
        if i > 0:
            previous_conf_level = result_list[i-1][5]

        count_current_conf_results += 1

        if previous_conf_level != current_conf_level:
            if i > 0:
                displaySubTitle('total: ' + repr(count_current_conf_results - 1) + ' rules')

            count_current_conf_results = 1
            displayTitle("rules with confidence between " +  repr(confidences[current_conf_level - 1]) + " and " + repr(confidences[current_conf_level]))

        print('{index:d}) {q1} ==> {q2}, conf={confidence:.3f}, #q1={q1_count}, #(q1 and q2)={combined_count}, similarity={similarity}'.format(index = count_current_conf_results, q1 = item[0], q2 = item[1], confidence = item[4],  q1_count = item[3], combined_count = item[2], similarity = item[6]))
        
        if i < num_of_results - 1:
            print('\n')

    if i == num_of_results - 1:
        displaySubTitle('\n\ntotal: ' + repr(count_current_conf_results) + ' rules')

if num_of_results < display_rules_num_of_records_threshhold:
    display_results_list(results_df)
else:
    results_df.show(n2, truncate=False)



rules with confidence 1


1) diconary ==> dictionary, conf=1.000, #q1=7, #(q1 and q2)=7, similarity=2


2) mapuest ==> mapquest, conf=1.000, #q1=14, #(q1 and q2)=14, similarity=1


3) mapquet ==> mapquest, conf=1.000, #q1=11, #(q1 and q2)=11, similarity=1


4) www.friendspayday.com ==> www.friendspaydy.com, conf=1.000, #q1=7, #(q1 and q2)=7, similarity=1


5) localhookup.com ==> localhookupz.com, conf=1.000, #q1=6, #(q1 and q2)=6, similarity=1


total: 5 rules
--------------


rules with confidence between 0.9 and 1


1) yhaoo ==> yahoo, conf=0.900, #q1=10, #(q1 and q2)=9, similarity=2


total: 1 rules
--------------


rules with confidence between 0.8 and 0.9


1) aol screen names ==> screen names, conf=0.875, #q1=8, #(q1 and q2)=7, similarity=4


2) www.sprint.com mynextel ==> mapquest, conf=0.857, #q1=7, #(q1 and q2)=6, similarity=20


3) letssingit ==> mycl.cravelyrics.com, conf=0.857, #q1=7, #(q1 and q2)=6, similarity=17


4) mysce.com ==> myspace, conf=0.857, #q1=7, #(q1 and q2

In [15]:
print("Free memory")
results_df.unpersist()

Free memory


DataFrame[query: string, query2: string, count_2_queries: bigint, count_query: bigint, conf: float, conf_level: int, similarity: int]