In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def df_ify(series, scols):
    """Turn a pandas Series into a two-column DataFrame of (index label, value).

    Parameters
    ----------
    series : pandas.Series
        Series whose index labels and values become the two output columns.
    scols : sequence of two str
        Column names: ``scols[0]`` names the column of index labels and
        ``scols[1]`` names the column of values.

    Returns
    -------
    pandas.DataFrame
        One row per element of ``series``, in the series' own order, with a
        default integer index.

    Notes
    -----
    The frame is built from (label, value) pairs instead of going through
    ``series.to_dict()`` and ``np.matrix``. The matrix round-trip forced both
    columns into a single dtype and only produced an N x 2 result when
    ``dict.keys()`` / ``dict.values()`` returned lists; the dict round-trip
    also collapsed rows that shared an index label. Here each column keeps
    its own dtype and every row of ``series`` is preserved.
    """
    pairs = list(zip(series.index, series.values))
    return pd.DataFrame(pairs, columns=list(scols))


# -----------------------------     YAMMER INTERNAL SEARCH     -----------------------------

## DATASETS/SQL SEARCHES

# Who ran a search
#->searchers = datasets["users_who_searched"]
    SELECT distinct(user_id) as searcher
    FROM   tutorial.yammer_events  
    WHERE event_name like '%search_run%'

# Counting up all search like events to get an overall sense
#->search_events = datasets["count_search_events"]  
    SELECT event_name, count(*) as cnt
    FROM   tutorial.yammer_events 
    WHERE event_name like '%search%'
    GROUP BY event_name

# Establishing which clicks belong to which searches
#->df = datasets["clicks_per_search"] 
    SELECT  all_search_clicks.user_id, 
            all_search_clicks.event_name as click, 
            all_search_clicks.occurred_at as clicktime,
            max(all_search_runs.occurred_at) as lastsearchtime
                FROM (SELECT * FROM tutorial.yammer_events AS events1 
                WHERE events1.event_name like '%search_click%') as all_search_clicks
                    JOIN (SELECT * FROM tutorial.yammer_events AS events2 
                                   WHERE events2.event_name = 'search_run') as all_search_runs
                    ON all_search_clicks.user_id = all_search_runs.user_id
                AND all_search_clicks.occurred_at > all_search_runs.occurred_at
                GROUP BY all_search_clicks.user_id, 
                         all_search_clicks.event_name, 
                         all_search_clicks.occurred_at
                ORDER BY all_search_clicks.user_id, 
                         all_search_clicks.occurred_at, 
                         all_search_clicks.event_name


# Gets all searches by user, consummated or not
#->searches_and_users = datasets["searches_and_users"] 
    SELECT  occurred_at as lastsearchtime,
            user_id,
            event_name as run
            FROM tutorial.yammer_events AS events2 
            WHERE events2.event_name = 'search_run'
            order by user_id


In [None]:
#DISCUSSION
#A] Can we assign meaning to how often people use search?
#
#    1) If they hardly use it, it could indicate that it doesn't work well  
#        or that they don't need it because the site is so easy to navigate 
#        or that they don't know it exists 
#        or that they're not aware that it's applicable to their particular needs 
#
#    2) Conversely, if they use it a lot, it could indicate that they like the functionality
#        or that the site is hard to navigate without it
#
#  More context, such as what people are searching for, is needed for an evaluation
#  of this question


#B] Can we assign meaning to how often people stop searching at the autocomplete phase?
#
#    1) If they do so, it could mean that it's a good shortcut to search 
#        - or that it's a way of browsing
#        - or that it's being used as an ad-hoc spellcheck
#        - or that it falsely gives the impression that a given search item will not be 
#             found, leading people to give up
#        - or that it's laggy, causing the log to record an autocomplete that the user 
#             didn't actually wait for
#
# Again, more context is needed for an evaluation


#C] Can we assign meaning to the number of searches that are consummated in a click?
#
#    1) If a search does not result in a click, it could mean that the user searched for an item
#            that doesn't exist
#        - or that the item does exist and search failed to find it
#        - or that search found it but that it was too deep in the results
#        - or that search found it but that the thumbnails in the results were not clear
#
#    2) If searches do result in clicks, however, it does become possible to examine some 
#       of these questions.  
#        - A single click-through suggests that the item was found
#        - The depth of the click-through is visible, permitting an evaluation of search's
#          ability to bring relevant results to the top
#        - The number of click-throughs per search are visible, permitting an evaluation of
#          quality of thumbnail presentation
#        ... among other observations that can be made
#
#  Some of these questions can be approached by looking at the relationships between
#  consummated searches and their associated clicks and by looking at the differences
#  between consummated and non-consummated searches


In [None]:
# --- Load the query results and print the headline search metrics ---
# Every print below is a single-argument print(...) so the cell emits the
# same text whether print is a statement (Python 2) or a function (Python 3).

# users_who_searched provides a single column list of users
searchers = datasets["users_who_searched"]
print('number of users who searched =  {}'.format(len(searchers)))

# search_events counts '%search%' like events. provides two columns [event_name] and [cnt]
search_events = datasets["count_search_events"]
# .item() raises unless exactly one row is named "search_run"
runs = search_events.loc[search_events['event_name'] == "search_run", 'cnt'].item()
so_clicks = search_events[search_events.event_name.str.contains('click')]
print('number of searches =  {}'.format(runs))

# df is a table that shows clicks and their associated searches
#     user_id   click                    clicktime              lastsearchtime
#     4         search_click_result_6    2014-05-27 15:10:38    2014-05-27 15:10:06
#     4         search_click_result_10   2014-05-27 15:10:57    2014-05-27 15:10:38
#     4         search_click_result_2    2014-05-27 15:11:28    2014-05-27 15:11:01
# where "lastsearchtime" is the timestamp of the search_run event that anchors
# a given group of click events
df = datasets["clicks_per_search"]

# One entry per search that received at least one click. A search is identified
# by (user_id, lastsearchtime): grouping on the timestamp alone would merge
# searches that two different users happened to run at the same instant.
# The user level is dropped afterwards so the result stays indexed by
# lastsearchtime only.
# NOTE(review): nunique() counts *distinct* result names clicked per search,
# while the average further down divides the total number of click rows by
# the number of searches -- confirm which definition of "clicks" is intended.
consumated_searches = (
    df.groupby(['user_id', 'lastsearchtime']).click.nunique()
      .reset_index(level='user_id', drop=True)
)

print('number of searches resulting in at least one click =  {}'.format(len(consumated_searches)))
pcent = 100.0 * len(consumated_searches) / float(runs)
# was `rounc(...)`, which raised NameError
print('     % of searches resulting in at least one click =  {}'.format(round(pcent, 2)))
avg = float(len(df)) / float(len(consumated_searches))
print('    average number of clicks per consumated search = {}'.format(round(avg, 2)))


# Reshape the "consumated_searches" Series into a two-column DataFrame, then
# tally how many searches fall under each click count.
c = df_ify(scols=['numsearches', 'numclicks'], series=consumated_searches)
cg = (
    c.groupby('numclicks')
     .count()
     .reset_index()
     .assign(numclicks=lambda tally: tally['numclicks'].astype('float'))
)

# Single-argument print(...) emits the same text under Python 2 and Python 3.
print("distribution of consumated searches by number of clicks:")
print(cg.sort_values('numclicks'))
print("*  People generally found what they wanted on the first click, ignoring")
print("*  where in the search results that link was found.  This tells us that")
print("*  the search result thumbnails are pretty good, though there's room for improvement\n")

# Per user: the number of distinct searches that were followed by a click.
users_who_click = df.groupby('user_id')['lastsearchtime'].nunique()

# '%s %s' reproduces the "label<space>value" layout of a comma-separated
# print statement while remaining a single-argument print(...).
print('%s %s' % (' average number of consumated searches  = ', round(users_who_click.mean(), 2)))
print('%s %s' % ('   fewest consumated searches by a user = ', users_who_click.min()))
print('%s %s' % ('     most consumated searches by a user = ', users_who_click.max()))

# Same reshaping as for searches: Series -> two-column frame -> tally of users
# for each count of consummated searches.
u = df_ify(scols=['num_users', 'num_consumated_searches'], series=users_who_click)
ug = (
    u.groupby('num_consumated_searches')
     .count()
     .reset_index()
     .assign(num_consumated_searches=lambda tally: tally['num_consumated_searches'].astype('float'))
)
print("distribution of users by number of consumated searches: ")
print(ug.sort_values('num_consumated_searches'))
print('*  Most people clicked through on a small number of searches, if at all ')

# Share of searching users who clicked through at least once, as a whole
# percent. Scale to percent *before* rounding: rounding the ratio first and
# multiplying by 100 afterwards can surface float noise
# (e.g. round(0.57, 2) * 100 == 56.99999999999999).
ratio_c_to_nonc = round(100.0 * len(users_who_click) / len(searchers), 0)

# The original statement was split across two lines without a continuation,
# which is a syntax error; it is emitted here as one line.
print('number of users who clicked through on searches = {} | % {} of those who tried'.format(
    len(users_who_click), ratio_c_to_nonc))

# The commentary strings below originally opened with one quote character and
# closed with the other (unterminated literals). Only the delimiters were
# repaired; the wording is unchanged.
print("*  Roughly half of all people using the search ")
print("   functionalty never clicked through on a search.")
print("*  It's not that they stopped at the autocomplete phase.  ")
print("   They ran searches but never clicked")
print("*  on any of the results. We don't know if searches that were ")
print("   non-consumated were failures of search, ")
print('*  or if the users were searching for something that wasn\'t there,')
print('*  but the even split between users who got something and those who got nothing at all')
print('*  suggests that the way the search functonality works may not make sense for a large')
print('*  number of people.  It may be that grouping search results by tabs is confusing')

# --- Per-user search success: how often does a search end in a click? ---

# gets all searches by user, consumated or not
searches_and_users = datasets["searches_and_users"]

# Attach clicks to the search that anchors them. The key includes user_id:
# joining on the timestamp alone would pair one user's clicks with another
# user's search whenever both searched at the same instant.
merged = pd.merge(searches_and_users, df, on=['user_id', 'lastsearchtime'], how="outer")

# merged looks like this: one row per click, plus a single all-NaN-click row
# for every search that got no click
#   lastsearchtime              user_id   run          click      clicktime
#   2014-06-19T15:08:05.000Z    4         search_run   NaN        NaN
#   2014-05-27T15:10:06.000Z    4         search_run   result_6   2014-05..

# Collapse to ONE row per search before counting. A search followed by three
# clicks appears three times in `merged`; counting those rows directly added
# clicks to searches and overstated both the number of attempts and the
# success rate of users who click.
per_search = merged.drop_duplicates(subset=['user_id', 'lastsearchtime'])
search_had_click = per_search['click'].notnull()

# These are, respectively, per-user counts of searches that got no click
# and of searches that got at least one click
null_by_user = per_search.loc[~search_had_click, 'user_id'].value_counts()
notnull_by_user = per_search.loc[search_had_click, 'user_id'].value_counts()

# One row per user; a user missing from either count gets 0 there.
# (The original called sort_index() on both Series and discarded the result;
# the sort is applied to the combined frame instead.)
jj = pd.DataFrame(data=dict(notclicked=null_by_user, clicked=notnull_by_user)).fillna(0).sort_index()
jj['wholes'] = jj.notclicked + jj.clicked      # total searches run by the user
jj['successes'] = jj.clicked / jj.wholes       # fraction of them that got a click

savvy = jj[jj.clicked != 0] # drop the ones where the user never succeeded
savvy_avg = round(savvy.successes.mean() * 100, 2)
# round(x, 0) keeps a float on every Python version and tolerates NaN
savvy_avg_attempts = round(savvy.wholes.mean(), 0)

notsavvy = jj[jj.clicked == 0]
notsavvy_avg_attempts = round(notsavvy.wholes.mean(), 0)

print("Those who ran searches and clicked through at least once:")
print("  -> clicked through, on average, {} percent of the time.".format(savvy_avg))
print("  -> attempted searches, on average, {} times".format(savvy_avg_attempts))
print("Those who never clicked through:")
print("  -> attempted searches, on average, {} times".format(notsavvy_avg_attempts))

# The closing remark used to be a hard-coded "roughly trippled", a conclusion
# read off the click-inflated counts. It is now computed from the data.
attempt_ratio = savvy.wholes.mean() / notsavvy.wholes.mean()
print("*  Users who clicked through at least once ran {:.1f}x as many searches "
      "as users who never clicked through".format(attempt_ratio))

# --- How deep in the result list do clicks land (all clicks, not just first)? ---
print("#### distribution of depth of all clicks")

# one row per click event name, with the number of times it occurred
vc = df.click.value_counts().to_frame()

# The depth is the number embedded in the event name, e.g.
# "search_click_result_6" -> 6.0. Raw string: '\d' in a plain string literal
# is an invalid escape sequence that newer Pythons warn about.
vc['depth'] = vc.index.str.extract(r'(\d+)', expand=False)
vc['depth'] = vc['depth'].astype('float')
print(vc.sort_values('depth'))

# --- Depth of the FIRST click that follows each search ---

# Depth is the number embedded in the click's event name
# (e.g. "search_click_result_6" -> 6.0).
click_depth = df['click'].str.extract(r'(\d+)', expand=False).astype('float')

# making a list of people's first clicks: the earliest click of every
# (user_id, lastsearchtime) search. The explicit sort removes the dependence
# on the row order in which the query happened to deliver `df`, and keying on
# user_id as well keeps two users' same-instant searches apart.
# drop_duplicates keeps the first row of each key.
all_first_clicks = (
    df.assign(depth=click_depth)
      .sort_values(['user_id', 'lastsearchtime', 'clicktime', 'click'])
      .drop_duplicates(subset=['user_id', 'lastsearchtime'])
)

# Per-user mean of first-click depth. Only `depth` is averaged: taking
# .mean() over the whole frame also hits the string columns, which newer
# pandas versions refuse to average.
afc = (
    all_first_clicks.groupby('user_id')['depth'].mean()
    .reset_index()
    .rename(columns={'depth': 'avg_depth'})
)

# average of the user averages of first click depths
avg_of_avges = str(round(afc.avg_depth.mean(), 2))
print("\n### average depth of first click per consumated search = " + avg_of_avges)

# rounding averages to whole number to make distribution
# (this is a distribution over users of their *average* first-click depth,
# not over individual first clicks)
afc['avg_depth'] = afc['avg_depth'].round()
avc = afc.avg_depth.value_counts()
print("### distribution of first click depths (not what one would expect)")
print(avc)

##### quick visual inspection --- checks out
#m = pd.merge(all_first_clicks, afc, left_on='user_id', right_on='user_id', how="outer")
#m.sort_values('user_id')
#####



In [None]:
|
|  number of users who searched = 2178
|  number of searches = 13019
|

|
|  number of searches resulting in at least one click = 3781
|       % of searches resulting in at least one click = 29.0
|      average number of clicks per consumated search = 2.58
|


|
#  distribution of consumated searches by number of clicks:
|
|  numclicks  numsearches
|        1.0         2173
|        2.0          608
|        3.0          292
|        4.0          237
|        5.0          187
|        6.0          130
|        7.0           89
|        8.0           37
|        9.0           19
|        10.0           9
|
*  People generally found what they wanted on the first click, ignoring
*  where in the search results that link was found.  This suggests that
*  the search result thumbnails are ok, though there is room for improvement
|


|
| average number of consumated searches  = 3.24
|   fewest consumated searches by a user = 1
|     most consumated searches by a user = 20
|


|
#  distribution of users by number of consumated searches: 
|
|  num_consumated_searches  num_users
|                      1.0        343
|                      2.0        264
|                      3.0        183
|                      4.0        106
|                      5.0         76
|                      6.0         68
|                      7.0         45
|                      8.0         27
|                      9.0         26
|                      10.0         8
|                      11.0        10
|                      12.0         4
|                      13.0         4
|                      14.0         1
|                      16.0         2
|                      17.0         1
|                      18.0         1
|                      20.0         1
|
*  Most people clicked through on a small number of searches, if at all 
|


|
|  number of users who clicked through on searches = 1170 | % 54.0 of those who tried
|
*  Roughly half of all people using the search functionality never clicked through on a search.
*  It's not that they stopped at the autocomplete phase.  They ran searches but never clicked
*  on any of the results. We don't know if searches that were non-consummated were failures of
*  search or if the users were searching for something that wasn't there,
*  but the even split between users who got something and those who got nothing at all
*  suggests that the way the search functionality works may not make sense for a large
*  number of people.  It may be that grouping search results by tabs is confusing.
|


|
|  Those who ran searches and clicked through at least once:
|     -> clicked through, on average, 63.6 percent of the time.
|     -> attempted searches, on average, 13.0 times
|  Those who never clicked through:
|     -> attempted searches, on average, 4.0 times
|
*  Being able to succeed in search roughly trippled the number of searches people did
|


|
#  distribution of depth of all clicks
|
|                         clicks  depth
|  search_click_result_1    1412    1.0
|  search_click_result_2    1496    2.0
|  search_click_result_3    1133    3.0
|  search_click_result_4    1264    4.0
|  search_click_result_5     967    5.0
|  search_click_result_6     805    6.0
|  search_click_result_7     709    7.0
|  search_click_result_8     690    8.0
|  search_click_result_9     784    9.0
|  search_click_result_10    506   10.0
|
*  Broad distribution.  Not necessarily surprising if people search
*  on 'Bob' or 'Alice' and the number of results is large.  More surprising if
*  people search on 'Bob Pendergrass'.  Search contents would inform these
*  numbers
|


|
|  average depth of first click per consumated search = 4.65
|
#  distribution of first click depths 
|
|  4.0     272
|  5.0     230
|  6.0     191
|  2.0     129
|  3.0     127
|  7.0      61
|  1.0      51
|  8.0      48
|  9.0      39
|  10.0     21
|
*  Surprising.  Although by far the majority of searches that were followed
*  by clicks were only followed by one click, that event is likely to be a few
*  pages into the search.  
*  Also, although I am not showing it here, as one looks at the average depth of 
*  the 2nd click, 3rd click and so on, the distribution of pages evens out. The
*  impression is that people are flipping around, unsure where to click.
|
