In [1]:
from functools import partial
from gensim import corpora, models
import matplotlib.pyplot as plt
import multiprocessing
import numpy as np
import os
import pandas as pd

In [2]:
def load_data(filepath):
    #INPUT: filepath to csv file
    #OUTPUT: returns dataframe of filepath's csv file
    pre_df = pd.read_csv(filepath, header = 1)
    return pre_df

In [4]:
def data_to_path(pre_df, qty):
    #INPUT: pre_df, dataframe
    #INPUT: shortener, quantity to cut data (to run faster)

    # pre_df = pre_df_and_qty[0]
    # qty = pre_df_and_qty[1]

    #Create a numpy array of user journeys
    paths = np.array([ 'Path'])
    for i in range(2, qty):
        #select random row without replacement
        #range starts at row 3 to not include headers
        row_ind = np.random.choice(range(3, len(pre_df)), replace = False)
        #extract path from row
        path = list(str(pre_df.iloc[row_ind, :]).split())[1]
        #add path to paths numpy array
        paths = np.vstack((paths, path))

    for journey in range(len(paths)):
        paths[journey] = paths[journey][0].replace('->', ' ')
    #transpose data so that each journey is no longer a new column
    #after this transpose, each journey is a row
    paths = np.transpose(paths)
    return paths

In [29]:
def paths_to_docs(path):
    #INPUT: path, output of data_to_paths() function
    #OUTPUT: words, a list of documents (list of lists of words)
    words_list = []
    for val in path[0]:
       words_list.append(val.split())
    return words_list[1:]

In [32]:
def doc_combine(words_list):
    #INPUT: list list of words (output of paths_to_docs() function)
    #OUTPUT: list of list of word transitions
    for doc_i, doc in enumerate(words_list):
        #Check to see if document contains more than 1 word
        if len(doc) > 1:
            #if doc contains 2+ words, iterate through all except last word
            for word_i in range(len(doc)-1):
                #convert each word to that word + the next word
                words_list[doc_i][word_i] = str(words_list[doc_i][word_i])+ ' ' + str(words_list[doc_i][word_i + 1])
    return words_list
                

In [5]:
pre_df = load_data('../../data/Top_Traversals_demo-1daybehavior_20140401.csv')
pre_df.head()

  if self.run_code(code, result):


Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,2014-04-01
Rank,Sessions,Total Sessions,% of Total Sessions,Paths,% of Total Paths,Path
1,792522,4802851,0.16501074049559314,792522,0.16501074049559314,journey.entry->journey.exit
2,413008,4802851,0.08599225751537992,413008,0.08599225751537992,journey.entry->web.entry->webevent.login->webe...
3,283578,4802851,0.05904368051392808,283578,0.05904368051392808,journey.entry->web.entry->webevent.login->webe...
4,241868,4802851,0.05035925536728081,241868,0.05035925536728081,journey.entry->web.entry->webevent.login->webe...


In [22]:
paths = data_to_path(pre_df, 100)
print paths[0][:10]

['Path' 'journey.entry web.entry webevent.login webe...'
 'journey.entry mobile.entry mobileevent.login...'
 'journey.entry web.entry webevent.login webe...'
 'journey.entry reward web.entry webevent.log...'
 'journey.entry web.entry webevent.view'
 'journey.entry web.entry webevent.login webe...'
 'journey.entry web.entry webevent.login webe...'
 'journey.entry web.entry webevent.login webe...'
 'journey.entry web.entry webevent.view']


In [30]:
docs = paths_to_docs(paths)
print docs

[['journey.entry', 'web.entry', 'webevent.login', 'webe...'], ['journey.entry', 'mobile.entry', 'mobileevent.login...'], ['journey.entry', 'web.entry', 'webevent.login', 'webe...'], ['journey.entry', 'reward', 'web.entry', 'webevent.log...'], ['journey.entry', 'web.entry', 'webevent.view'], ['journey.entry', 'web.entry', 'webevent.login', 'webe...'], ['journey.entry', 'web.entry', 'webevent.login', 'webe...'], ['journey.entry', 'web.entry', 'webevent.login', 'webe...'], ['journey.entry', 'web.entry', 'webevent.view'], ['journey.entry', 'web.entry', 'webevent.login', 'webe...'], ['journey.entry', 'web.entry', 'webevent.login', 'webe...'], ['journey.entry', 'web.entry', 'webevent.login', 'webe...'], ['journey.entry', 'web.entry', 'webstc.already'], ['journey.entry', 'web.entry', 'webevent.login', 'webe...'], ['journey.entry', 'agent.entry', 'agent.exit', 'agent....'], ['journey.entry', 'web.entry', 'webevent.login', 'webe...'], ['journey.entry', 'mobile.entry', 'mobileevent.login...'], [

In [33]:
combined = doc_combine(docs)
print combined

[['journey.entryweb.entry web.entrywebevent.login', 'web.entrywebevent.login webevent.loginwebe...', 'webevent.loginwebe... webe...', 'webe...'], ['journey.entrymobile.entry mobile.entrymobileevent.login...', 'mobile.entrymobileevent.login... mobileevent.login...', 'mobileevent.login...'], ['journey.entryweb.entry web.entrywebevent.login', 'web.entrywebevent.login webevent.loginwebe...', 'webevent.loginwebe... webe...', 'webe...'], ['journey.entryreward rewardweb.entry', 'rewardweb.entry web.entrywebevent.log...', 'web.entrywebevent.log... webevent.log...', 'webevent.log...'], ['journey.entryweb.entry web.entrywebevent.view', 'web.entrywebevent.view webevent.view', 'webevent.view'], ['journey.entryweb.entry web.entrywebevent.login', 'web.entrywebevent.login webevent.loginwebe...', 'webevent.loginwebe... webe...', 'webe...'], ['journey.entryweb.entry web.entrywebevent.login', 'web.entrywebevent.login webevent.loginwebe...', 'webevent.loginwebe... webe...', 'webe...'], ['journey.entryweb

webevent.loginwebe... webe...
