In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import multiprocessing as mp
import time
import json

# My implementation of a Hidden Markov Model
from HiddenMarkovModel import HiddenMarkovModel

In [2]:
names = [ "r4.1/r41_features_simple.h5", "r4.1/r41_features_complex.h5", "r4.2/r42_features_simple.h5", "r4.2/r42_features_complex.h5" ]
saves = [ "userScores_simple_r41_80_inertia.json", "userScores_complex_r41_80_inertia.json", "userScores_simple_r42_80_inertia.json", "userScores_complex_r42_80_inertia.json"]
assert(len(names) == len(saves))
for name,save in zip(names,saves):
    
    filename = "/home/tabz/Documents/CMU_Dataset/" + name
    
    print("Loading:", filename)
    joint = pd.read_hdf(filename, "table")
    users = np.unique(joint.index.values)
    print("There are", users.size, "users")
    num_features = np.unique(joint["feature"].values).size
    print("Using", num_features, "features")
    
    -
    
    def compute_probs(user_df):
        dayGrouping = pd.Grouper(key="date", freq="1D")
        weekGrouping = pd.Grouper(key="date", freq="1W")
        timeGrouping = user_df.groupby(weekGrouping)

        # print("Starting on user: ", user)
        s,t,e = init_matrices()
        model = HiddenMarkovModel.HMM(num_states, t,e,s)

        trainingPeriod = 4
        timesTrained = 0

        logProbScores = []

        for name, group in timeGrouping:

            #The sequence for the time grouping we are considering
            seq = group["feature"].values

            if len(seq) < 1:
                # If there is no activity for this week 
                logProbScores.append(0)
                continue

            if timesTrained > trainingPeriod:
                logProb = model.sequence_log_probability(seq)
                logProbScores.append(-logProb)

                #Train the model on the sequence we have just seen
            model.learn(seq, max_iters=20, threshold=0.01, restart_threshold=0.1,max_restarts=5, inertia=0.8)
            timesTrained+=1

        return logProbScores

    userScores = {}

    def setInDict(u):
        def z(r):
            userScores[u] = r
        return z

    pool = mp.Pool(processes=8)
    print("Queueing up jobs", flush=True)
    for user in tqdm(users):
        pool.apply_async(compute_probs, args=(joint.loc[ joint.index == user ],), callback=setInDict(user))
    #     compute_probs(joint.loc[user])
    print("Progress on those jobs:", flush=True)    
    done = 0
    for i in tqdm(users):
        while done >= len(userScores):
            time.sleep(3)
        done += 1
    pool.close()
    pool.join()
    
    print("Saving:", save)
    json.dump(userScores, open(save, "w"))

Loading: /home/tabz/Documents/CMU_Dataset/r4.1/r41_features_simple.h5
There are 1000 users
Using 7 features
Queueing up jobs


100%|██████████| 1000/1000 [22:34<00:00,  1.52s/it]

Progress on those jobs:



100%|██████████| 1000/1000 [1:19:10<00:00,  4.55s/it]


Saving: userScores_simple_r41_80_inertia.json
Loading: /home/tabz/Documents/CMU_Dataset/r4.1/r41_features_complex.h5
There are 1000 users
Using 16 features
Queueing up jobs


100%|██████████| 1000/1000 [22:36<00:00,  1.25s/it]

Progress on those jobs:



100%|██████████| 1000/1000 [1:29:02<00:00,  5.34s/it]


Saving: userScores_complex_r41_80_inertia.json
Loading: /home/tabz/Documents/CMU_Dataset/r4.2/r42_features_simple.h5
There are 1000 users
Using 7 features
Queueing up jobs


100%|██████████| 1000/1000 [21:44<00:00,  1.26s/it]

Progress on those jobs:



100%|██████████| 1000/1000 [1:15:43<00:00,  3.55s/it]


Saving: userScores_simple_r42_80_inertia.json
Loading: /home/tabz/Documents/CMU_Dataset/r4.2/r42_features_complex.h5
There are 1000 users
Using 16 features
Queueing up jobs


100%|██████████| 1000/1000 [21:42<00:00,  1.33s/it]

Progress on those jobs:



100%|██████████| 1000/1000 [1:27:29<00:00,  3.66s/it]


Saving: userScores_complex_r42_80_inertia.json
