### Imports

In [1]:
import collections
import cPickle as pickle
import itertools
import gc
import json
from matplotlib import pyplot as plt
import numpy as np
import os
import pixie
import random
import snap
import time
import tools

### Preprocess

Pixie does not work well unless the random walk is biased. Biasing the walk requires weighted edges, but the graph is simply too large to fit into RAM if we incorporate all the features.

Instead, we will prune out edges with weight 1. Experiments show that this costs very little in terms of metrics, and we hope to improve results by incorporating more features into the weights.

#### Ids to Index

We start by generating two lists to map ids to indexes, and vice versa.

In [2]:
# Scan the input graph once and collect every user id and subreddit id that
# appears on at least one edge with a positive comment count.
users = set()
subreddits = set()
with open("../bigData/finalGeneration/inputGraph", 'r') as graphFile:
    for lineNumber, edgeLine in enumerate(graphFile, 1):
        # Each line must split into exactly three whitespace-separated fields.
        userId, subredditId, commentCountStr = edgeLine.split()
        if int(commentCountStr) > 0:
            users.add(userId)
            subreddits.add(subredditId)

        # Progress report every million lines read.
        if lineNumber % 1000000 == 0:
            print("Processed {}".format(lineNumber))
print("Users: {}, Subreddits: {}".format(len(users), len(subreddits)))

Processed 1000000
Processed 2000000
Processed 3000000
Processed 4000000
Processed 5000000
Processed 6000000
Processed 7000000
Processed 8000000
Processed 9000000
Processed 10000000
Processed 11000000
Processed 12000000
Processed 13000000
Processed 14000000
Processed 15000000
Processed 16000000
Users: 6283057, Subreddits: 152976


In [3]:
# Persist both id sets, one id per line (user ids first, then subreddit ids).
# Presumably the line position becomes the id's index when the files are read
# back -- confirm against tools.loadIndexToUserId.
for mappingPath, ids in (("../bigData/pixie/indexToUserId", users),
                         ("../bigData/pixie/indexToSubredditId", subreddits)):
    with open(mappingPath, 'w') as outfile:
        outfile.writelines(entry + "\n" for entry in ids)

#### Features

Here, we generate a list of lists for each feature, numbered as follows:
1. The index itself
2. The comment count

In [3]:
# Read the two mapping files back. Each call yields a pair that is unpacked
# into an index -> id container and an id -> index lookup; the same helper is
# used for both users and subreddits.
userIdMaps = tools.loadIndexToUserId("../bigData/pixie/indexToUserId")
indexToUserId, userIdToIndex = userIdMaps
subredditIdMaps = tools.loadIndexToUserId("../bigData/pixie/indexToSubredditId")
indexToSubredditId, subredditIdToIndex = subredditIdMaps

In [4]:
# For every user index, accumulate the sum of sqrt(comment count) over that
# user's edges. The weight-building cell divides by this total to normalise
# each edge weight.
# (A per-subreddit total was tried here as well and is currently disabled.)
userIndexToTotalComments = [0] * len(indexToUserId)
with open("../bigData/finalGeneration/submissionsGraph", 'r') as graphFile:
    for edgeLine in graphFile:
        userId, subredditId, commentCountStr = edgeLine.split()
        # Skip edges unless both endpoints are present in the id mappings.
        if not (userId in userIdToIndex and subredditId in subredditIdToIndex):
            continue
        rootCount = np.sqrt(float(commentCountStr))

        userIndex = userIdToIndex[userId]
        # Only the disabled per-subreddit total would consume this index.
        subredditIndex = subredditIdToIndex[subredditId]
        userIndexToTotalComments[userIndex] = userIndexToTotalComments[userIndex] + rootCount

print("Done")

Done


In [5]:
# Build the adjacency lists used by the random walk.
#   userIndexToSubreddits[u]: list of (weight, subredditIndex) pairs, where
#       weight = sqrt(commentCount) / (sum of sqrt(commentCount) over u's edges),
#       sorted from the heaviest edge to the lightest.
#   subredditIndexToUsers[s]: list of bare user indexes (no weights).
# xrange is used so Python 2 does not materialise a multi-million element
# index list merely to size each structure.
userIndexToSubreddits = [[] for i in xrange(len(indexToUserId))]
subredditIndexToUsers = [[] for i in xrange(len(indexToSubredditId))]
with open("../bigData/finalGeneration/submissionsGraph", 'r') as f:
    for i, line in enumerate(f, 1):
        userId, subredditId, commentCountStr = line.split()
        # Edges touching an id outside the mappings are ignored.
        if userId not in userIdToIndex or subredditId not in subredditIdToIndex:
            continue
        commentCount = float(commentCountStr)

        userIndex = userIdToIndex[userId]
        subredditIndex = subredditIdToIndex[subredditId]
        userTotalComments = float(userIndexToTotalComments[userIndex])

        # Weight first, index second, so a plain tuple sort orders by weight.
        userIndexToSubreddits[userIndex].append((np.sqrt(commentCount) / userTotalComments, subredditIndex))
        subredditIndexToUsers[subredditIndex].append(userIndex)
        # Progress is only reported for lines that were not skipped above.
        if i % 10000000 == 0:
            print("Processed {}".format(i))
print("Users: {}, Subreddits: {}".format(len(userIndexToSubreddits), len(subredditIndexToUsers)))

# Sort each user's neighbours by descending weight.
# The loop variable is deliberately NOT called `subreddits`: that name holds
# the set of subreddit ids built by the first preprocessing cell, and
# rebinding it here would break a later re-run of the cell that saves the
# id mappings.
for weightedNeighbors in userIndexToSubreddits:
    weightedNeighbors.sort(reverse=True)

# Disabled experiment, kept for reference: weighted (commentCount, userIndex)
# entries on the subreddit side, truncated to the top 100 users per subreddit
# and normalised by the subreddit's total.
#
# for subredditIndex, users in enumerate(subredditIndexToUsers):
#     users.sort(reverse=True)
#     users[:] = users[:100]
#     subredditTotalComments = float(reduce(lambda total, nextPair: total + nextPair[0], users, 0))
#     for i in range(len(users)):
#         users[i] = (users[i][0] / subredditTotalComments, users[i][1])
print("Done Sorting")

Processed 10000000
Users: 6283057, Subreddits: 152976
Done Sorting


In [10]:
# Output: one line per user index. Every neighbour contributes a
# space-prefixed "weight subredditIndex" pair, with the weight printed to
# nine decimal places; a user without neighbours yields an empty line.
with open("../bigData/pixie/userIndexToSubreddits", 'w') as outfile:
    for weightedSubreddits in userIndexToSubreddits:
        fields = [" {0:.9f} {1}".format(weight, subredditIndex)
                  for weight, subredditIndex in weightedSubreddits]
        outfile.write("".join(fields) + "\n")
print("Done")

Done


In [15]:
# Output: one line per subreddit index listing its user neighbours.
#
# The cell that builds subredditIndexToUsers appends bare user indexes (its
# weighted variant is disabled), so unpacking every entry as a
# (weight, userIndex) pair raises TypeError on a fresh run. Both entry shapes
# are accepted here:
#   (weight, userIndex) tuple -> " <weight to 9 decimals> <userIndex>"
#   bare userIndex            -> " <userIndex>"
# NOTE(review): confirm that whatever reads this file accepts the unweighted
# form; no reader is visible in this notebook.
with open("../bigData/pixie/subredditIndexToUsers", 'w') as outfile:
    # The loop variable is not called `users`, so the set of user ids built by
    # the first preprocessing cell is left intact.
    for neighbors in subredditIndexToUsers:
        fields = []
        for entry in neighbors:
            if isinstance(entry, tuple):
                weight, userIndex = entry
                fields.append(" {0:.9f} {1}".format(weight, userIndex))
            else:
                fields.append(" {0}".format(entry))
        outfile.write("".join(fields) + "\n")
print("Done")

Done


#### Convert to weighted input graph

It's too slow if we have to walk through the entire list during the random walk, so we convert to a more efficient representation which will take more RAM.

### Pixie

In [6]:
# Load every lookup the evaluation needs, announcing each step before it runs.
expUsersPath = "../bigData/finalGeneration/expUsers"

print("Loading user id indexes")
userIdMaps = tools.loadIndexToUserId("../bigData/pixie/indexToUserId")
indexToUserId, userIdToIndex = userIdMaps

print("Loading subreddit id indexes")
subredditIdMaps = tools.loadIndexToUserId("../bigData/pixie/indexToSubredditId")
indexToSubredditId, subredditIdToIndex = subredditIdMaps

# The same experiment-users file is read twice, once per subreddit type.
print("Loading user id to old subreddits")
userIdToOldSubreddits = tools.getUserIdToSubredditsByType(expUsersPath, "oldSubreddits")

print("Loading user id to new subreddits")
userIdToNewSubreddits = tools.getUserIdToSubredditsByType(expUsersPath, "newSubreddits")

print("Loading subreddit id to name")
subredditIdToName = tools.read_subreddit_names("../bigData/subredditIdToName")

Loading user id indexes
Loading subreddit id indexes
Loading user id to old subreddits
Loading user id to new subreddits
Loading subreddit id to name


In [None]:
# Load the pickled adjacency lists used by the random walk.
#
# SECURITY: unpickling can execute arbitrary code. Only load files that this
# project produced itself.
# NOTE(review): no cell visible in this notebook writes these .pkl files (the
# preprocessing cells write plain-text files) -- confirm where they come from.
#
# Files are opened in binary mode: pickle data is a byte stream, and text mode
# can corrupt binary pickle protocols on platforms that translate newlines.
print("Loading user index to subreddits")
# Rebind to None first so any previously built copy is no longer referenced
# while the new one is being loaded.
userIndexToSubreddits = None
with open("../bigData/pixie/userIndexToSubreddits.pkl", 'rb') as infile:
    userIndexToSubreddits = pickle.load(infile)
print("Loading subreddit index to user")
subredditIndexToUsers = None
with open("../bigData/pixie/subredditIndexToUsers.pkl", 'rb') as infile:
    subredditIndexToUsers = pickle.load(infile)
print("Done")

In [17]:
# Pick up the latest edits to the pixie module and fix both random seeds.
reload(pixie)
random.seed(7224)
np.random.seed(7224)

# Parameters forwarded to pixie.getRecs.
N = 100000
alpha = 0.5

# Running sums of the per-user metrics.
precision10 = 0
mrrMetric = 0
mapMetric = 0
# NOTE(review): numUsers is never incremented. The averages printed at the end
# divide by every experiment user, including those skipped below because they
# have no index -- confirm that this is the intended denominator.
numUsers = 0
for position, (userId, unusedOldSubreddits) in enumerate(userIdToOldSubreddits.iteritems()):
    # Users without an index cannot be looked up in the adjacency lists.
    if userId not in userIdToIndex:
        continue

    print("Processing {} {}".format(position, userId))
    # Query for the walk: subreddit id -> weight, taken from the user's
    # weighted adjacency list rather than from the experiment file.
    seedWeights = {}
    for weight, subredditIndex in userIndexToSubreddits[userIdToIndex[userId]]:
        seedWeights[indexToSubredditId[subredditIndex]] = weight
    recs = pixie.getRecs(seedWeights,
                         subredditIdToIndex,
                         indexToSubredditId,
                         userIndexToSubreddits,
                         subredditIndexToUsers,
                         N, None, alpha)
    # Ground truth that recommendations are scored against.
    heldOut = userIdToNewSubreddits[userId]

    # Precision at 10: share of the first ten recommendations that are hits.
    hitsInTop10 = sum(1 for _, rec in recs[:10] if rec in heldOut)
    precision10 += hitsInTop10 / 10.

    # Mean Reciprocal Rank: reciprocal of the 1-based rank of the first hit.
    firstHitRank = next(
        (rank for rank, (_, rec) in enumerate(recs, 1) if rec in heldOut),
        None)
    if firstHitRank is not None:
        mrrMetric += 1. / firstHitRank

    # Mean Average Precision: at every hit, add precision-at-rank divided by
    # the number of held-out subreddits.
    hitsSoFar = 0
    numActual = float(len(heldOut))
    for rank, (_, rec) in enumerate(recs, 1):
        if rec in heldOut:
            hitsSoFar += 1
            mapMetric += hitsSoFar / numActual / rank

totalUsers = float(len(userIdToOldSubreddits))
print("N: {}, alpha: {}, precision@10: {}, mrr: {}, map: {}".format(
    N,
    alpha,
    precision10 / totalUsers,
    mrrMetric / totalUsers,
    mapMetric / totalUsers))

Processing 0 elizle
Processing 1 spikespaz
Processing 2 Woodpecker16669
Processing 3 Dumpstertrash1
Processing 4 Avalollk
Processing 6 Zhangathan_Jon
Processing 7 mooniesoloonie
Processing 8 TorreTiger25
Processing 10 thomasmagnum
Processing 13 Deridex3101
Processing 14 absolince
Processing 15 farkhipov
Processing 16 UnvaccinatedAutist
Processing 17 pandillasexo
Processing 18 show_me_the
Processing 19 elbowe21
Processing 20 BigAbbott
Processing 21 AssasinButt
Processing 24 Scissor_Runner12
Processing 25 luck_panda
Processing 26 coheedcollapse
Processing 27 thomasd4nkengine
Processing 28 BelievingEal21
Processing 31 Brunsy89
Processing 32 BrownMan97
Processing 33 art_hoe1
Processing 35 harpuajim25
Processing 36 Pro_phet
Processing 37 iamliterallysatan
Processing 38 GTL5427
Processing 39 FocusedADD
Processing 40 habibiiiiiii
Processing 41 Benjii117
Processing 42 ZombieJesus1987
Processing 44 msammy07
Processing 45 Vag_Assasin
Processing 46 Sockoram
Processing 47 nerveless
Processing 48 h

In [9]:
# Spot check: show the weighted subreddit list and the user id stored at index 1.
print(userIndexToSubreddits[1])
print(indexToUserId[1])

[(0.5714285714285714, 12707), (0.2857142857142857, 29179), (0.14285714285714285, 53376)]
truelose
