### Imports

In [1]:
import collections
import cPickle as pickle
import itertools
import gc
import json
from matplotlib import pyplot as plt
import numpy as np
import os
import pixie
import random
import snap
import time
import tools

### Preprocess

Prune users > 200 old subreddits

In [2]:
# Load inputGraph, include weights.
userIdToOldSubreddits = collections.defaultdict(lambda: collections.defaultdict(int))
tools.getUserIdToSubreddits("../bigData/finalGeneration/inputGraph", userIdToOldSubreddits, includeCounts=True)
print "Number of users: {}".format(len(userIdToOldSubreddits)) 

Processing ../bigData/finalGeneration/inputGraph
Processed 1000000
Processed 2000000
Processed 3000000
Processed 4000000
Processed 5000000
Processed 6000000
Processed 7000000
Processed 8000000
Processed 9000000
Processed 10000000
Processed 11000000
Processed 12000000
Processed 13000000
Processed 14000000
Processed 15000000
Processed 16000000
Processed 17000000
Processed 18000000
Processed 19000000
Processed 20000000
Processed 21000000
Processed 22000000
Processed 23000000
Processed 24000000
Processed 25000000
Processed 26000000
Processed 27000000
Processed 28000000
Processed 29000000
Processed 30000000
Processed 31000000
Processed 32000000
Processed 33000000
Processed 34000000
Processed 35000000
Processed 36000000
Processed 37000000
Processed 38000000
Processed 39000000
Processed 40000000
Processed 41000000
Processed 42000000
Processed 43000000
Processed 44000000
Processed 45000000
Processed 46000000
Processed 47000000
Processed 48000000
Processed 49000000
Processed 50000000
Processed 

In [3]:
users = set()
subreddits = set()
for userId, oldSubreddits in userIdToOldSubreddits.iteritems():
    users.add(userId)
    for subredditId in oldSubreddits:
        subreddits.add(subredditId)

In [4]:
users = set()
subreddits = set()
for userId, oldSubreddits in userIdToOldSubreddits.iteritems():
    if len(oldSubreddits) <= 200:
        users.add(userId)
        for subredditId in oldSubreddits:
            subreddits.add(subredditId)

In [4]:
# Save the mappings.
with open("../bigData/pixie/indexToUserId", 'w') as outfile:
    for userId in users:
        outfile.write(userId + "\n")
with open("../bigData/pixie/indexToSubredditId", 'w') as outfile:
    for subredditId in subreddits:
        outfile.write(subredditId + "\n")

In [2]:
# Load the two mappings.
indexToUserId, userIdToIndex = tools.loadIndexToUserId("../bigData/pixie/indexToUserId")
indexToSubredditId, subredditIdToIndex = tools.loadIndexToUserId("../bigData/pixie/indexToSubredditId")

# Throw into lists.
userIndexToSubreddits = [[] for i in range(len(indexToUserId))]
subredditIndexToUsers = [[] for i in range(len(indexToSubredditId))]
with open("../bigData/finalGeneration/inputGraph", 'r') as f:
    for i, line in enumerate(f, 1):
        userId, subredditId, commentCountStr = line.split()
        if userId not in userIdToIndex or subredditId not in subredditIdToIndex:
            continue
        
        userIndex = userIdToIndex[userId]
        subredditIndex = subredditIdToIndex[subredditId]
        
        userIndexToSubreddits[userIndex].append(subredditIndex)
        subredditIndexToUsers[subredditIndex].append(userIndex)
        if i % 1000000 == 0:
            print "Processed {}".format(i)
print "Users: {}, Subreddits: {}".format(len(userIndexToSubreddits), len(subredditIndexToUsers))

Processed 1000000
Processed 2000000
Processed 3000000
Processed 4000000
Processed 5000000
Processed 6000000
Processed 7000000
Processed 8000000
Processed 9000000
Processed 10000000
Processed 11000000
Processed 12000000
Processed 13000000
Processed 14000000
Processed 15000000
Processed 16000000
Processed 17000000
Processed 18000000
Processed 19000000
Processed 20000000
Processed 21000000
Processed 22000000
Processed 23000000
Processed 24000000
Processed 25000000
Processed 26000000
Processed 27000000
Processed 28000000
Processed 29000000
Processed 30000000
Processed 31000000
Processed 32000000
Processed 33000000
Processed 34000000
Processed 35000000
Processed 36000000
Processed 37000000
Processed 38000000
Processed 39000000
Processed 40000000
Processed 41000000
Processed 42000000
Processed 43000000
Processed 44000000
Processed 45000000
Processed 46000000
Processed 47000000
Processed 48000000
Processed 49000000
Processed 50000000
Processed 51000000
Processed 52000000
Users: 4052716, Subre

In [9]:
# Output.
with open("../bigData/pixie/userIndexToSubreddits.pkl", 'w') as outfile:
    pickle.dump(userIndexToSubreddits, outfile)
with open("../bigData/pixie/subredditIndexToUsers.pkl", 'w') as outfile:
    pickle.dump(subredditIndexToUsers, outfile)
print "Done"

Done


### Model

In [8]:
# Load everything.
print "Loading user id indexes"
indexToUserId, userIdToIndex = tools.loadIndexToUserId("../bigData/pixie/indexToUserId")
print "Loading subreddit id indexes"
indexToSubredditId, subredditIdToIndex = tools.loadIndexToUserId("../bigData/pixie/indexToSubredditId")
print "Loading user id to old subreddits"
userIdToOldSubreddits = tools.getUserIdToSubredditsByType("../bigData/finalGeneration/devUsers", "oldSubreddits")
print "Loading user id to new subreddits"
userIdToNewSubreddits = tools.getUserIdToSubredditsByType("../bigData/finalGeneration/devUsers", "newSubreddits")
print "Loading subreddit id to name"
subredditIdToName = tools.read_subreddit_names("../bigData/subredditIdToName")

Loading user id indexes
Loading subreddit id indexes
Loading user id to old subreddits
Loading user id to new subreddits
Loading subreddit id to name


In [3]:
print "Loading user index to subreddits"
userIndexToSubreddits = None
with open("../bigData/pixie/userIndexToSubreddits.pkl", 'r') as infile:
    userIndexToSubreddits = pickle.load(infile)
print "Loading subreddit index to user"
subredditIndexToUsers = None
with open("../bigData/pixie/subredditIndexToUsers.pkl", 'r') as infile:
    subredditIndexToUsers = pickle.load(infile)
print "Done"

Loading user index to subreddits
Loading subreddit index to user
Done


In [9]:
# Reload module and set seeds.
reload(pixie)
random.seed(7224)
np.random.seed(7224)

# Parameters.
N = 100000
alpha = 0.5

# Get recs.
precision10 = 0
mrrMetric = 0
mapMetric = 0
numUsers = 0
totalTime = 0
for i, (userId, oldSubreddits) in enumerate(userIdToOldSubreddits.iteritems()):    
    # print "Processing {} {}".format(i, userId)
    start = time.time()
    recs = pixie.getRecs(oldSubreddits,
                         subredditIdToIndex,
                         indexToSubredditId,
                         userIndexToSubreddits,
                         subredditIndexToUsers,
                         N, None, alpha)
    totalTime += time.time() - start
    
    # Precision at 10.
    goodRec10Count = 0
    for _, rec in recs[:10]:
        if rec in userIdToNewSubreddits[userId]:
            goodRec10Count += 1
    precision10 += goodRec10Count / 10.
    
    # Mean Reciprocal Rank.
    for rank, (_, rec) in enumerate(recs, 1):
        if rec in userIdToNewSubreddits[userId]:
            mrrMetric += 1. / rank
            break
    
    # Mean Average Precision.
    correctSoFar = 0
    numActual = float(len(userIdToNewSubreddits[userId]))
    for rank, (_, rec) in enumerate(recs, 1):
        if rec in userIdToNewSubreddits[userId]:
            correctSoFar += 1
            mapMetric += correctSoFar / numActual / rank
    
    # Print out.
    print "Processing {} {} {}".format(i, userId, goodRec10Count)
    print "--------------collab------------------"
    line = ""
    for rec in recs[:10]:
        line += " {} ".format(subredditIdToName[rec[1]])
    print line
    print "--------------------------------------"
    print ""


totalUsers = float(len(userIdToOldSubreddits))
print "N: {}, alpha: {}, precision@10: {}, mrr: {}, map: {}".format(
    N,
    alpha,
    precision10 / totalUsers,
    mrrMetric / totalUsers,
    mapMetric / totalUsers)
print "{}".format(totalTime / totalUsers)

Processing 0 plastictir2 0
--------------collab------------------
 funny  gaming  pics  todayilearned  gifs  worldnews  videos  aww  news  Showerthoughts 
--------------------------------------

Processing 1 batmansleftnut 1
--------------collab------------------
 AskReddit  gaming  gifs  worldnews  aww  mildlyinteresting  Showerthoughts  movies  interestingasfuck  OldSchoolCool 
--------------------------------------

Processing 2 kraftjerk416 0
--------------collab------------------
 gaming  todayilearned  worldnews  news  Showerthoughts  mildlyinteresting  movies  politics  WTF  OldSchoolCool 
--------------------------------------

Processing 3 zero_space 1
--------------collab------------------
 gaming  pics  worldnews  gifs  news  movies  Showerthoughts  aww  FortNiteBR  interestingasfuck 
--------------------------------------

Processing 4 Airazz 1
--------------collab------------------
 gaming  gifs  aww  politics  OldSchoolCool  BlackPeopleTwitter  television  trashy  AdviceA

Processing 39 General_Butt_Nekked 0
--------------collab------------------
 pics  news  worldnews  politics  OldSchoolCool  interestingasfuck  television  FortNiteBR  Jokes  technology 
--------------------------------------

Processing 40 NoiseGener8r 0
--------------collab------------------
 pics  todayilearned  gifs  mildlyinteresting  news  aww  worldnews  videos  movies  politics 
--------------------------------------

Processing 41 nosnevenaes 3
--------------collab------------------
 funny  gifs  mildlyinteresting  news  worldnews  videos  movies  politics  WTF  television 
--------------------------------------

Processing 42 Ur_Local_Gopnik 4
--------------collab------------------
 AskReddit  funny  pics  gaming  todayilearned  Showerthoughts  aww  gifs  worldnews  news 
--------------------------------------

Processing 43 beenhereallalong52 2
--------------collab------------------
 funny  gaming  todayilearned  gifs  worldnews  videos  news  mildlyinteresting  Showerthought

Processing 78 BlueRose85 3
--------------collab------------------
 pics  funny  todayilearned  gaming  gifs  aww  videos  mildlyinteresting  news  worldnews 
--------------------------------------

Processing 79 Kailu 0
--------------collab------------------
 AskReddit  gaming  news  aww  mildlyinteresting  interestingasfuck  OldSchoolCool  television  nottheonion  oddlysatisfying 
--------------------------------------

Processing 80 aloysiuslamb 0
--------------collab------------------
 worldnews  news  Showerthoughts  television  OldSchoolCool  oddlysatisfying  IAmA  trashy  Music  FortNiteBR 
--------------------------------------

Processing 81 wavefunction56 1
--------------collab------------------
 pics  funny  gaming  todayilearned  gifs  aww  mildlyinteresting  Showerthoughts  videos  news 
--------------------------------------

Processing 82 shittyguitar 0
--------------collab------------------
 pics  gaming  aww  news  worldnews  movies  interestingasfuck  OldSchoolCool  Bl

In [6]:
for i, (userId, oldSubreddits) in enumerate(userIdToOldSubreddits.iteritems()):
    print "Processing {} {}".format(i, userId)
    print "--------------------------------------"
    correctAnswers = userIdToNewSubreddits[userId]
    line = ""
    for answer in correctAnswers:
        line += " {} ".format(subredditIdToName[answer])
    print "--------------------------------------"
    
    print "--------------------------------------"
    print ""
print line

 confession  UpliftingNews  blunderyears  facepalm  sysadmin  HighQualityGifs  FearTheWalkingDead  skyrim  LivestreamFail  Wellthatsucks  BeAmazed  fo76  MildlyVandalised  gatekeeping  Tinder  shittyrobots  Eyebleach  me_irl  bestof  iamverysmart  Prematurecelebration  creepyPMs  IAmA  RussiaLago  DadReflexes  cursedimages  starterpacks  DesignPorn  trippinthroughtime  antiMLM  marvelstudios  hmmm  IASIP  equelMemes  politics  Whatcouldgowrong  trashpandas  whitepeoplegifs  sadcringe  niceguys  CrappyDesign  BikiniBottomTwitter  Unexpected  MapPorn  Fallout  Zoomies  Marvel  trashyboners  GlitchInTheMatrix  geek  LearnUselessTalents  mechanical_gifs  Breath_of_the_Wild  IncelTears  gifsthatkeepongiving  iamatotalpieceofshit  WhitePeopleTwitter  PS4  CringeAnarchy  assholedesign  goldenretrievers  houston  chemicalreactiongifs  reactiongifs  insanepeoplefacebook  terriblefacebookmemes 


In [8]:
mostPopular = ['t5_3i30o', 't5_3pi28', 't5_2srd1', 't5_2xykh', 't5_2xk5x', 't5_3ns8a', 't5_3dyt3', 't5_2qh1n', 't5_2qh35', 't5_3hknn']
line = ""
for answer in mostPopular:
    line += " {} ".format(subredditIdToName[answer])
    if answer in correctAnswers:
        print "1"
print line

 CanadianMOMs  calledshotpodcast  electricdaisycarnival  DestinyPC  mentalpod  shamcoin  WidowmakerMains  environment  lisp  eternium 
