Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
branch: master
Fetching contributors…

Cannot retrieve contributors at this time

file 118 lines (104 sloc) 3.577 kb
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118
import copy
import json
import os
import re
from functools import reduce

try:
    from HTMLParser import HTMLParser  # Python 2 module name
except ImportError:
    from html.parser import HTMLParser  # Python 3 module name

def stripPage(page):
    """Extract the comments found in one saved comment-thread page.

    The HTML text is scanned with a regular expression that captures, for
    every comment, the width of its indentation spacer, the user id, the
    permalink and the comment markup.

    page -- the full HTML text of the page.

    Returns a list, in page order, of ``(depth, name, info)`` tuples where
    ``depth`` is the spacer width integer-divided by 40 (presumably 40 units
    of width per nesting level -- confirm against the page markup), ``name``
    is the user id and ``info`` is a dict with the keys 'link', 'comment'
    and 'user'.  An empty list is returned when nothing matches.
    """
    # Raw strings keep the regex escapes (\d, \w, \|, \n) out of reach of
    # Python's own string-escape processing.
    regExp = (r'<table border=0><tr>.+ height=1 width=(\d+)></td>'
              r'.+<a href="user.id=(\w+).+ago \| <a href="(.+)">link.+\n'
              r'<span class="comment"><font color=#[0-9a-zA-Z]+>(.+)</font></span>')
    levels = []
    for strNum, name, link, comment in re.findall(regExp, page):
        info = {'link': link, 'comment': comment, 'user': name}
        # '//' is integer division on both Python 2 and Python 3.
        levels.append((int(strNum) // 40, name, info))
    return levels

def levelToTree(level, base):
    """Build a single tree node from a run of (depth, name, info) tuples.

    level -- non-empty sequence; its first entry is the node itself and the
             remaining entries are its descendants.
    base  -- depth of the node; its children are grouped at ``base + 1``.

    Returns a dict with the keys 'names' (one-element list holding the
    entry's name), 'comments' (one-element list holding the entry's info)
    and 'children' (a list of child nodes, or None when there are none).
    """
    head = level[0]
    descendants = level[1:]
    node = {}
    node['names'] = [head[1]]
    node['comments'] = [head[2]]
    node['children'] = levelsToTree(descendants, base + 1)
    return node

def levelsToTree(levels, base):
    """Split a flat sequence of (depth, name, info) tuples into sibling trees.

    The first entry always opens the first tree; after that, every entry
    whose depth equals ``base`` opens a new tree and every other entry is
    attached to the tree currently being collected.

    Returns a list of nodes as built by levelToTree, or None when ``levels``
    is empty.
    """
    if len(levels) == 0:
        return None
    # Collect the runs first, then turn each run into a node.
    groups = [[levels[0]]]
    for entry in levels[1:]:
        if entry[0] == base:
            groups.append([entry])
        else:
            groups[-1].append(entry)
    return [levelToTree(group, base) for group in groups]

def combineTrees(oldTreeA, oldTreeB):
    """Overlay two comment trees position by position into a new tree.

    oldTreeA, oldTreeB -- nodes shaped like {'names': [...], 'comments':
    [...], 'children': list-or-None}, or None.  Neither argument is
    modified: both are deep-copied before any merging happens.

    Returns:
      * the copy of the other tree when one argument is None;
      * when one node has no children (None), the copy of the other node
        with the childless node's names and comments appended to it;
      * otherwise a new node whose 'names' and 'comments' are A's followed
        by B's, and whose i-th child is the overlay of the i-th children of
        both nodes (the shorter child list is padded with None).
    """
    treeA = copy.deepcopy(oldTreeA)
    treeB = copy.deepcopy(oldTreeB)
    if treeA is None:
        return treeB
    if treeB is None:
        return treeA
    if treeA['children'] is None:
        treeB['names'].extend(treeA['names'])
        treeB['comments'].extend(treeA['comments'])
        return treeB
    if treeB['children'] is None:
        treeA['names'].extend(treeB['names'])
        treeA['comments'].extend(treeB['comments'])
        return treeA

    childrenA = treeA['children']
    childrenB = treeB['children']
    # Pad the shorter list with None so children pair up by position; an
    # unmatched child is then returned unchanged by the None branches above.
    width = max(len(childrenA), len(childrenB))
    childrenA = childrenA + [None] * (width - len(childrenA))
    childrenB = childrenB + [None] * (width - len(childrenB))

    # A real list (not a lazy map object) so the result can be indexed,
    # compared and serialized to JSON on Python 3 as well as Python 2.
    children = [combineTrees(childA, childB)
                for childA, childB in zip(childrenA, childrenB)]
    return {'names': treeA['names'] + treeB['names'],
            'children': children,
            'comments': treeA['comments'] + treeB['comments']}

def mode(list):
    """Return the most frequent item of ``list``, or None when it is empty.

    Items must be hashable.  On a tie, the winner is the tied item that
    comes first in the iteration order of the internal counting dict.
    """
    tally = {}
    for entry in list:
        tally[entry] = tally.get(entry, 0) + 1
    if not tally:
        return None
    # max() keeps the first maximal pair it meets, matching a scan that
    # only replaces the leader on a strictly greater count.
    winner, _count = max(tally.items(), key=lambda pair: pair[1])
    return winner
        
def averageTree(tree):
    """Summarize an overlaid tree in place, node by node.

    For ``tree`` and every node below it, 'number' is set to the count of
    names collected at that node and 'names' is replaced by the single most
    frequent name (as chosen by mode()).  Nothing is returned.
    """
    tree['number'] = len(tree['names'])
    tree['names'] = mode(tree['names'])
    # An explicit loop instead of map(): map() is lazy on Python 3, so using
    # it only for its side effects would silently skip every child.
    for child in tree['children'] or ():
        averageTree(child)

# User ids to process; process() reads the saved pages of each one from the
# directory rawpages/<user>.
users = [
    "edw519",
    "patio11",
    "pg",
    "tptacek",
    "cwan",
]

def process():
    """Build one summarized overlay tree per entry of ``users``.

    For every user, each file in ``<cwd>/rawpages/<user>`` is read, parsed
    with stripPage() and turned into comment trees with levelsToTree().
    The first 50 trees are overlaid with combineTrees() and the result is
    summarized in place with averageTree().  The user name and every file
    name are printed as progress output.

    Returns a dict mapping each user to its overlay tree, or to None when
    no comment tree was found for that user.
    """
    allTogether = {}
    for user in users:
        print(user)
        userDir = os.path.join(os.getcwd(), "rawpages", user)
        trees = []
        # os.listdir() returns names in arbitrary order; sorting makes the
        # choice of the first 50 trees reproducible between runs.
        for fileName in sorted(os.listdir(userDir)):
            print(fileName)
            with open(os.path.join(userDir, fileName)) as pageFile:
                page = pageFile.read()
            # levelsToTree() returns None for a page without comments.
            trees.extend(levelsToTree(stripPage(page), 0) or [])
        onlySoMany = trees[0:50]
        # Seeding with None is neutral (combineTrees(None, t) is a copy of
        # t) and avoids the TypeError reduce() raises on an empty list.
        overlayTree = reduce(combineTrees, onlySoMany, None)
        if overlayTree is not None:
            averageTree(overlayTree)
        allTogether[user] = overlayTree
    return allTogether

# Script body: runs on import/execution, exactly as before.
m = process()

# The with-block closes users.json even if serialization or the write fails.
with open("users.json", "w") as temp:
    temp.write(json.dumps(m))
print("Done")
      
Something went wrong with that request. Please try again.