In [1]:
# import dask bag
import dask.dataframe as dd
import numpy as np
import pandas as pd
    

Data loading:

In [2]:
# posts
posts = dd.read_csv("PostsFrame-0.csv.tar.xz",compression="xz",blocksize=None)

# remove unnecessary dataframe columns
posts = posts.drop(['CreationDate','LastActivityDate'], axis=1)

# smaller frame for testing purposes
#test_posts = posts.sample(frac=0.01) # comment to preserve some memory


In [3]:
# users
#users = dd.read_csv("UsersFrame-0.csv.tar.xz",compression="xz",blocksize=None)

# remove unnecessary dataframe columns
#users = users.drop(['DisplayName','CreationDate','LastAccessDate'], axis=1)


In [4]:
# duplicates
#duplicate = dd.read_csv("DuplicateFrame-0.csv.tar.xz",compression="xz",blocksize=None)

# remove unnecessary dataframe columns
#duplicate = duplicate.drop(['CreationDate','LinkTypeId'], axis=1)

In [5]:
# tags
tags = dd.read_csv("TagsFrame-0.csv.tar.xz",compression="xz",blocksize=None)

In [6]:
# bounty
bounty = dd.read_csv("BountyFrame-0.csv.tar.xz",compression="xz",blocksize=None)

# remove unnecessary dataframe columns
bounty = bounty.drop(['VoteTypeId','CreationDate', 'Id'], axis=1)

# add a column to the bounty frame 
# we join this with the posts later
bounty = bounty.assign(hasBounty = True)

In [7]:
# badges
#badgesAggregate = dd.read_csv("BadgesAggregatedFrame-0.csv.tar.xz",compression="xz",blocksize=None)

In [8]:
# join users with badges count
#usersWithBadges = users.set_index('Id').join(badgesAggregate.set_index('UserId'))

Calculations:

In [9]:
# join posts with bounty info(result is an extra column with True and NaN values, fix this later)
posts = posts.set_index('Id').join(bounty.set_index('PostId'))

In [10]:
# handle bounties
# this fixes all the NaN values in the bounty column
# puts them to False
def fixBounty(row):
    
    bountyVal = row.hasBounty
    
    # this checks if the value of hasBounty
    # for the current row is NaN
    if(bountyVal != bountyVal):
        return(False)
    
    return bountyVal
        
posts['bounty'] = posts.apply(fixBounty, axis=1)

# remove joined column, we will use the new one with False/True instead(cleaned from NaN values)
posts = posts.drop('hasBounty', axis=1)

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result


In [11]:
# tags dictionary with tagname and tagcount
# we create a dictionary of all the tags and their count

tagDict = dict()
for row in tags.itertuples():
    tagName = getattr(row,'TagName')
    tagCount = getattr(row,'Count')
    
    tagDict[tagName]=tagCount

Done


In [12]:
# tags handling
# accumulating tags value for each list of tags for each row
    
def calculateTagsAggregate(row):
    tags = row.Tags
    
    # split the tag string by the closing tag
    tagsList = tags.split('>')
    
    # remove the last empty element
    # which the prev function creates
    tagsList = tagsList[:-1]
    
    finList = list()
    # clear the opening tag from each element
    finList = [item[1:] for item in tagsList]
    
    # start counting with the use of the dict
    tag_sum = 0
    for tag in finList:
        tagVal = tagDict.get(tag)
        
        if(tagVal == None):
            tagVal = 0
        
        tag_sum += tagVal
        
    return(tag_sum)
    
    
posts['tagsAggregate'] = posts.apply(calculateTagsAggregate, axis=1)

# drop tags column from frame since no longer necessary
posts = posts.drop('Tags', axis=1)

print('Done')

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result


Done


In [15]:
posts.head(n=10, npartitions=2)


Unnamed: 0,PostTypeId,Score,ViewCount,WordCount,LinksCount,BlocksCount,TitleWordsCount,AnswerCount,CommentCount,AcceptedAnswerId,bounty,tagsAggregate
6,1,261,16799,91,1,3,12,6,0,31,False,1434581
9,1,1743,500536,15,0,1,8,63,7,1404,True,1587941
11,1,1382,139190,21,0,1,5,35,3,1248,False,1333380
13,1,552,160858,32,0,1,4,24,8,0,False,820908
14,1,381,111934,9,2,2,5,10,4,0,True,308305
16,1,114,79712,87,0,7,11,6,0,12446,False,1401702
17,1,168,67267,9,1,0,4,9,3,26,True,694363
17,1,168,67267,9,1,0,4,9,3,26,True,694363
19,1,288,43495,167,0,13,11,23,16,531,False,216694
24,1,147,64775,21,0,2,7,6,0,49,False,707280


Done
