In [7]:
# import dask bag
import dask.dataframe as dd
import numpy as np
import pandas as pd
    

Data loading:

In [8]:
# posts
posts = dd.read_csv("PostsFrame-0.csv.tar.xz",compression="xz",blocksize=1024000)

# remove unnecessary dataframe columns
posts = posts.drop(['CreationDate','LastActivityDate'], axis=1)

# smaller frame for testing purposes
#test_posts = posts.sample(frac=0.01) # comment to preserve some memory


Please ensure that each individual file can fit in memory and
use the keyword ``blocksize=None to remove this message``
Setting ``blocksize=None``
  "Setting ``blocksize=None``" % compression)


In [9]:
# users
#users = dd.read_csv("UsersFrame-0.csv.tar.xz",compression="xz",blocksize=None)

# remove unnecessary dataframe columns
#users = users.drop(['DisplayName','CreationDate','LastAccessDate'], axis=1)


In [10]:
# duplicates
#duplicate = dd.read_csv("DuplicateFrame-0.csv.tar.xz",compression="xz",blocksize=None)

# remove unnecessary dataframe columns
#duplicate = duplicate.drop(['CreationDate','LinkTypeId'], axis=1)

In [11]:
# tags
tags = dd.read_csv("TagsFrame-0.csv.tar.xz",compression="xz",blocksize=None)

In [12]:
# bounty
bounty = dd.read_csv("BountyFrame-0.csv.tar.xz",compression="xz",blocksize=None)

# remove unnecessary dataframe columns
bounty = bounty.drop(['VoteTypeId','CreationDate', 'Id'], axis=1)

# add a column to the bounty frame 
# we join this with the posts later
bounty = bounty.assign(hasBounty = True)

In [13]:
# badges
#badgesAggregate = dd.read_csv("BadgesAggregatedFrame-0.csv.tar.xz",compression="xz",blocksize=None)

In [14]:
# join users with badges count
#usersWithBadges = users.set_index('Id').join(badgesAggregate.set_index('UserId'))

Calculations:

In [15]:
# join posts with bounty info(result is an extra column with True and NaN values, fix this later)
posts = posts.set_index('Id').join(bounty.set_index('PostId'))

In [16]:
# handle bounties
# this fixes all the NaN values in the bounty column
# puts them to False
def fixBounty(row):
    
    bountyVal = row.hasBounty
    
    # this checks if the value of hasBounty
    # for the current row is NaN
    if(bountyVal != bountyVal):
        return(False)
    
    return bountyVal
        
posts['bounty'] = posts.apply(fixBounty, axis=1)

# remove joined column, we will use the new one with False/True instead(cleaned from NaN values)
posts = posts.drop('hasBounty', axis=1)

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result


In [17]:
# tags dictionary with tagname and tagcount
# we create a dictionary of all the tags and their count

tagDict = dict()
for row in tags.itertuples():
    tagName = getattr(row,'TagName')
    tagCount = getattr(row,'Count')
    
    tagDict[tagName]=tagCount

In [18]:
# tags handling
# accumulating tags value for each list of tags for each row
    
def calculateTagsAggregate(row):
    tags = row.Tags
    
    # split the tag string by the closing tag
    tagsList = tags.split('>')
    
    # remove the last empty element
    # which the prev function creates
    tagsList = tagsList[:-1]
    
    finList = list()
    # clear the opening tag from each element
    finList = [item[1:] for item in tagsList]
    
    # start counting with the use of the dict
    tag_sum = 0
    for tag in finList:
        tagVal = tagDict.get(tag)
        
        if(tagVal == None):
            tagVal = 0
        
        tag_sum += tagVal
        
    return(tag_sum)
    
    
posts['tagsAggregate'] = posts.apply(calculateTagsAggregate, axis=1)

# drop tags column from frame since no longer necessary
posts = posts.drop('Tags', axis=1)

print('Done')

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result


Done


In [19]:
# change answerID to 0/1 (answered/not answered)
def changeAnswered(row):
    answerID = row.AcceptedAnswerId
    
    if(answerID==0):
        return False
    else:
        return True
    
posts['isAnswered'] = posts.apply(changeAnswered, axis=1)
posts = posts.drop('AcceptedAnswerId', axis=1)



  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result


In [20]:
# posts.head(n=20, npartitions=2)

Unnamed: 0,PostTypeId,Score,ViewCount,WordCount,LinksCount,BlocksCount,TitleWordsCount,AnswerCount,CommentCount,bounty,tagsAggregate,isAnswered
6,1,261,16799,91,1,3,12,6,0,False,1434581,True
9,1,1743,500536,15,0,1,8,63,7,True,1587941,True
11,1,1382,139190,21,0,1,5,35,3,False,1333380,True
13,1,552,160858,32,0,1,4,24,8,False,820908,False
14,1,381,111934,9,2,2,5,10,4,True,308305,False
16,1,114,79712,87,0,7,11,6,0,False,1401702,True
17,1,168,67267,9,1,0,4,9,3,True,694363,True
17,1,168,67267,9,1,0,4,9,3,True,694363,True
19,1,288,43495,167,0,13,11,23,16,False,216694,True
24,1,147,64775,21,0,2,7,6,0,False,707280,True


Plotting :

In [15]:
import matplotlib.pyplot as plt

In [42]:
# load under these variables the Series that you use
# in the plot that you are about to draw
param1 = posts['bounty'].compute() # example
param2 = posts['BlocksCount'].compute() # example

# this block takes 5-10minutes



In [None]:
answered = posts['isAnswered'].compute() # this is the main series we need, we use it as factor 
# this block also takes 5 minutes or so