# Joining data from various files
In this file, data from various posts are joined and exported possibly as single dataframe, so expensive joining operation will not have to be performed more than required

Notice: reading from uncompressed files to allow splitting to blocks

In [1]:
# import dask bag
import dask.dataframe as dd
import numpy as np
import pandas as pd

In [2]:
!ls -la *.csv

-rw-rw-r--. 1 wojtek wojtek   11813382 01-26 13:52 BountyFrame-0.csv
-rw-r--r--. 1 wojtek wojtek   38969687 01-28 10:18 DuplicateFrame-0.csv
-rw-r--r--. 1 wojtek wojtek 2052407604 01-28 10:09 PostsFrame-0.csv
-rw-r--r--. 1 wojtek wojtek 1924302938 01-18 11:49 PostsFrame-0_old.csv
-rw-rw-r--. 1 wojtek wojtek    1476262 01-13 21:05 TagsFrame-0.csv
-rw-r--r--. 1 wojtek wojtek  739021521 01-28 10:14 UsersFrame-0.csv


# Data loading

In [3]:
# posts
# posts = dd.read_csv("PostsFrame-0.csv.tar.xz",compression="xz",blocksize=None)
posts = dd.read_csv("PostsFrame-0.csv")

# remove unnecessary dataframe columns
posts = posts.drop(['CreationDate','LastActivityDate'], axis=1)

# smaller frame for testing purposes
#test_posts = posts.sample(frac=0.01) # comment to preserve some memory

In [4]:
# users
#users = dd.read_csv("UsersFrame-0.csv.tar.xz",compression="xz",blocksize=None)
users = dd.read_csv("UsersFrame-0.csv")

# remove unnecessary dataframe columns
users = users.drop(['DisplayName','CreationDate','LastAccessDate'], axis=1)

In [5]:
# duplicates
#duplicate = dd.read_csv("DuplicateFrame-0.csv.tar.xz",compression="xz",blocksize=None)
duplicate = dd.read_csv("DuplicateFrame-0.csv")

# remove unnecessary dataframe columns
duplicate = duplicate.drop(['Id','CreationDate','LinkTypeId','RelatedPostId'], axis=1)

In [6]:
duplicate.head()

Unnamed: 0,PostId
0,70714
1,86947
2,97679
3,99408
4,26925


In [7]:
# tags
#tags = dd.read_csv("TagsFrame-0.csv.tar.xz",compression="xz",blocksize=None)
tags = dd.read_csv("TagsFrame-0.csv")

In [8]:
# bounty
# bounty = dd.read_csv("BountyFrame-0.csv.tar.xz",compression="xz",blocksize=None)
bounty = dd.read_csv("BountyFrame-0.csv")

# remove unnecessary dataframe columns
bounty = bounty.drop(['VoteTypeId','CreationDate', 'Id'], axis=1)

# add a column to the bounty frame 
# we join this with the posts later
bounty = bounty.assign(hasBounty = True)

In [9]:
# badges
badgesAggregate = dd.read_csv("BadgesAggregatedFrame-0.csv.tar.xz",compression="xz",blocksize=None)
#badgesAggregate = dd.read_csv("BadgesAggregatedFrame-0.csv")

In [10]:
# join users with badges count
usersWithBadges = users.set_index('Id').join(badgesAggregate.set_index('UserId'))

In [11]:
usersWithBadges.head()

Unnamed: 0_level_0,Reputation,Views,UpVotes,DownVotes,BadgesCount
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-1,1,649,304659,1072783,1.0
1,45009,431337,3370,1310,332.0
2,3509,24377,653,88,92.0
3,13503,24701,7349,100,131.0
4,28953,74458,798,96,187.0


Note: user id -1 is so called community user, we are dropping this one

# Calculations

In [12]:
# join posts with bounty info(result is an extra column with True and NaN values, fix this later)
posts = posts.set_index('Id').join(bounty.set_index('PostId'))

In [13]:
# handle bounties
# this fixes all the NaN values in the bounty column
# puts them to False
def fixBounty(row):
    
    bountyVal = row.hasBounty
    
    # this checks if the value of hasBounty
    # for the current row is NaN
    if(bountyVal != bountyVal):
        return(False)
    
    return bountyVal
        
posts['bounty'] = posts.apply(fixBounty, axis=1)

# remove joined column, we will use the new one with False/True instead(cleaned from NaN values)
posts = posts.drop('hasBounty', axis=1)

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result


In [14]:
posts = posts.join(usersWithBadges, on="OwnerUserId")

In [15]:
posts = posts.fillna(0)

In [16]:
posts.head()

Unnamed: 0,PostTypeId,Score,ViewCount,WordCount,LinksCount,BlocksCount,TitleWordsCount,Tags,AnswerCount,CommentCount,OwnerUserId,AcceptedAnswerId,bounty,Reputation,Views,UpVotes,DownVotes,BadgesCount
52779155,1,0,19,75,0,2,4,<python><matplotlib><ternary>,0,0,458661,0,False,521.0,107.0,55.0,20.0,31.0
52779193,1,1,31,37,0,4,10,<regex><fish>,1,1,636987,52779263,False,2146.0,866.0,416.0,33.0,161.0
52779197,1,-2,17,66,0,1,13,<mysql>,2,4,310787,52779444,False,376.0,108.0,45.0,1.0,42.0
52779240,1,2,22,69,0,1,6,<bash><quotes><double-quotes><quote><quoting>,1,5,616349,0,False,731.0,107.0,151.0,1.0,42.0
52779284,1,1,31,32,0,3,13,<fish>,1,2,636987,52779864,False,2146.0,866.0,416.0,33.0,161.0


In [17]:
# tags dictionary with tagname and tagcount
# we create a dictionary of all the tags and their count

tagDict = dict()
for row in tags.itertuples():
    tagName = getattr(row,'TagName')
    tagCount = getattr(row,'Count')
    
    tagDict[tagName]=tagCount

In [18]:
# tags handling
# accumulating tags value for each list of tags for each row
    
def calculateTagsAggregate(row):
    tags = row.Tags
    
    # split the tag string by the closing tag
    tagsList = tags.split('>')
    
    # remove the last empty element
    # which the prev function creates
    tagsList = tagsList[:-1]
    
    finList = list()
    # clear the opening tag from each element
    finList = [item[1:] for item in tagsList]
    
    # start counting with the use of the dict
    tag_sum = 0
    for tag in finList:
        tagVal = tagDict.get(tag)
        
        if(tagVal == None):
            tagVal = 0
        
        tag_sum += tagVal
        
    return(tag_sum)
    
    
posts['tagsAggregate'] = posts.apply(calculateTagsAggregate, axis=1)

# drop tags column from frame since no longer necessary
posts = posts.drop('Tags', axis=1)

print('Done')

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result


Done


In [19]:
# change answerID to 0/1 (answered/not answered)
def changeAnswered(row):
    answerID = row.AcceptedAnswerId
    
    if(answerID==0):
        return False
    else:
        return True
    
posts['isAnswered'] = posts.apply(changeAnswered, axis=1)
posts = posts.drop('AcceptedAnswerId', axis=1)
posts = posts.drop("PostTypeId", axis=1)

In [20]:
posts.head(n=20, npartitions=2)

Unnamed: 0,Score,ViewCount,WordCount,LinksCount,BlocksCount,TitleWordsCount,AnswerCount,CommentCount,OwnerUserId,bounty,Reputation,Views,UpVotes,DownVotes,BadgesCount,tagsAggregate,isAnswered
52042747,2,44,166,4,10,14,1,0,489426,False,1384.0,138.0,105.0,9.0,71.0,280810,True
52042790,0,56,18,1,2,4,1,0,19212,False,35982.0,2243.0,6926.0,38.0,519.0,1142114,True
52042796,-1,50,1217,3,1,10,1,6,468523,False,58.0,36.0,79.0,0.0,11.0,1789446,True
52042814,0,41,225,2,0,8,1,0,266553,False,3194.0,306.0,470.0,154.0,108.0,2898,False
52042821,0,46,93,0,1,5,1,1,580175,False,638.0,215.0,70.0,12.0,45.0,866855,True
52042868,0,11,100,0,10,11,0,0,785523,False,1475.0,327.0,270.0,13.0,72.0,48489,False
52042911,0,6,88,0,3,5,0,0,657224,False,1194.0,62.0,144.0,5.0,23.0,892,False
52042997,0,40,138,0,2,17,0,1,0,False,0.0,0.0,0.0,0.0,0.0,1292267,False
52043143,1,15,40,0,2,2,1,0,0,False,0.0,0.0,0.0,0.0,0.0,130097,True
52043149,0,31,47,0,3,11,3,4,0,False,0.0,0.0,0.0,0.0,0.0,2659995,True


In [21]:
posts.dtypes

Score              int64
ViewCount          int64
WordCount          int64
LinksCount         int64
BlocksCount        int64
TitleWordsCount    int64
AnswerCount        int64
CommentCount       int64
OwnerUserId        int64
bounty              bool
Reputation         int64
Views              int64
UpVotes            int64
DownVotes          int64
BadgesCount        int64
tagsAggregate      int64
isAnswered          bool
dtype: object

In [22]:
!rm -rf Complete-Posts
!mkdir Complete-Posts
!rm -rf Complete-Posts-Compressed
!mkdir Complete-Posts-Compressed

In [23]:
%%time
# store data to csv file, to prevent joining all the time
posts.to_csv("Complete-Posts/*.csv",index=False)

CPU times: user 21min 40s, sys: 33.6 s, total: 22min 14s
Wall time: 21min 12s


['Complete-Posts/00.csv',
 'Complete-Posts/01.csv',
 'Complete-Posts/02.csv',
 'Complete-Posts/03.csv',
 'Complete-Posts/04.csv',
 'Complete-Posts/05.csv',
 'Complete-Posts/06.csv',
 'Complete-Posts/07.csv',
 'Complete-Posts/08.csv',
 'Complete-Posts/09.csv',
 'Complete-Posts/10.csv',
 'Complete-Posts/11.csv',
 'Complete-Posts/12.csv',
 'Complete-Posts/13.csv',
 'Complete-Posts/14.csv',
 'Complete-Posts/15.csv',
 'Complete-Posts/16.csv',
 'Complete-Posts/17.csv',
 'Complete-Posts/18.csv',
 'Complete-Posts/19.csv',
 'Complete-Posts/20.csv',
 'Complete-Posts/21.csv',
 'Complete-Posts/22.csv',
 'Complete-Posts/23.csv']

In [24]:
!ls Complete-Posts/*csv

Complete-Posts/00.csv  Complete-Posts/08.csv  Complete-Posts/16.csv
Complete-Posts/01.csv  Complete-Posts/09.csv  Complete-Posts/17.csv
Complete-Posts/02.csv  Complete-Posts/10.csv  Complete-Posts/18.csv
Complete-Posts/03.csv  Complete-Posts/11.csv  Complete-Posts/19.csv
Complete-Posts/04.csv  Complete-Posts/12.csv  Complete-Posts/20.csv
Complete-Posts/05.csv  Complete-Posts/13.csv  Complete-Posts/21.csv
Complete-Posts/06.csv  Complete-Posts/14.csv  Complete-Posts/22.csv
Complete-Posts/07.csv  Complete-Posts/15.csv  Complete-Posts/23.csv


In [25]:
%%time
# same as above but compressed
posts.to_csv("Complete-Posts-Compressed/*.csv.tar.xz",compression="xz",index=False)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.15 µs


['Complete-Posts-Compressed/00.csv.tar.xz',
 'Complete-Posts-Compressed/01.csv.tar.xz',
 'Complete-Posts-Compressed/02.csv.tar.xz',
 'Complete-Posts-Compressed/03.csv.tar.xz',
 'Complete-Posts-Compressed/04.csv.tar.xz',
 'Complete-Posts-Compressed/05.csv.tar.xz',
 'Complete-Posts-Compressed/06.csv.tar.xz',
 'Complete-Posts-Compressed/07.csv.tar.xz',
 'Complete-Posts-Compressed/08.csv.tar.xz',
 'Complete-Posts-Compressed/09.csv.tar.xz',
 'Complete-Posts-Compressed/10.csv.tar.xz',
 'Complete-Posts-Compressed/11.csv.tar.xz',
 'Complete-Posts-Compressed/12.csv.tar.xz',
 'Complete-Posts-Compressed/13.csv.tar.xz',
 'Complete-Posts-Compressed/14.csv.tar.xz',
 'Complete-Posts-Compressed/15.csv.tar.xz',
 'Complete-Posts-Compressed/16.csv.tar.xz',
 'Complete-Posts-Compressed/17.csv.tar.xz',
 'Complete-Posts-Compressed/18.csv.tar.xz',
 'Complete-Posts-Compressed/19.csv.tar.xz',
 'Complete-Posts-Compressed/20.csv.tar.xz',
 'Complete-Posts-Compressed/21.csv.tar.xz',
 'Complete-Posts-Compressed/22.c

In [26]:
!ls Complete-Posts-Compressed/*tar.xz

Complete-Posts-Compressed/00.csv.tar.xz
Complete-Posts-Compressed/01.csv.tar.xz
Complete-Posts-Compressed/02.csv.tar.xz
Complete-Posts-Compressed/03.csv.tar.xz
Complete-Posts-Compressed/04.csv.tar.xz
Complete-Posts-Compressed/05.csv.tar.xz
Complete-Posts-Compressed/06.csv.tar.xz
Complete-Posts-Compressed/07.csv.tar.xz
Complete-Posts-Compressed/08.csv.tar.xz
Complete-Posts-Compressed/09.csv.tar.xz
Complete-Posts-Compressed/10.csv.tar.xz
Complete-Posts-Compressed/11.csv.tar.xz
Complete-Posts-Compressed/12.csv.tar.xz
Complete-Posts-Compressed/13.csv.tar.xz
Complete-Posts-Compressed/14.csv.tar.xz
Complete-Posts-Compressed/15.csv.tar.xz
Complete-Posts-Compressed/16.csv.tar.xz
Complete-Posts-Compressed/17.csv.tar.xz
Complete-Posts-Compressed/18.csv.tar.xz
Complete-Posts-Compressed/19.csv.tar.xz
Complete-Posts-Compressed/20.csv.tar.xz
Complete-Posts-Compressed/21.csv.tar.xz
Complete-Posts-Compressed/22.csv.tar.xz
Complete-Posts-Compressed/23.csv.tar.xz


In [27]:
#posts.astype(float).corr().compute()