# Joining data from various files
In this file, data from various posts are joined and exported possibly as single dataframe, so expensive joining operation will not have to be performed more than required

Notice: reading from uncompressed files to allow splitting to blocks

In [1]:
# import dask bag
import dask.dataframe as dd
import numpy as np
import pandas as pd

# Data loading

In [2]:
# posts
# posts = dd.read_csv("PostsFrame-0.csv.tar.xz",compression="xz",blocksize=None)
posts = dd.read_csv("PostsFrame-0.csv")

# remove unnecessary dataframe columns
posts = posts.drop(['CreationDate','LastActivityDate'], axis=1)

# smaller frame for testing purposes
#test_posts = posts.sample(frac=0.01) # comment to preserve some memory

In [3]:
# users
#users = dd.read_csv("UsersFrame-0.csv.tar.xz",compression="xz",blocksize=None)

# remove unnecessary dataframe columns
#users = users.drop(['DisplayName','CreationDate','LastAccessDate'], axis=1)

In [4]:
# duplicates
#duplicate = dd.read_csv("DuplicateFrame-0.csv.tar.xz",compression="xz",blocksize=None)

# remove unnecessary dataframe columns
#duplicate = duplicate.drop(['CreationDate','LinkTypeId'], axis=1)

In [5]:
# tags
#tags = dd.read_csv("TagsFrame-0.csv.tar.xz",compression="xz",blocksize=None)
tags = dd.read_csv("TagsFrame-0.csv")

In [6]:
# bounty
# bounty = dd.read_csv("BountyFrame-0.csv.tar.xz",compression="xz",blocksize=None)
bounty = dd.read_csv("BountyFrame-0.csv")

# remove unnecessary dataframe columns
bounty = bounty.drop(['VoteTypeId','CreationDate', 'Id'], axis=1)

# add a column to the bounty frame 
# we join this with the posts later
bounty = bounty.assign(hasBounty = True)

In [7]:
# badges
#badgesAggregate = dd.read_csv("BadgesAggregatedFrame-0.csv.tar.xz",compression="xz",blocksize=None)

In [8]:
# join users with badges count
#usersWithBadges = users.set_index('Id').join(badgesAggregate.set_index('UserId'))

# Calculations

In [9]:
# join posts with bounty info(result is an extra column with True and NaN values, fix this later)
posts = posts.set_index('Id').join(bounty.set_index('PostId'))

In [10]:
# handle bounties
# this fixes all the NaN values in the bounty column
# puts them to False
def fixBounty(row):
    
    bountyVal = row.hasBounty
    
    # this checks if the value of hasBounty
    # for the current row is NaN
    if(bountyVal != bountyVal):
        return(False)
    
    return bountyVal
        
posts['bounty'] = posts.apply(fixBounty, axis=1)

# remove joined column, we will use the new one with False/True instead(cleaned from NaN values)
posts = posts.drop('hasBounty', axis=1)

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result


In [11]:
# tags dictionary with tagname and tagcount
# we create a dictionary of all the tags and their count

tagDict = dict()
for row in tags.itertuples():
    tagName = getattr(row,'TagName')
    tagCount = getattr(row,'Count')
    
    tagDict[tagName]=tagCount

In [12]:
# tags handling
# accumulating tags value for each list of tags for each row
    
def calculateTagsAggregate(row):
    tags = row.Tags
    
    # split the tag string by the closing tag
    tagsList = tags.split('>')
    
    # remove the last empty element
    # which the prev function creates
    tagsList = tagsList[:-1]
    
    finList = list()
    # clear the opening tag from each element
    finList = [item[1:] for item in tagsList]
    
    # start counting with the use of the dict
    tag_sum = 0
    for tag in finList:
        tagVal = tagDict.get(tag)
        
        if(tagVal == None):
            tagVal = 0
        
        tag_sum += tagVal
        
    return(tag_sum)
    
    
posts['tagsAggregate'] = posts.apply(calculateTagsAggregate, axis=1)

# drop tags column from frame since no longer necessary
posts = posts.drop('Tags', axis=1)

print('Done')

Done


In [13]:
# change answerID to 0/1 (answered/not answered)
def changeAnswered(row):
    answerID = row.AcceptedAnswerId
    
    if(answerID==0):
        return False
    else:
        return True
    
posts['isAnswered'] = posts.apply(changeAnswered, axis=1)
posts = posts.drop('AcceptedAnswerId', axis=1)
posts = posts.drop("PostTypeId", axis=1)

In [14]:
posts.head(n=20, npartitions=2)

Unnamed: 0,Score,ViewCount,WordCount,LinksCount,BlocksCount,TitleWordsCount,AnswerCount,CommentCount,bounty,tagsAggregate,isAnswered
6,261,16799,91,1,3,12,6,0,False,1434581,True
9,1743,500536,15,0,1,8,63,7,True,1587941,True
11,1382,139190,21,0,1,5,35,3,False,1333380,True
13,552,160858,32,0,1,4,24,8,False,820908,False
14,381,111934,9,2,2,5,10,4,True,308305,False
16,114,79712,87,0,7,11,6,0,False,1401702,True
17,168,67267,9,1,0,4,9,3,True,694363,True
17,168,67267,9,1,0,4,9,3,True,694363,True
19,288,43495,167,0,13,11,23,16,False,216694,True
24,147,64775,21,0,2,7,6,0,False,707280,True


In [15]:
posts.dtypes

Score              int64
ViewCount          int64
WordCount          int64
LinksCount         int64
BlocksCount        int64
TitleWordsCount    int64
AnswerCount        int64
CommentCount       int64
bounty              bool
tagsAggregate      int64
isAnswered          bool
dtype: object

In [16]:
!rm -rf Complete-Posts
!mkdir Complete-Posts
!rm -rf Complete-Posts-Compressed
!mkdir Complete-Posts-Compressed

In [17]:
%%time
# store data to csv file, to prevent joining all the time
posts.to_csv("Complete-Posts/*.csv",index=False)

CPU times: user 18min, sys: 14.5 s, total: 18min 14s
Wall time: 17min 36s


['Complete-Posts/00.csv',
 'Complete-Posts/01.csv',
 'Complete-Posts/02.csv',
 'Complete-Posts/03.csv',
 'Complete-Posts/04.csv',
 'Complete-Posts/05.csv',
 'Complete-Posts/06.csv',
 'Complete-Posts/07.csv',
 'Complete-Posts/08.csv',
 'Complete-Posts/09.csv',
 'Complete-Posts/10.csv',
 'Complete-Posts/11.csv',
 'Complete-Posts/12.csv',
 'Complete-Posts/13.csv',
 'Complete-Posts/14.csv',
 'Complete-Posts/15.csv',
 'Complete-Posts/16.csv',
 'Complete-Posts/17.csv',
 'Complete-Posts/18.csv',
 'Complete-Posts/19.csv',
 'Complete-Posts/20.csv',
 'Complete-Posts/21.csv',
 'Complete-Posts/22.csv',
 'Complete-Posts/23.csv',
 'Complete-Posts/24.csv',
 'Complete-Posts/25.csv',
 'Complete-Posts/26.csv',
 'Complete-Posts/27.csv',
 'Complete-Posts/28.csv',
 'Complete-Posts/29.csv',
 'Complete-Posts/30.csv',
 'Complete-Posts/31.csv',
 'Complete-Posts/32.csv',
 'Complete-Posts/33.csv',
 'Complete-Posts/34.csv',
 'Complete-Posts/35.csv',
 'Complete-Posts/36.csv',
 'Complete-Posts/37.csv',
 'Complete-P

In [18]:
!ls Complete-Posts/*csv

Complete-Posts/00.csv  Complete-Posts/20.csv  Complete-Posts/40.csv
Complete-Posts/01.csv  Complete-Posts/21.csv  Complete-Posts/41.csv
Complete-Posts/02.csv  Complete-Posts/22.csv  Complete-Posts/42.csv
Complete-Posts/03.csv  Complete-Posts/23.csv  Complete-Posts/43.csv
Complete-Posts/04.csv  Complete-Posts/24.csv  Complete-Posts/44.csv
Complete-Posts/05.csv  Complete-Posts/25.csv  Complete-Posts/45.csv
Complete-Posts/06.csv  Complete-Posts/26.csv  Complete-Posts/46.csv
Complete-Posts/07.csv  Complete-Posts/27.csv  Complete-Posts/47.csv
Complete-Posts/08.csv  Complete-Posts/28.csv  Complete-Posts/48.csv
Complete-Posts/09.csv  Complete-Posts/29.csv  Complete-Posts/49.csv
Complete-Posts/10.csv  Complete-Posts/30.csv  Complete-Posts/50.csv
Complete-Posts/11.csv  Complete-Posts/31.csv  Complete-Posts/51.csv
Complete-Posts/12.csv  Complete-Posts/32.csv  Complete-Posts/52.csv
Complete-Posts/13.csv  Complete-Posts/33.csv  Complete-Posts/53.csv
Complete-Posts/14.csv  Complete-Po

In [19]:
%time
# same as above but compressed
posts.to_csv("Complete-Posts-Compressed/*.csv.tar.xz",compression="xz",index=False)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 7.63 µs


['Complete-Posts-Compressed/00.csv.tar.xz',
 'Complete-Posts-Compressed/01.csv.tar.xz',
 'Complete-Posts-Compressed/02.csv.tar.xz',
 'Complete-Posts-Compressed/03.csv.tar.xz',
 'Complete-Posts-Compressed/04.csv.tar.xz',
 'Complete-Posts-Compressed/05.csv.tar.xz',
 'Complete-Posts-Compressed/06.csv.tar.xz',
 'Complete-Posts-Compressed/07.csv.tar.xz',
 'Complete-Posts-Compressed/08.csv.tar.xz',
 'Complete-Posts-Compressed/09.csv.tar.xz',
 'Complete-Posts-Compressed/10.csv.tar.xz',
 'Complete-Posts-Compressed/11.csv.tar.xz',
 'Complete-Posts-Compressed/12.csv.tar.xz',
 'Complete-Posts-Compressed/13.csv.tar.xz',
 'Complete-Posts-Compressed/14.csv.tar.xz',
 'Complete-Posts-Compressed/15.csv.tar.xz',
 'Complete-Posts-Compressed/16.csv.tar.xz',
 'Complete-Posts-Compressed/17.csv.tar.xz',
 'Complete-Posts-Compressed/18.csv.tar.xz',
 'Complete-Posts-Compressed/19.csv.tar.xz',
 'Complete-Posts-Compressed/20.csv.tar.xz',
 'Complete-Posts-Compressed/21.csv.tar.xz',
 'Complete-Posts-Compressed/22.c

In [20]:
!ls Complete-Posts-Compressed/*tar.xz

Complete-Posts-Compressed/00.csv.tar.xz
Complete-Posts-Compressed/01.csv.tar.xz
Complete-Posts-Compressed/02.csv.tar.xz
Complete-Posts-Compressed/03.csv.tar.xz
Complete-Posts-Compressed/04.csv.tar.xz
Complete-Posts-Compressed/05.csv.tar.xz
Complete-Posts-Compressed/06.csv.tar.xz
Complete-Posts-Compressed/07.csv.tar.xz
Complete-Posts-Compressed/08.csv.tar.xz
Complete-Posts-Compressed/09.csv.tar.xz
Complete-Posts-Compressed/10.csv.tar.xz
Complete-Posts-Compressed/11.csv.tar.xz
Complete-Posts-Compressed/12.csv.tar.xz
Complete-Posts-Compressed/13.csv.tar.xz
Complete-Posts-Compressed/14.csv.tar.xz
Complete-Posts-Compressed/15.csv.tar.xz
Complete-Posts-Compressed/16.csv.tar.xz
Complete-Posts-Compressed/17.csv.tar.xz
Complete-Posts-Compressed/18.csv.tar.xz
Complete-Posts-Compressed/19.csv.tar.xz
Complete-Posts-Compressed/20.csv.tar.xz
Complete-Posts-Compressed/21.csv.tar.xz
Complete-Posts-Compressed/22.csv.tar.xz
Complete-Posts-Compressed/23.csv.tar.xz
Complete-Posts-C

In [25]:
posts.astype(float).corr().compute()

  baseCov = np.cov(mat.T)
  c *= 1. / np.float64(fact)
  c *= 1. / np.float64(fact)


Unnamed: 0,Score,ViewCount,WordCount,LinksCount,BlocksCount,TitleWordsCount,AnswerCount,CommentCount,bounty,tagsAggregate,isAnswered
Score,1.0,0.577167,0.001391,0.011142,0.004497,-0.000711,0.227273,0.013444,0.039613,-0.004158,0.032817
ViewCount,0.577167,1.0,-0.018125,-0.002054,-0.00609,-0.006467,0.370084,0.004347,0.01805,0.004011,0.058519
WordCount,0.001391,-0.018125,1.0,0.208272,0.260046,0.083224,-0.035118,0.072458,0.084041,-0.000527,-0.011782
LinksCount,0.011142,-0.002054,0.208272,1.0,0.070794,0.014888,-0.033123,0.020353,0.074357,0.009334,-0.007131
BlocksCount,0.004497,-0.00609,0.260046,0.070794,1.0,0.01267,-0.005095,0.10416,0.053388,0.049931,0.062507
TitleWordsCount,-0.000711,-0.006467,0.083224,0.014888,0.01267,1.0,-0.016539,-0.003193,0.007666,0.003078,-0.015539
AnswerCount,0.227273,0.370084,-0.035118,-0.033123,-0.005095,-0.016539,1.0,0.00169,0.072226,0.071079,0.255678
CommentCount,0.013444,0.004347,0.072458,0.020353,0.10416,-0.003193,0.00169,1.0,0.057297,0.121405,-0.07121
bounty,0.039613,0.01805,0.084041,0.074357,0.053388,0.007666,0.072226,0.057297,1.0,-0.009486,0.009829
tagsAggregate,-0.004158,0.004011,-0.000527,0.009334,0.049931,0.003078,0.071079,0.121405,-0.009486,1.0,0.027496
