In [2]:
!pip install -q pyarrow

# Data Importing

In [3]:
import pandas as pd
import json

In [4]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

In [5]:
# Load the bot activities JSON file and flatten nested objects into
# dot-separated columns (pd.json_normalize).
# The encoding is given explicitly: without it open() falls back to the
# platform locale, which can mis-decode UTF-8 JSON on some systems.
with open('../data-raw/bot_activities.json', encoding='utf-8') as file:
    df_bots = pd.json_normalize(json.load(file))

In [6]:
# Load the human activities JSON file and flatten nested objects into
# dot-separated columns (pd.json_normalize).
# The encoding is given explicitly: without it open() falls back to the
# platform locale, which can mis-decode UTF-8 JSON on some systems.
with open('../data-raw/human_activities.json', encoding='utf-8') as file:
    df_human = pd.json_normalize(json.load(file))

In [7]:
# Tag every row with its origin ('bot' or 'human') in a new 'category' column.
# The column is added to the source frames themselves, as before.
for frame, label in ((df_bots, 'bot'), (df_human, 'human')):
    frame['category'] = label

# Stack both frames (bots first, then humans) into one table with a fresh 0..n-1 index
frames_to_stack = [df_bots, df_human]
activities = pd.concat(frames_to_stack, ignore_index=True)

In [8]:
# Give payload.GH_push_id a single text type (saving to the binary parquet
# format raised an error without an explicit type for this column).
# The nullable 'string' dtype is used rather than 'str' so that rows without a
# push id stay missing (<NA>) instead of being turned into the literal text 'nan'.
activities = activities.astype({'payload.GH_push_id': 'string'})

In [11]:
# Persist the combined activities table as a parquet file;
# the DataFrame index is left out of the file.
activities.to_parquet(
    '../data/activities.parquet',
    index=False,
)

# Splitting data into per-object tables

In [16]:
# Column selections, one list per object type.
# Every list starts with the same five identifying columns
# (held in activity_types_columns) followed by that object's own fields.
activity_types_columns = ['contributor', 'category', 'repository', 'activity', 'date']

release_columns = activity_types_columns + [
    'release.name',
    'release.description_length',
    'release.created_at',
    'release.prerelease',
    'release.new_tag',
    'release.GH_node',
]
page_columns = activity_types_columns + [
    'page.name',
    'page.title',
    'page.new',
]
review_columns = activity_types_columns + [
    'review.status',
    'review.GH_node',
]
# Tag data lives under the 'gitref.*' columns
tag_columns = activity_types_columns + [
    'gitref.type',
    'gitref.name',
    'gitref.description_length',
]
issue_columns = activity_types_columns + [
    'issue.id',
    'issue.title',
    'issue.created_at',
    'issue.status',
    'issue.closed_at',
    'issue.resolved',
]
pull_request_columns = activity_types_columns + [
    'pull_request.id',
    'pull_request.title',
    'pull_request.created_at',
    'pull_request.status',
    'pull_request.closed_at',
    'pull_request.merged',
    'pull_request.GH_node',
]
comment_columns = activity_types_columns + [
    'comment.length',
    'comment.GH_node',
]
payload_columns = activity_types_columns + [
    'payload.pushed_commits',
    'payload.pushed_distinct_commits',
    'payload.GH_push_id',
]


In [17]:
# Write one parquet file per object type: each file holds all rows of
# `activities`, restricted to that object's column list, without the index.
parquet_exports = (
    ('../data/activity_types.parquet', activity_types_columns),
    ('../data/releases.parquet', release_columns),
    ('../data/pages.parquet', page_columns),
    ('../data/reviews.parquet', review_columns),
    ('../data/tags.parquet', tag_columns),
    ('../data/issues.parquet', issue_columns),
    ('../data/pull_requests.parquet', pull_request_columns),
    ('../data/comments.parquet', comment_columns),
    ('../data/payloads.parquet', payload_columns),
)

for target_path, selected_columns in parquet_exports:
    activities[selected_columns].to_parquet(target_path, index=False)

In [19]:
# Spot-check the exported issues file with 10 random rows.
# A fixed random_state makes the same rows appear on every run,
# so the displayed output is reproducible.
pd.read_parquet('../data/issues.parquet').sample(n=10, random_state=42)

Unnamed: 0,contributor,category,repository,activity,date,issue.id,issue.title,issue.created_at,issue.status,issue.closed_at,issue.resolved
652355,gitguardian[bot],bot,SocialGouv/mda,Commenting pull request,2023-03-28T08:04:38+00:00,,,,,,
804833,795565,human,975853,Commenting issue,2022-12-25T08:14:34+00:00,1696.0,anonymised,2022-12-10T02:15:02+00:00,open,,False
951132,797989,human,697563,Pushing commits,2023-03-13T15:08:32+00:00,,,,,,
948193,767954,human,439953,Commenting pull request,2023-03-11T13:00:10+00:00,,,,,,
650392,aws-cdk-automation,bot,aws/aws-cdk,Closing pull request,2023-03-28T00:09:21+00:00,,,,,,
195118,aws-cdk-automation,bot,aws/aws-cdk,Commenting pull request,2023-01-05T00:08:47+00:00,,,,,,
645432,livingdocs-automation,bot,livingdocsIO/documentation,Pushing commits,2023-03-27T09:42:13+00:00,,,,,,
804161,959993,human,486733,Pushing commits,2022-12-24T06:21:30+00:00,,,,,,
687684,delete-merged-branch[bot],bot,owncloud-ops/limesurvey,Deleting branch,2023-04-03T12:26:23+00:00,,,,,,
32274,gitguardian[bot],bot,workshopapps/customersupport.web,Commenting pull request,2022-12-01T13:17:49+00:00,,,,,,
