# Microtask-1

> Produce a notebook showing (and producing) a list with the activity per quarter: number of new committers, submitters of issues, and submitters of pull/merge requests, number of items (commits, issues, pull/merge requests), number of repositories with new items (all of this per quarter) as a table and as a CSV file using plain python3 (no pandas).


I am using the same data source as in [microtask-0](https://github.com/vchrombie/chaoss-microtasks/blob/master/microtask-0/microtask-0.ipynb), i.e., the [elasticsearch-py](https://github.com/elastic/elasticsearch-py) project, whose data is located in the `data/` folder of the repository.

## Importing the necessary modules

In [1]:
# json library is used to handle json files, here, it is the data source retrieved by the perceval module.
import json 
# to write and read csv files, to show the output in the end
import csv  

# to handle the time formats, like to determine 'created_at' of an issue or pr.
import datetime  
# dictionaries are a convenient way to store data for later retrieval by name (key).
from collections import defaultdict  

# used to send HTTP requests; I use it with the GitHub API to get the year in which the project was created, which the analysis needs.
import requests 
# open source python module to pretty print a csv file. ref: https://github.com/jazzband/prettytable
from prettytable import from_csv

In [2]:
# names of the data sources to analyse; each one is later opened as '../data/<name>.json'
repos = ('data','data2')


# there are three types of contributions - commit, issue, and pr
# so, I created a tuple which has the contribution types
# these strings match the 'category' field of the items in the data source,
# which is the key the parsed contributions are stored under
ctypes = ('commit','pull_request','issue')

## Functions to return the details of the contribution types

_Commit_ has a different json structure when compared to *issue* and *pull_request*.

In [3]:
# function to get the required details of commits
# commit has a different json structure unlike issue/pr

def details_commit(commit):
    """Pick the fields of interest out of one commit item.

    `commit` is a parsed json line whose 'data' entry holds the commit
    itself. Returns a dict with:
      'hash'         - the value stored under 'commit'
      'author'       - the value stored under 'Author', unchanged
      'created_date' - 'CommitDate' parsed into a datetime; the format
                       ends in %z, so the result carries the utc offset
    """
    payload = commit['data']
    sha = payload['commit']
    author = payload['Author']
    # example of the expected layout: "Tue Aug 14 14:30:13 2012 -0700"
    committed_at = datetime.datetime.strptime(
        payload['CommitDate'], "%a %b %d %H:%M:%S %Y %z"
    )
    return dict(hash=sha, author=author, created_date=committed_at)

In [4]:
# function to get the required details of issue/pull requests
# as issue/pr has the same json structure in the data source scraped by perceval
# I wrote a single function to get the either issue/pr details 

def details_ipr(item):
    """Pick the fields of interest out of one issue or pull request item.

    `item` is a parsed json line whose 'data' entry holds the issue/pr.
    Returns a dict with:
      'hash'         - the value stored under 'id'
      'author'       - the 'login' of the 'user' entry
      'created_date' - 'created_at' parsed into a datetime; the trailing
                       'Z' is matched literally, so the result is naive
                       (no tzinfo attached)
    """
    payload = item['data']
    identifier = payload['id']
    login = payload['user']['login']
    opened_at = datetime.datetime.strptime(
        payload['created_at'], "%Y-%m-%dT%H:%M:%SZ"
    )
    return dict(hash=identifier, author=login, created_date=opened_at)

## Dividing the data source into contribution types

In [5]:
# contents is to store the details of each contribution whether it is a commit, issue or pr.
# using a defaultdict of list so that I can store the sorted details according to the ctype as (key, value) 
# repocontent maps each repo name to a one-element list holding that repo's own
# {category: [details, ...]} dict, so activity can be told apart per repository.
repocontent = defaultdict(list)
# contents holds the details of every contribution from all repos together,
# keyed by category; the later cells read from it.
contents = defaultdict(list)

# which function extracts the details for each category;
# issue and pull_request share one because they have the same json structure
extractors = {
    'commit': details_commit,
    'issue': details_ipr,
    'pull_request': details_ipr,
}

for repo in repos:
    # a fresh dict per repo, so one repo's items never show up under another repo
    repo_items = defaultdict(list)
    # json is utf-8 by specification, so do not depend on the platform default encoding
    with open('../data/%s.json' % repo, encoding='utf-8') as datasrc:
        for line in datasrc:
            # tolerate blank lines (e.g. a trailing newline at the end of the file)
            if not line.strip():
                continue
            # each line is one json document
            item = json.loads(line)
            category = item['category']
            extract = extractors.get(category)
            # skip categories we do not analyse, instead of re-adding the details
            # of the previous line under the unknown category
            if extract is None:
                continue
            content = extract(item)
            # add the details to the combined dict and to this repo's own dict
            contents[category].append(content)
            repo_items[category].append(content)
    repocontent[repo].append(repo_items)

In [None]:
# presumably meant to hold per-repo results; nothing is stored in it yet
repodata = defaultdict(dict)

# NOTE(review): this cell is unfinished - the innermost loop below has no body,
# so it is not valid python as written and needs to be completed or removed.
for repo in repos:
    for ctype in ctypes:
        for item in contents[ctype]:
            



In [None]:
# labels of the quarters, one "Qn yyyy" entry per quarter, in the order generated
quarters = []

# per contribution type: one count per quarter of how many items were created in it
activities = defaultdict(list)

# per contribution type: one count per quarter of authors seen for the first time
newcontributors = defaultdict(list)
# per contribution type: every author seen so far (a set, so no duplicates)
oldcontributors = defaultdict(set)

# walk the quarters from the `created` year to the `present` year
for year, quarter, start, end in quarterwise(created, present):
    # quarter is zero-based, hence the +1 in the label
    quarters.append(r"Q%d %d" % (quarter + 1, year))

    for ctype in ctypes:
        item_count = 0
        first_timers = 0

        for item in contents[ctype]:
            # compare without timezone info, as start/end are compared against naive values here
            if not (start <= item['created_date'].replace(tzinfo=None) <= end):
                continue
            # the item was created inside this quarter
            item_count += 1

            # an author already seen for this contribution type is not new
            if item['author'] in oldcontributors[ctype]:
                continue
            first_timers += 1
            oldcontributors[ctype].add(item['author'])

        # one value per quarter, appended in the same order as `quarters`
        activities[ctype].append(item_count)
        newcontributors[ctype].append(first_timers)

## Showing the Activity

In [None]:
# print the total activity: one line per contribution type,
# followed by its list of per-quarter counts
print("Quaterwise Total Activity\n")
for kind, counts in activities.items():
    print(kind, counts)

# empty line to separate the two listings
print()

# print the new contributors: one line per contribution type,
# followed by its list of per-quarter counts
print("Quaterwise New Contributors Activity\n")
for kind, counts in newcontributors.items():
    # read from newcontributors here; the activities values belong to the listing above
    print(kind, counts)

## Create a CSV to store the Output

In [None]:
# column names of the csv file; the order of the values in `rows` below must follow it
header = ['Quarter','# Commits','# PullRequests','# Issues','# NewCommitters','# NewIssueSubmitters','# NewPRSubmitters' ]
# opening a new csv to write the data into it
# newline='' lets the csv module control line endings itself (avoids blank rows on Windows)
with open('elasticsearch-py.csv', 'w', newline='') as file:
    # initialize the writer object
    writer = csv.writer(file)
    # write the header first
    writer.writerow(header)
    # zip the per-quarter lists together so that each tuple is one row of the table;
    # the new-contributor columns are issue first, then pull_request, to match the header
    rows = zip(
        quarters,
        activities['commit'],
        activities['pull_request'],
        activities['issue'],
        newcontributors['commit'],
        newcontributors['issue'],
        newcontributors['pull_request'],
    )
    # write all the rows at once
    writer.writerows(rows)

## Show the Output as a table

In [None]:
# read the csv written earlier back in and let prettytable build a table from it
with open("elasticsearch-py.csv", "r") as csvfile:
    csvtable = from_csv(csvfile)

# display the table
print(csvtable)