In [1]:
from datetime import datetime
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pdpipe as pdp
import pandas as pd
import numpy as np

import string
import re

# Loading Data

In [2]:
# Load the three raw CSV exports used by the rest of the notebook.
# A missing file is reported together with its path and the error is re-raised:
# every later cell needs all three frames, so continuing would only fail later
# with a less helpful NameError (or silently reuse stale kernel state).
try:
    issues = pd.read_csv('../data/original/issues.csv')
    transitions = pd.read_csv('../data/original/transitions.csv')
    daycounts = pd.read_csv('../data/original/daycounts.csv')
except FileNotFoundError as e:
    print('{}: {}'.format(e.strerror, e.filename))
    raise

In [3]:
# Sanity check on the number of columns of each raw frame.
# Warn as soon as ANY frame deviates from its expected column count
# (the check must be an "or": an "and" only fires when all three are wrong).
expected_column_counts = ((issues, 17), (transitions, 24), (daycounts, 3))
if any(len(frame.columns) != count for frame, count in expected_column_counts):
    print('Some columns might be missing')

# Extracting values per dataset

## Issues dataset

### Creating Functions

In [4]:
def add_datepart(df, column_name, drop=True):
    """
    Helper function that adds columns relevant to a date

    The new columns are written into ``df`` in place. Their names are the
    column name with a trailing "date"/"Date" stripped, followed by the
    attribute name (e.g. ``saledate`` -> ``saleYear``, ``created`` ->
    ``createdYear``), plus an ``...Elapsed`` column.

    Args:
        df (dataframe): Dataframe in which the column is present.
        column_name (string): Name of the column you want to change.
        drop (boolean): To keep or to drop "old" column

    Returns:
        None: The newly created columns are added to the dataframe in place.

    """

    fld = df[column_name]

    # Public dtype check; covers both naive and timezone-aware datetime columns.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        df[column_name] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', column_name)

    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end',
            'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    for column in attr:
        if column == 'Week':
            # `.dt.week` no longer exists in recent pandas; prefer the ISO
            # calendar accessor and only fall back to the old attribute.
            if hasattr(fld.dt, 'isocalendar'):
                values = fld.dt.isocalendar().week
            else:
                values = fld.dt.week
        else:
            values = getattr(fld.dt, column.lower())
        df[targ_pre + column] = values
    # integer representation of the timestamp divided by 10**9
    # NOTE(review): this is seconds since the epoch only for nanosecond-resolution data - confirm
    df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9
    if drop:
        df.drop(column_name, axis=1, inplace=True)
        
        
def from_days_to_hours(dataframe, column):
    """
    Convert a column of time differences into whole hours

    Each element contributes ``days * 24`` plus the number of full hours in
    its ``seconds`` component (leftover minutes and seconds are discarded).

    Parameters:
    dataframe (dataframe): dataframe that holds the column to convert
    column (string): name of the column to convert

    Return:
    list: one hour count per row, in row order
    """
    return [delta.days * 24 + delta.seconds // 3600
            for delta in dataframe[column]]

### Data Transformation

In [5]:
# parse both timestamp columns (read from CSV as text) into datetimes
for date_column in ('created', 'resolutiondate'):
    issues[date_column] = pd.to_datetime(issues[date_column])

# derive how long each issue took: full timedelta and its whole-day component
issues['time_needed'] = issues['resolutiondate'] - issues['created']
issues['days_needed'] = issues['time_needed'].dt.days

### Feature Creation

In [6]:
# whole hours between creation and resolution; 'time_needed' already holds
# resolutiondate - created, so it is converted directly
issues['hours_needed'] = from_days_to_hours(issues, 'time_needed')

In [7]:
# Keep a copy of the creation timestamp first: add_datepart is called with its
# default drop=True and therefore removes the 'created' column.
issues['created_date'] = issues['created'] # 'created' will be removed after calling add_datepart
add_datepart(issues, 'created') # adds createdYear, createdMonth, ..., createdElapsed in place

## Transitions dataset

### Creating functions for prediction

In [8]:
def create_dataset(column):
    """
    Build the training frame for one starting status

    Reads the module-level ``transitions_finished`` frame and keeps only the
    rows whose ``from_status`` equals ``column``.

    Parameter:
    column (string): value of ``from_status`` to filter on

    Return:
    dataframe: the matching rows, restricted to the columns listed below

    """
    wanted_columns = ['key', 'days_since_open', 'from_status', 'to_status',
                      'days_in_from_status', 'who', 'description_length',
                      'comment_count', 'issue_type', 'vote_count',
                      'watch_count', 'priority']

    starts_from_status = transitions_finished['from_status'] == column
    return transitions_finished.loc[starts_from_status][wanted_columns]


def create_dt(dataframe):
    """
    Train a decision tree that predicts the next status of an issue

    Parameter:
    dataframe (dataframe): dataframe used to train the model; must contain the
                           feature columns listed below and 'to_status'

    Return:
    name_decisiontree (DecisionTreeClassifier): tree fitted on 80% of the rows
    """

    # Features for prediction.
    # NOTE: 'watch_count' is listed twice, so the model is fitted on eight
    # columns. The prediction loop in this notebook selects the same eight
    # columns, so the list is kept as-is to stay compatible with it.
    feature_columns = ['who', 'description_length', 'comment_count',
                       'issue_type', 'vote_count', 'watch_count',
                       'priority', 'watch_count']
    X = dataframe[feature_columns].to_numpy()
    # Target variable
    y = dataframe['to_status'].to_numpy()

    # 80% training and 20% test; the held-out 20% is not used in this function
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=1)

    # Seed the tree as well: scikit-learn permutes features at every split, so
    # an unseeded tree can differ between runs even on identical data.
    name_decisiontree = DecisionTreeClassifier(random_state=1)
    return name_decisiontree.fit(X_train, y_train)


def make_prediction(dataframe, last_status):
    """
    Predict the next status for an issue, given its current status

    'Open' and 'In Progress' are delegated to the module-level trees
    ``open_dt`` and ``in_progress_dt``; every other status (including
    'Patch Available') is answered with ['Resolved'].

    Parameter:
    dataframe (dataframe): feature row(s) handed to the tree's ``predict``
    last_status (string): the last status given in transition

    Return:
    The prediction: the output of ``predict`` or the list ['Resolved']

    """
    # Plain `==` comparisons are kept on purpose: callers in this notebook also
    # pass the single-element array returned by a previous prediction.
    if last_status == 'Open':
        model = open_dt
    elif last_status == 'Patch Available':
        model = None
    elif last_status == 'In Progress':
        model = in_progress_dt
    else:
        model = None

    if model is None:
        return ['Resolved']
    return model.predict(dataframe)

In [9]:
# Prepare the transitions frame for the decision trees.
# The original user names are kept in 'who_old' before 'who' is label-encoded.
transitions['who_old'] = transitions['who']
lb_make = LabelEncoder()
# the same encoder object is refitted per column, so afterwards it only
# remembers the classes of the last column ("priority")
for categorical_column in ("who", "issue_type", "priority"):
    transitions[categorical_column] = lb_make.fit_transform(transitions[categorical_column])

# scale the numeric features
pipeline = pdp.Scale('RobustScaler', ['description_length', 'comment_count', 'vote_count', 'watch_count'])
transitions = pipeline(transitions)

# training data: transitions of issues that have a resolution date
keys = issues.dropna(subset=['resolutiondate'])['key'].tolist()
transitions_finished = transitions.loc[transitions['key'].isin(keys)].reset_index()
status = list(transitions_finished['from_status'].unique())

In [10]:
# One training set per starting status. Missing values are replaced by 0 for
# 'Open' and 'Patch Available'; the 'In Progress' set is used unfilled.
# (presumably only description_length contains NaN - verify against the data)
open_dataset = create_dataset('Open').fillna(0)
patch_available_dataset = create_dataset('Patch Available').fillna(0)
in_progress_dataset = create_dataset('In Progress')

# one decision tree per starting status
open_dt = create_dt(open_dataset)
patch_available_dt = create_dt(patch_available_dataset)
in_progress_dt = create_dt(in_progress_dataset)

# Make prediction

In [11]:
# Transitions of issues that still need to be resolved, i.e. whose key is not
# in `keys` (the resolved issues). The selection starts from `transitions`:
# `transitions_progress` does not exist before this cell, so filtering it by
# itself fails with a NameError on a fresh kernel.
transitions_progress = transitions.loc[~transitions['key'].isin(keys)]
# list of keys that still need to be resolved
keys_to_process = list(transitions_progress['key'].unique())
# select relevant columns; copy so the fillna below writes to an independent frame
transitions_progress = transitions_progress[['key', 'days_since_open', 'from_status',
                                             'to_status', 'days_in_from_status', 'who',
                                             'description_length', 'comment_count',
                                             'issue_type', 'vote_count', 'watch_count',
                                             'priority']].copy()
transitions_progress['description_length'] = transitions_progress['description_length'].fillna(0)

# store all predictions (one list of predicted statuses per key)
predicted_actions = []

for key in keys_to_process:
    actions = []
    df = transitions_progress.loc[transitions_progress['key'] == key]

    # the most recent status of the issue is the starting point
    first_last_status = list(df['to_status'])[-1]
    last_status = first_last_status
    # features of the latest transition; same eight columns (with
    # 'watch_count' twice) that create_dt trains on
    last_row = df.iloc[[-1]][['who', 'description_length', 'comment_count',
                              'issue_type', 'vote_count', 'watch_count',
                              'priority', 'watch_count']]

    while True:
        # after the first round `last_status` is the list/array returned by
        # make_prediction and is passed back in unchanged
        last_status = make_prediction(last_row, last_status)
        predicted = last_status[0]
        if predicted == first_last_status:
            # the prediction leads back to the starting status: close the path
            actions.append('Resolved')
            break
        actions.append(predicted)
        if predicted == 'Resolved':
            break
    predicted_actions.append(actions)

# create new dataframe for predicted values
df_to_add = pd.DataFrame({'key': keys_to_process, 'Predicted_actions': predicted_actions})

# merge prediction dataframe with original issues dataframe;
# issues without a prediction get 0
issues = issues.merge(df_to_add, how='left', on='key')
issues['Predicted_actions'] = issues['Predicted_actions'].fillna(0)


### Create functions for feature creation

In [12]:
def change_datetype(dataset, column_name, hours = True):
    """
    Converts a column to timezone-naive datetimes, truncated to seconds or days

    The truncation is done on the datetime values themselves. Formatting to a
    "dd-mm-YYYY" string and parsing it back is ambiguous (day/month can be
    swapped), and the previous time format used the month where the minutes
    belong.

    Parameters:
        dataset (dataframe): Dataset that contains the column
        column_name (str): The column which you want to change.
        hours (boolean): True keeps the time of day (to the second),
                         False keeps only the date

    Returns:
        column (DatetimeIndex): The converted values, in row order.
    """
    column = pd.DatetimeIndex(pd.to_datetime(dataset[column_name]))
    if column.tz is not None:
        # keep the wall-clock time but drop the timezone
        column = column.tz_localize(None)
    if hours:
        return column.floor('s')
    return column.normalize()

In [13]:
def get_event_info():
    """
    Collect, per issue, the statuses it went through and the time per transition

    Reads the module-level ``issues`` and ``transitions`` frames. Transitions
    whose ``to_status`` is 'Closed' are ignored. The status list of an issue is
    all its ``from_status`` values followed by all its ``to_status`` values,
    with duplicates removed (first occurrence kept).

    Returns:
    list_steps (list): number of distinct statuses per issue.
    list_steps_taken (list): the statuses per issue joined with '-'.
    list_status (list): the statuses per issue as a list
    complete_time (list): the 'days_in_from_status' values per issue
    """

    list_steps = []
    list_steps_taken = []
    list_status = []
    complete_time = []

    # remove where to_status equals 'Closed' (once, instead of once per issue)
    open_transitions = transitions.loc[transitions['to_status'] != 'Closed']

    for key in issues['key']:
        df = open_transitions.loc[open_transitions['key'] == key] # select relevant data

        # pd.concat replaces Series.append, which no longer exists in pandas 2.x
        all_statuses = pd.concat([df['from_status'], df['to_status']])
        status_key = all_statuses.drop_duplicates().tolist() # remove duplicates, keep order

        list_steps.append(len(status_key))
        list_steps_taken.append('-'.join(status_key))
        list_status.append(status_key)
        complete_time.append(list(df['days_in_from_status']))

    return list_steps, list_steps_taken, list_status, complete_time

In [14]:
def create_dictionary():
    """
    Creates a dictionary of the status keys

    Every status found in ``transitions['from_status']`` or
    ``transitions['to_status']`` gets a number (its position in order of first
    appearance) and the lowercase letter at that position in the alphabet.
    Using order of first appearance instead of a set keeps the mapping
    identical between runs.

    Returns:
    status_keys_dict (dict): maps status -> [letter, number].

    Raises:
    IndexError: if there are more than 26 distinct statuses.
    """
    alphabet = string.ascii_lowercase
    # pd.concat replaces Series.append, which no longer exists in pandas 2.x
    all_statuses = pd.concat([transitions['from_status'], transitions['to_status']])
    unique_status_key = all_statuses.drop_duplicates().tolist()

    status_keys_dict = {status: [alphabet[num], num]
                        for num, status in enumerate(unique_status_key)}

    return status_keys_dict

In [15]:
def path_taken(dictionary, status_event_value):
    """
    Translate each list of statuses into the letters assigned by the dictionary

    Parameters:
    dictionary (dict): maps a status to a sequence whose first element is the
                       letter for that status (as built by create_dictionary)
    status_event_value (list): one list of statuses per issue

    Returns:
    path_taken_total (list): per issue, the list of letters of the path taken.

    Raises:
    KeyError: if a status is not present in the dictionary.
    """
    # iterate the argument (not a module-level variable) so the result always
    # reflects what the caller passed in
    path_taken_total = [[str(dictionary[event][0]) for event in events]
                        for events in status_event_value]

    return path_taken_total

In [16]:
def get_time_per_status(dictionary):
    """
    Extract the time it took to go from one status to another

    Reads the module-level ``issues`` and ``transitions`` frames. Transitions
    whose ``to_status`` is 'Closed' are ignored. For every issue a dict is
    built that maps the transition name (with " to " replaced by "_", e.g.
    "Open_Resolved") to its 'days_in_from_status' value (missing -> 0). If a
    transition occurs more than once for an issue, the last value wins.

    Parameters:
    dictionary (dict): dictionary containing the translation of status
                       (currently not used by this function)

    Returns:
    time_steps_list (list): one dict per issue with the time per transition
    """

    time_steps_list = []

    # remove where to_status equals 'Closed' (once, instead of once per issue)
    open_transitions = transitions.loc[transitions['to_status'] != 'Closed']

    for key in issues['key']:
        df = open_transitions.loc[open_transitions['key'] == key] # select relevant data

        # work on a new Series instead of writing into the filtered slice
        durations = df['days_in_from_status'].fillna(0)
        step_names = ['_'.join(value.split(" to ")) for value in df['transition']]
        time_steps_list.append(dict(zip(step_names, durations)))

    return time_steps_list

In [17]:
def create_avg_time():
    """
    Returns the avg time it took per transition, per priority and issue type

    Reads the module-level ``priority``, ``issue_type`` and ``transition``
    lists and the ``df_plus_steps`` frame. For every combination, the rows of
    that priority AND that issue type with a non-zero value in the transition
    column are selected, and the mean and standard deviation of that column
    are computed.

    Returns:
    avg_transition (dataframe): columns 'priority', 'issue_type', 'transition',
                                'avg' and 'std'; missing statistics are 0
    """
    avg = []
    for prio in priority:
        for issue in issue_type:
            # filter on both keys: the result is looked up by priority and
            # issue_type, so the statistics must be specific to both
            same_group = ((df_plus_steps['priority'] == prio) &
                          (df_plus_steps['issue_type'] == issue))
            for value in transition:
                # mask built from df_plus_steps itself rather than from
                # whatever frame happens to be bound to a global name
                durations = df_plus_steps.loc[same_group & (df_plus_steps[value] != 0), value]
                avg.append([prio, issue, value, durations.mean(), durations.std()])
    avg_transition = pd.DataFrame(avg, columns = ['priority', 'issue_type', 'transition', 'avg', 'std'])

    return avg_transition.fillna(0)

## Who worked on the assignment

In [18]:

# every distinct issue key, in order of first appearance
issue_key = issues['key'].unique().tolist()

# hard-coded user names mapped to the numbers 1..13 (position in this list)
most_assigned = {name: rank for rank, name in enumerate(
    ['cutting', 'thiru_mg', 'tomwhite', 'hammer', 'massie', 'rdblue',
     'dcreager', 'scott_carey', 'philip', 'busbey', 'sbanacho',
     'nielsbasjes', 'sacharya'],
    start=1)}

working_people = list(most_assigned)

In [20]:
# Per issue: the distinct users that appear in its transitions, translated to
# their number in `most_assigned` (as a string); users not in that mapping
# become '14'. The order within an issue follows set iteration order.
issue_workes = []
for key in issue_key:
    who_list = list(set(transitions.loc[transitions['key'] == key]['who_old']))
    issue_workes.append([str(most_assigned[person]) if person in working_people else str(14)
                         for person in who_list])


In [21]:
# One row per entry of issue_workes, spread over four worker columns; missing
# worker slots are filled with 0.
# NOTE(review): the four column names are fixed - presumably no issue in the data
# has more than four distinct users; verify, otherwise the constructor will fail.
worked_on_project = pd.DataFrame(issue_workes, columns = ['worker_1', 'worker_2', 'worker_3', 'worker_4'] ).fillna(0)
# column-wise concat aligns on the index
# NOTE(review): assumes issues has one row per unique key, in the same order - confirm
issues = pd.concat([issues, worked_on_project], axis=1, sort=False)


#### Data Transformation

In [22]:
# convert the three timestamp columns of transitions to datetimes
for timestamp_column in ('when', 'updated', 'created'):
    transitions[timestamp_column] = change_datetype(transitions, timestamp_column)

#### Feature Generation

In [23]:
# fill na values in from_status with Non-existent, so every status is a string
# before get_event_info joins them
transitions['from_status'] = transitions['from_status'].fillna('Non-existent')
# per issue: number of statuses, '-'-joined statuses, status list, time per transition
steps, steps_taken, status_event, time = get_event_info() # get information about each event

issues['number_of_steps'] = steps # adding steps to issues dataframe
issues['steps_taken'] = steps_taken  # adding steps taken to issues dataframe

# one column per transition position (0, 1, 2, ...); issues with fewer transitions get 0
time_needed_df = pd.DataFrame.from_records(time).fillna(0) # put each time slot in an individual column
issues = pd.concat([issues, time_needed_df], axis=1, sort=False) # add time slot data to issues


In [24]:
# Combine the statuses an issue already went through with the predicted ones,
# giving a complete path per issue.
issues['status_event'] = status_event

predicted_column = issues['Predicted_actions']
taken_column = issues['status_event']

# a prediction of 0 means the issue had nothing to predict: keep its own path;
# otherwise append the predicted statuses to the ones already taken
combination_list = [
    taken_column[key] + predicted_column[key] if predicted_column[key] != 0 else taken_column[key]
    for key in range(len(issues))
]

# add combination to dataframe
issues['steps_taken_combined'] = combination_list

In [25]:
# status -> [letter, number] translation
keys_dictionary = create_dictionary()

# letters of the path per issue
path_taken_list = path_taken(keys_dictionary, issues['steps_taken_combined'].tolist())
# one column per position in the combined path (step0 .. step6)
df_01 = pd.DataFrame(issues['steps_taken_combined'].tolist(),
                     columns = ['step' + str(position) for position in range(7)])

# issues plus the step columns; number_of_steps now counts the combined path
result = pd.concat([issues, df_01], axis=1, sort=False)
result['number_of_steps'] = [len(path) for path in result['steps_taken_combined']]

In [26]:
# One indicator column per status: 1 when the status occurs in the issue's
# combined path, 0 otherwise.
one_hot = MultiLabelBinarizer() # initialize MultiLabelBinarizer object
test = one_hot.fit_transform(list(issues['steps_taken_combined'])) # one-hot encode data
df = pd.DataFrame(test, columns=one_hot.classes_) # name each column after its status

df_plus_status = pd.concat([result, df], axis=1, sort=False) # add the indicator columns to result

In [27]:
# one column per transition name holding the time spent in it; transitions an
# issue never made are 0
dictionary_status = create_dictionary()
time_taken_per_step = get_time_per_status(dictionary_status)
df = pd.DataFrame(time_taken_per_step).fillna(0)
df_plus_steps = pd.concat([df_plus_status, df], axis=1, sort=False) # add the time columns

In [28]:
priority = list(result['priority'].unique()) # unique values for priority
issue_type = list(result['issue_type'].unique()) # unique values for issue_type

# all possible combinations in transition
transition = ['Open_Patch Available','Patch Available_In Progress', 'Open_Resolved',
              'Patch Available_Resolved', 'Resolved_Reopened', 'Reopened_Resolved',
              'In Progress_Patch Available', 'Patch Available_Open', 'Reopened_Patch Available',
              'Open_In Progress', 'In Progress_Resolved', 'In Progress_Open']


# get avg time needed per transition
avg_transition = create_avg_time()

# For every issue, look up the statistics that belong to its priority and
# issue_type and turn them into one wide row (one column per transition).
# The rows are collected in lists and concatenated once: DataFrame.append no
# longer exists in pandas 2.x and growing a frame row by row is quadratic.
avg_rows = []
std_rows = []
for i in range(len(df_plus_steps)):
    matches = avg_transition.loc[(avg_transition['priority'] == df_plus_steps['priority'][i]) &
                                 (avg_transition['issue_type'] == df_plus_steps['issue_type'][i])]
    avg_rows.append(matches[['avg', 'transition']].set_index('transition').T)
    # the std frame must be built from the 'std' column (it used to copy 'avg')
    std_rows.append(matches[['std', 'transition']].set_index('transition').T)

# NOTE: both frames use the transition names as column names
dataset_avg = pd.concat(avg_rows) if avg_rows else pd.DataFrame()
dataset_std = pd.concat(std_rows) if std_rows else pd.DataFrame()
    

In [29]:
# reset_index() without drop=True keeps the old index as a new 'index' column
# in each frame
dataset_std = dataset_std.reset_index() # reset index
dataset_avg = dataset_avg.reset_index() # reset index

# NOTE(review): dataset_avg and dataset_std use the same transition names as
# column names, so the combined frame contains repeated column names - confirm
# that downstream consumers expect this.
df_plus_avg = pd.concat([df_plus_steps, dataset_avg], axis=1, sort=False) # add the avg columns
df_plus_std = pd.concat([df_plus_avg, dataset_std], axis=1, sort=False) # add the std columns


## Daycounts dataset

#### Feature Creation

In [30]:
# change date type: both columns go through change_datetype with its default
# hours=True, i.e. the time of day is kept
daycounts['day'] = change_datetype(daycounts, 'day')
issues['created_date_T'] = change_datetype(issues, 'created_date')

In [31]:
# get the number of projects that are in progress at time of creation new issue
# only look at the project inprogress or patch_available as well?
# NOTE(review): the lookup requires daycounts['day'] to equal the creation
# timestamp exactly, including the time of day - confirm this is intended.
current_projects = []

# the "In Progress" rows are the same for every issue, so select them once
in_progress_counts = daycounts.loc[daycounts['status'] == 'In Progress']

for issue in range(len(issues)):
    start = issues['created_date_T'][issue]
    counts_at_start = list(in_progress_counts.loc[in_progress_counts['day'] == start]['count'])
    # if the date isn't available add 0 projects in progress; this is checked
    # explicitly instead of catching every exception, so real errors (e.g. a
    # missing column) are no longer silently turned into 0
    current_projects.append(counts_at_start[0] if counts_at_start else 0)


In [32]:
df_plus_std['projects_in_progress'] = current_projects # projects in progress at start of issue
# store the combined path as one '_'-joined string instead of a list
df_plus_std['steps_taken_combined'] = ['_'.join(i) for i in df_plus_std['steps_taken_combined']]

# Fill NaN Values

In [33]:
# a missing description_length is treated as length 0
# (presumably these issues have no description - verify against the raw data)
df_plus_std['description_length'] = df_plus_std['description_length'].fillna(0)

# Store final dataset

In [34]:
# write the final feature table to disk; the index is written as the first
# column because index=False is not passed
df_plus_std.to_csv('../data/transformed/pipeline_data.csv')