In [1]:
import pandas as pd
import pdpipe as pdp
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder

# Load data

In [2]:
# Read the transformed issue data and order the rows chronologically by their
# creation date parts (year, then month, then day), oldest first.
df_for_prediction = (
    pd.read_csv("../data/transformed/pipeline_data.csv")
    .sort_values(by=['createdYear', 'createdMonth', 'createdDay'], ascending=True)
)

# Creating Pipeline

In [3]:
# Columns removed by the pipeline's ColDrop stage (order kept as originally listed).
columns_to_drop = [
    # identifiers / bookkeeping fields
    'Unnamed: 0', 'project', 'status', 'updated', 'key', 'assignee', 'resolution',
    'days_in_current_status', 'reporter', 'steps_taken', 'time_needed', 'created_date',
    'Non-existent_Open',
    # columns literally named '0' .. '10'
    *[str(number) for number in range(11)],
    'days_needed', 'Predicted_actions', 'status_event',
    # status-transition columns
    'Open_Patch Available', 'Patch Available_In Progress',
    'Open_Resolved', 'Patch Available_Resolved',
    'Resolved_Reopened', 'Reopened_Resolved',
    'In Progress_Patch Available', 'Patch Available_Open', 'Reopened_Patch Available',
    'Open_In Progress', 'In Progress_Resolved',
    'index.1', 'createdElapsed', 'step0',
]

# Numeric columns passed to the RobustScaler stage.
columns_scaling = ['comment_count', 'description_length', 'summary_length', 'watch_count']

# Step columns ('step1' .. 'step6') whose status names are mapped to integer codes.
columns_to_change = [f'step{position}' for position in range(1, 7)]

# Columns that are one-hot encoded.
columns_onehotencoding = ['steps_taken_combined']

In [4]:
# Build the preprocessing pipeline stage by stage; stages run in the order added.
pipeline = pdp.ColDrop(columns_to_drop)  # drop columns
pipeline += pdp.Scale('RobustScaler', columns_scaling)  # scale data -> robust scaler, handles outliers

# Map priority names to integer codes.
# 'Trival' is kept so any data that really contains that spelling maps as before;
# 'Trivial' is added because a value missing from the mapping would become NaN.
# NOTE(review): the codes are not ordered by severity — confirm this is intended.
pipeline += pdp.MapColVals('priority', {'Minor': 0,
                                        'Trival': 1,
                                        'Trivial': 1,
                                        'Blocker': 2,
                                        'Major': 3,
                                        'Critical': 4})

# Map issue type names to integer codes.
pipeline += pdp.MapColVals('issue_type', {'Test': 0,
                                          'Bug': 1,
                                          'Improvement': 2,
                                          'Task': 3,
                                          'Wish': 4,
                                          'New Feature': 5,
                                          'Sub-task': 6})

# Map the status name stored in each step column to an integer code.
# The previous literal listed 'Reopened' twice (4, then 6); Python keeps the last
# value for a repeated key, so 6 was always the effective code. The dead entry is
# removed, which leaves code 4 unused.
# Integer 0 is mapped in addition to the string '0' because fillna(0) is applied
# to the frame before it reaches the pipeline, so missing steps may arrive as the
# integer rather than the string.
pipeline += pdp.MapColVals(columns_to_change, {'Non-existent': 0,
                                               'Open': 1,
                                               'Resolved': 2,
                                               'Patch Available': 3,
                                               'In Progress': 5,
                                               'Reopened': 6,
                                               '0': 0,
                                               0: 0})

# One-hot encode the combined step sequence.
pipeline += pdp.OneHotEncode(columns_onehotencoding)

# Use Pipeline
Pass training and test data separately through the pipeline, so that no data leakage happens.

In [6]:
# Replace every missing value with 0; the filled frame is the prediction (test) data.
df_for_prediction = df_for_prediction.fillna(0)

# Training data: rows whose 'resolutiondate' is not 0 after the fill,
# i.e. rows that had a resolution date.
has_resolution_date = df_for_prediction['resolutiondate'].ne(0)
df_for_train = df_for_prediction.loc[has_resolution_date]

In [5]:
# Fit the pipeline on the training rows only, then apply the fitted pipeline to
# the prediction data.
# Calling `pipeline(df)` fits the pipeline on first use and merely transforms on
# later calls, so the previous order (prediction frame first) fitted the scaler
# and one-hot categories on the full data set and reused them for the training
# frame. Using fit_transform / transform explicitly keeps the fitted statistics
# limited to the training rows.
df_for_train = pipeline.fit_transform(df_for_train)  # learn scaling/encoding from training data
df_for_prediction = pipeline.transform(df_for_prediction)  # reuse the fitted stages, no refit

In [8]:
# Persist both pipelined frames as CSV: prediction data first, then training data.
for pipelined_frame, csv_path in (
    (df_for_prediction, '../data/transformed/prediction_pipelined_data.csv'),
    (df_for_train, '../data/transformed/train_pipeline_data.csv'),
):
    pipelined_frame.to_csv(csv_path)