In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_validate,cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics, svm
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import seaborn
from matplotlib import pyplot
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN
from sklearn.feature_selection import chi2, SelectKBest, SelectFdr
from sklearn.decomposition import PCA
from rake_nltk import Rake

### Preprocessing

Import data as data frame

In [2]:
# Read the exported problem records from the Excel workbook into a DataFrame
# and display it.
problemfile = 'problems_2019-03-21.xlsx'
df = pd.read_excel(io=problemfile)
df

Unnamed: 0,Problem Manager,Number,Active,Activity due,Additional assignee list,Approval,Approval history,Approval set,Assigned to,Assignment group,...,Work notes,Work notes list,Workaround,Details,IT WAR Walk On,Quality Improvement Project,Director,Follow Up,Send Email,VP
0,Naveen Kumar,PRB0062159,True,,,Not Yet Requested,,,,FSC_ITL3O2STIBCO,...,,,,,NaT,NaT,,,,
1,Mark Duncan,PRB0060146,False,,,Not Yet Requested,,,V Sox,FXF_SPT_US_FIELDLINEHAUL,...,2018-08-22 05:40:24 - Eli Smith (Work notes)\n...,,2018-06-28 19:56:05 - D'Zundra Green (Workarou...,,NaT,NaT,,,,
2,Naveen Kumar,PRB0060758,False,,,Not Yet Requested,,,,FSC_ITL3O2STIBCO,...,,,,,NaT,NaT,,,,
3,Christophe Gurley,PRB0060985,False,,,Not Yet Requested,,,Scott Dubak,FXO_SA_US_WindowsServer,...,2018-08-12 14:49:21 - Christophe Gurley (Work ...,,2018-07-15 18:14:43 - Ashish Bisht (Workaround...,,NaT,NaT,,,,
4,Michael Kennemer,PRB0060147,False,,,Not Yet Requested,,,Robert Bumpus,FXS_FIREWALL,...,2018-07-27 15:09:39 - Michael Kennemer (Work n...,,2018-06-29 07:24:25 - Brad Moore (Workaround)\...,,NaT,NaT,,,,
5,Michael Kennemer,PRB0062412,True,,,Not Yet Requested,,,Mahesh Pillutla,FXF_SPT_US_FreightDataSvcs,...,,,2019-01-06 12:00:08 - Stephen Barch (Workaroun...,Connection pooling errors,NaT,NaT,Rita Moore,,,Ann Higgins
6,Christopher Barber,PRB0063553,False,,,Not Yet Requested,,,,FXS_EIS_GLB_ProblemMgmt,...,,,,,NaT,NaT,,,,
7,Terri Hamilton,PRB0060194,False,,,Not Yet Requested,,,William Redmond,FXS_SPT_US_DOTCOMSHIPPING,...,2018-09-13 09:57:53 - Michael Kennemer (Work n...,,2018-07-06 20:12:25 - Sean Green (Workaround)\...,,NaT,NaT,,,,
8,Eli Smith,PRB0061073,False,,,Not Yet Requested,,,Douglas Clinger,FXF_SPT_SEFS,...,2018-07-26 03:00:28 - Eli Smith (Work notes)\n...,,2018-07-24 03:21:28 - Eli Smith (Workaround)\n...,,NaT,NaT,,,,
9,Eli Smith,PRB0061373,False,,,Not Yet Requested,,,Matthew Schwab,FXS_SPT_GLB_SharePoint,...,2018-12-11 14:41:16 - Randall Painter (Work no...,,2018-09-15 07:26:49 - Divakar Durgapal (Workar...,,NaT,NaT,,,,


Drop features that have all NA values

Drop features that have more than 10% NA values (can modify as needed)

Drop features that have all the same value

'Active', 'Known error', 'Problem state', and 'State' were all dropped as they did not seem to add meaningful information

'Category', 'Company', and 'Parent' are added back from the original data frame; 'Parent' is used later to create a meaningful feature

In [3]:
# A column is kept only if at least this fraction of its rows is non-null.
MIN_NON_NULL_FRACTION = 0.90

# 1) Drop columns that are entirely NA, then 2) drop columns that do not reach
#    the non-null threshold. Each filter is applied to the result of the
#    previous one so no step is silently discarded.
min_non_null = int(np.ceil(df.shape[0] * MIN_NON_NULL_FRACTION))
data = df.dropna(axis=1, how='all')
data = data.dropna(axis=1, thresh=min_non_null)

# 3) Drop columns holding fewer than two distinct non-null values. The list is
#    built first so the frame is not modified while it is being iterated.
constant_cols = [col for col in data.columns if data[col].nunique() < 2]
data = data.drop(columns=constant_cols)

# 4) Drop the state-like columns that were judged not to add information.
data = data.drop(columns=['Active', 'Known error', 'Problem state', 'State'])

# 5) Re-attach columns from the original frame that are needed later on,
#    whether or not the filters above removed them.
for col in ['Category', 'Company', 'Parent']:
    data[col] = df[col]

data.to_csv('problem_pdsm_simple.csv', index=False)

### Extract keywords from descriptions in full dataset
Adds a 'Keywords - Short Desc' feature (keywords extracted from 'Short description') to the above dataset

In [4]:
def extract_keywords(text):
    """Return up to four top-ranked RAKE phrases extracted from ``text``.

    Parameters
    ----------
    text : str
        Free text to extract key phrases from. Any non-string value (for
        example the float NaN pandas uses for a missing cell) yields an
        empty list instead of raising inside the tokenizer.

    Returns
    -------
    list
        The first four entries of ``Rake.get_ranked_phrases()`` (fewer if
        fewer phrases were found).
    """
    if not isinstance(text, str):
        return []
    r = Rake(min_length=2, max_length=8)
    r.extract_keywords_from_text(text)
    return r.get_ranked_phrases()[0:4]


data['Keywords - Short Desc'] = data['Short description'].apply(extract_keywords)
# data['Keywords - Desc'] = data['Description'].apply(extract_keywords)

#### Load incidents dataset

In [5]:
# Read the exported incident records from the Excel workbook and preview the
# first rows.
incidentfile = 'incidents_2019-03-21.xlsx'
incidents = pd.read_excel(io=incidentfile)
incidents.head()

Unnamed: 0,Number,Problem,Affected Company,Business service,ALERTING (MIN),Awareness (Min),Acknowledge (Min),Opened,Created,Closed,...,User Location,User Network,User input,VZ Correlation ID,Vendor Exception,Vendor Incident Number,Vendor Name,Watch list,Work notes,Work notes list
0,INC010068680,PRB0060003,FedEx Express,Communication Device,,,,2018-06-17 03:59:28,2018-06-28 00:53:59,2018-06-28 00:58:10,...,,,,,,,,,,
1,INC010000197,PRB0060003,FedEx Express,Communication Device,0.0,66.0,66.0,2018-06-17 03:59:28,2018-06-17 04:04:04,2018-08-04 14:00:48,...,,,,,,,,,2018-08-02 13:32:12 - Stephen Barch (Work note...,
2,INC010003952,PRB0060019,FedEx Services,Scanner,0.0,1.0,1.0,2018-06-18 08:24:50,2018-06-18 08:41:13,2018-06-20 14:00:42,...,,,,,,,,,2018-06-18 09:23:54 - Brad Moore (Work notes)\...,
3,INC010269074,PRB0060026,FedEx Services,Database,,,,2018-06-18 15:40:15,2018-07-31 16:55:15,2018-08-04 15:00:23,...,,,,,,,,,,
4,INC010269078,PRB0060026,FedEx Services,Database,,,,2018-06-18 15:40:15,2018-07-31 16:55:19,2018-08-04 15:00:38,...,,,,,,,,,,


### Add assignment group from incidents to problems dataset

In [None]:
# assign_groups = incidents[['Problem', 'Assignment group']]
# assign_groups = assign_groups.rename(columns={'Problem':'Number'}).drop_duplicates('Number')
# # assign_groups.head()
# data = data.merge(assign_groups, how='left', on='Number')
# data.head()

### Get avg, min, and max duration of related incidents for each problem

In [6]:
# Collect one 'Duration' row per related incident, keyed by the problem number
# the incident points to ('Problem' is renamed to 'Number' to match `data`).
durations = incidents[['Problem', 'Duration']].rename(columns={'Problem': 'Number'})

# The problem's own duration is stacked underneath the incident durations.
durations2 = data[['Number', 'Duration']]

# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; both stack the rows and keep the original index.
durations = pd.concat([durations, durations2], sort=False)

# Order by problem number and renumber the rows from 0.
durations = durations.sort_values(by='Number', axis=0).reset_index(drop=True)
durations.head()

Unnamed: 0,Number,Duration
0,PRB0060003,939522
1,PRB0060003,691713
2,PRB0060003,4009074
3,PRB0060004,0
4,PRB0060006,590877


In [7]:
# Aggregate the durations once per problem number, then broadcast each
# statistic back onto every row of that problem.
per_problem = durations.groupby('Number')['Duration'].agg(['max', 'mean', 'min'])
durations['Duration Max'] = durations['Number'].map(per_problem['max'])
durations['Duration Mean'] = durations['Number'].map(per_problem['mean'])
durations['Duration Min'] = durations['Number'].map(per_problem['min'])
durations['Duration Range'] = durations['Duration Max'] - durations['Duration Min']

# Keep a single summary row per problem; the raw per-row duration is no longer needed.
durations = durations.drop_duplicates(subset='Number').drop(columns='Duration')
durations.head()

Unnamed: 0,Number,Duration Max,Duration Mean,Duration Min,Duration Range
0,PRB0060003,4009074,1880103.0,691713,3317361
3,PRB0060004,0,0.0,0,0
4,PRB0060006,590877,590877.0,590877,0
5,PRB0060013,0,0.0,0,0
6,PRB0060014,16338265,16338265.0,16338265,0


In [8]:
# Left-join the per-problem duration statistics onto the problems and discard
# the problem's own raw 'Duration' column in the same step.
data = pd.merge(data, durations, how='left', on='Number').drop(columns='Duration')
data.head()

Unnamed: 0,Problem Manager,Number,Assignment group,Business duration,Business service,Created,Created by,Description,Impact,Impacted OpCos,...,Updates,Urgency,Category,Company,Parent,Keywords - Short Desc,Duration Max,Duration Mean,Duration Min,Duration Range
0,Naveen Kumar,PRB0062159,FSC_ITL3O2STIBCO,0,Computer,2018-12-03 12:44:49,795580,Description: Critical O-X---- 10:24:24 11/29/...,3 - Low,FedEx Supply Chain,...,19,3 - Low,Application,,,"[validation error occurred, alert message comi...",622772,39783.625,0,622772
1,Mark Duncan,PRB0060146,FXF_SPT_US_FIELDLINEHAUL,1084035,Computer,2018-06-28 19:56:05,3667964,<EXECUTIVE NOTIFY: FEDEX-SEV3>\n06/28/18 16:07...,3 - Low,FedEx Freight Corporate,...,32,3 - Low,Application,FedEx Services,INC010074611,[multiple freight locations experienced issues...,4636059,545947.0,7717,4628342
2,Naveen Kumar,PRB0060758,FSC_ITL3O2STIBCO,0,Computer,2018-07-13 15:17:09,admin5069733,Issue : L2 support is receiving alerts which i...,3 - Low,FedEx Supply Chain,...,33,3 - Low,Application,,INC010530920,"[call receiveorder api, tibco alert, parsing e...",2390,726.727273,0,2390
3,Christophe Gurley,PRB0060985,FXO_SA_US_WindowsServer,576000,Server,2018-07-15 18:14:43,5305519,FedEx Office center users at multiple centers ...,3 - Low,FedEx Office,...,24,3 - Low,Application,FedEx Services,INC010238011,"[fedex office center users, multiple centers]",2406878,435512.6,3373,2403505
4,Michael Kennemer,PRB0060147,FXS_FIREWALL,562179,Network,2018-06-29 07:15:36,483039,"Retail Phones, Payment switch and corporate pa...",3 - Low,FedEx Office,...,27,3 - Low,Application,FedEx Services,INC010073511,"[corporate payroll users impacted, retail phon...",2197479,505680.571429,0,2197479


### Find similar keywords within related incidents

## Clean the data

Find out default types for the columns

In [None]:
# Show the dtype pandas inferred for every column.
data.dtypes

### Number column

Remove the PRB prefix from the Number column and convert it to a number

In [9]:
# Remove the literal 'PRB' prefix and convert the remainder to a number.
# An anchored regex is used instead of str.lstrip('PRB'): lstrip treats its
# argument as a set of characters, not as a prefix. Casting to str first lets
# the cell be re-run after the column has already been converted.
data['Number'] = pd.to_numeric(
    data['Number'].astype(str).str.replace(r'^PRB', '', regex=True)
)
# Preview only the first rows rather than rendering the whole frame.
data.head()

Unnamed: 0,Problem Manager,Number,Assignment group,Business duration,Business service,Created,Created by,Description,Impact,Impacted OpCos,...,Updates,Urgency,Category,Company,Parent,Keywords - Short Desc,Duration Max,Duration Mean,Duration Min,Duration Range
0,Naveen Kumar,62159,FSC_ITL3O2STIBCO,0,Computer,2018-12-03 12:44:49,795580,Description: Critical O-X---- 10:24:24 11/29/...,3 - Low,FedEx Supply Chain,...,19,3 - Low,Application,,,"[validation error occurred, alert message comi...",622772,3.978362e+04,0,622772
1,Mark Duncan,60146,FXF_SPT_US_FIELDLINEHAUL,1084035,Computer,2018-06-28 19:56:05,3667964,<EXECUTIVE NOTIFY: FEDEX-SEV3>\n06/28/18 16:07...,3 - Low,FedEx Freight Corporate,...,32,3 - Low,Application,FedEx Services,INC010074611,[multiple freight locations experienced issues...,4636059,5.459470e+05,7717,4628342
2,Naveen Kumar,60758,FSC_ITL3O2STIBCO,0,Computer,2018-07-13 15:17:09,admin5069733,Issue : L2 support is receiving alerts which i...,3 - Low,FedEx Supply Chain,...,33,3 - Low,Application,,INC010530920,"[call receiveorder api, tibco alert, parsing e...",2390,7.267273e+02,0,2390
3,Christophe Gurley,60985,FXO_SA_US_WindowsServer,576000,Server,2018-07-15 18:14:43,5305519,FedEx Office center users at multiple centers ...,3 - Low,FedEx Office,...,24,3 - Low,Application,FedEx Services,INC010238011,"[fedex office center users, multiple centers]",2406878,4.355126e+05,3373,2403505
4,Michael Kennemer,60147,FXS_FIREWALL,562179,Network,2018-06-29 07:15:36,483039,"Retail Phones, Payment switch and corporate pa...",3 - Low,FedEx Office,...,27,3 - Low,Application,FedEx Services,INC010073511,"[corporate payroll users impacted, retail phon...",2197479,5.056806e+05,0,2197479
5,Michael Kennemer,62412,FXF_SPT_US_FreightDataSvcs,0,Server,2019-01-06 12:00:08,973921,Freight Rating\nMax Severity SEV3 / P5\nCurren...,2 - Medium,FedEx Freight Corporate,...,52,1 - High,Infrastructure,FedEx Services,INC011069742,"[freight rating experienced time outs due, dat...",555479,1.513453e+05,0,555479
6,Christopher Barber,63553,FXS_EIS_GLB_ProblemMgmt,0,Server,2019-03-11 14:45:26,5305519,CAD issue being reported in Lakeland location....,2 - Medium,FedEx Services,...,4,2 - Medium,Hardware,FedEx Services,INC011366875,[local site issue],239702,4.755157e+04,0,239702
7,Terri Hamilton,60194,FXS_SPT_US_DOTCOMSHIPPING,1382400,Server,2018-07-06 20:12:25,828618,<EXECUTIVE NOTIFY: FEDEX-SEV3>\n07/06/18 13:54...,1 - High,"FedEx Services, FedEx Express",...,35,2 - Medium,Application,FedEx Services,INC010122859,[com customers experienced long response times...,5924728,1.220817e+06,22229,5902499
8,Eli Smith,61073,FXF_SPT_SEFS,69296,Server,2018-07-23 15:45:04,5305519,Increasing amount of pending data on one SEFS...,3 - Low,FedEx Freight Corporate,...,13,3 - Low,Application,FedEx Services,INC010218354,"[pending data due, freight jms queue, large am...",213324,5.509300e+04,528,212796
9,Eli Smith,61373,FXS_SPT_GLB_SharePoint,0,Server,2018-09-15 07:26:49,5163721,Purple Hub page throwing HTTP500 error\n\nInte...,3 - Low,FedEx Services,...,12,3 - Low,Application,FedEx Services,INC010514644,"[purple hub due, issues accessing, internal us...",72743,1.632500e+04,718,72025


In [None]:
# Re-check the column dtypes after converting 'Number'.
data.dtypes

### Priority, Impact, and Urgency

Take the first character (number) from each and convert to numeric

In [10]:
# 'Priority', 'Impact' and 'Urgency' hold labels such as '3 - Low'; keep only
# the leading number of each label as a numeric value.
# Casting to str first lets the cell be re-run on already-converted columns,
# and labels without a leading number (e.g. missing values) become NaN instead
# of raising.
for col in ['Priority', 'Impact', 'Urgency']:
    leading_number = data[col].astype(str).str.extract(r'^\s*(\d+)', expand=False)
    data[col] = pd.to_numeric(leading_number)

data.head()

Unnamed: 0,Problem Manager,Number,Assignment group,Business duration,Business service,Created,Created by,Description,Impact,Impacted OpCos,...,Updates,Urgency,Category,Company,Parent,Keywords - Short Desc,Duration Max,Duration Mean,Duration Min,Duration Range
0,Naveen Kumar,62159,FSC_ITL3O2STIBCO,0,Computer,2018-12-03 12:44:49,795580,Description: Critical O-X---- 10:24:24 11/29/...,3,FedEx Supply Chain,...,19,3,Application,,,"[validation error occurred, alert message comi...",622772,39783.625,0,622772
1,Mark Duncan,60146,FXF_SPT_US_FIELDLINEHAUL,1084035,Computer,2018-06-28 19:56:05,3667964,<EXECUTIVE NOTIFY: FEDEX-SEV3>\n06/28/18 16:07...,3,FedEx Freight Corporate,...,32,3,Application,FedEx Services,INC010074611,[multiple freight locations experienced issues...,4636059,545947.0,7717,4628342
2,Naveen Kumar,60758,FSC_ITL3O2STIBCO,0,Computer,2018-07-13 15:17:09,admin5069733,Issue : L2 support is receiving alerts which i...,3,FedEx Supply Chain,...,33,3,Application,,INC010530920,"[call receiveorder api, tibco alert, parsing e...",2390,726.727273,0,2390
3,Christophe Gurley,60985,FXO_SA_US_WindowsServer,576000,Server,2018-07-15 18:14:43,5305519,FedEx Office center users at multiple centers ...,3,FedEx Office,...,24,3,Application,FedEx Services,INC010238011,"[fedex office center users, multiple centers]",2406878,435512.6,3373,2403505
4,Michael Kennemer,60147,FXS_FIREWALL,562179,Network,2018-06-29 07:15:36,483039,"Retail Phones, Payment switch and corporate pa...",3,FedEx Office,...,27,3,Application,FedEx Services,INC010073511,"[corporate payroll users impacted, retail phon...",2197479,505680.571429,0,2197479


In [None]:
# Re-check the column dtypes after converting Priority, Impact and Urgency.
data.dtypes

### Categorical Fields

Check out category counts

In [None]:
# Number of rows per distinct 'Assignment group' value.
data['Assignment group'].value_counts()

In [None]:
# Number of rows per distinct 'Business service' value.
data['Business service'].value_counts()

In [None]:
# data['Category'].value_counts()

In [None]:
# data['Company'].value_counts()

In [None]:
# data['Problem Manager'].value_counts().nlargest(9)

In [None]:
# data['Type'].value_counts()

Cut categories with low counts and add together to make an "Other" category

In [11]:
def lump_rare_categories(column, min_count, other='Other'):
    """Replace values that occur fewer than ``min_count`` times with ``other``.

    Parameters
    ----------
    column : pandas.Series
        Categorical values to bucket.
    min_count : int
        Values whose frequency in ``column`` is below this are replaced.
    other : str, default 'Other'
        Replacement label for the rare values.

    Returns
    -------
    pandas.Series
        A new series; missing values are left untouched because their mapped
        frequency is NaN and never compares below ``min_count``.
    """
    frequency = column.map(column.value_counts())
    return column.mask(frequency < min_count, other)


# Minimum number of rows a value needs in order to keep its own category.
MIN_CATEGORY_COUNTS = {
    'Business service': 25,
    'Company': 11,
    'Problem Manager': 40,
    'Type': 11,
    'Assignment group': 20,
    'Opened by': 11,
}

for col, min_count in MIN_CATEGORY_COUNTS.items():
    # Only columns present in the frame are touched; absent ones are skipped.
    if col in data.columns:
        data[col] = lump_rare_categories(data[col], min_count)
data.head()

Unnamed: 0,Problem Manager,Number,Assignment group,Business duration,Business service,Created,Created by,Description,Impact,Impacted OpCos,...,Updates,Urgency,Category,Company,Parent,Keywords - Short Desc,Duration Max,Duration Mean,Duration Min,Duration Range
0,Naveen Kumar,62159,FSC_ITL3O2STIBCO,0,Computer,2018-12-03 12:44:49,795580,Description: Critical O-X---- 10:24:24 11/29/...,3,FedEx Supply Chain,...,19,3,Application,,,"[validation error occurred, alert message comi...",622772,39783.625,0,622772
1,Mark Duncan,60146,Other,1084035,Computer,2018-06-28 19:56:05,3667964,<EXECUTIVE NOTIFY: FEDEX-SEV3>\n06/28/18 16:07...,3,FedEx Freight Corporate,...,32,3,Application,FedEx Services,INC010074611,[multiple freight locations experienced issues...,4636059,545947.0,7717,4628342
2,Naveen Kumar,60758,FSC_ITL3O2STIBCO,0,Computer,2018-07-13 15:17:09,admin5069733,Issue : L2 support is receiving alerts which i...,3,FedEx Supply Chain,...,33,3,Application,,INC010530920,"[call receiveorder api, tibco alert, parsing e...",2390,726.727273,0,2390
3,Christophe Gurley,60985,Other,576000,Server,2018-07-15 18:14:43,5305519,FedEx Office center users at multiple centers ...,3,FedEx Office,...,24,3,Application,FedEx Services,INC010238011,"[fedex office center users, multiple centers]",2406878,435512.6,3373,2403505
4,Michael Kennemer,60147,Other,562179,Network,2018-06-29 07:15:36,483039,"Retail Phones, Payment switch and corporate pa...",3,FedEx Office,...,27,3,Application,FedEx Services,INC010073511,"[corporate payroll users impacted, retail phon...",2197479,505680.571429,0,2197479


Fill NAs with 'Other'

In [12]:
# Missing values in the categorical columns are folded into the 'Other' bucket.
fill_cols = [
    'Assignment group',
    'Business service',
    'Category',
    'Company',
    'Opened by',
    'Problem Manager',
    'Type',
]
for col in fill_cols:
    data[col] = data[col].fillna('Other')

In [13]:
# Number of rows per 'Problem Manager' after bucketing and NA filling.
data['Problem Manager'].value_counts()

Other                  454
Christopher Barber     201
Eli Smith              126
Naveen Kumar           115
Daniel Smith           110
Matthew Gonderinger     98
Christophe Gurley       95
Brad Moore              86
Mark Duncan             84
Christina Hanlin        74
Michael Kennemer        70
Terri Hamilton          67
Dawn Gallo              65
Randall Painter         64
Greg Malek              62
Michael Olton           44
Stephen Wolff           42
Name: Problem Manager, dtype: int64

### Created

Convert the Created column to a datetime type

In [14]:
# Cast 'Created' to a nanosecond-resolution datetime column.
data = data.astype({'Created': 'datetime64[ns]'})
data.head()

Unnamed: 0,Problem Manager,Number,Assignment group,Business duration,Business service,Created,Created by,Description,Impact,Impacted OpCos,...,Updates,Urgency,Category,Company,Parent,Keywords - Short Desc,Duration Max,Duration Mean,Duration Min,Duration Range
0,Naveen Kumar,62159,FSC_ITL3O2STIBCO,0,Computer,2018-12-03 12:44:49,795580,Description: Critical O-X---- 10:24:24 11/29/...,3,FedEx Supply Chain,...,19,3,Application,Other,,"[validation error occurred, alert message comi...",622772,39783.625,0,622772
1,Mark Duncan,60146,Other,1084035,Computer,2018-06-28 19:56:05,3667964,<EXECUTIVE NOTIFY: FEDEX-SEV3>\n06/28/18 16:07...,3,FedEx Freight Corporate,...,32,3,Application,FedEx Services,INC010074611,[multiple freight locations experienced issues...,4636059,545947.0,7717,4628342
2,Naveen Kumar,60758,FSC_ITL3O2STIBCO,0,Computer,2018-07-13 15:17:09,admin5069733,Issue : L2 support is receiving alerts which i...,3,FedEx Supply Chain,...,33,3,Application,Other,INC010530920,"[call receiveorder api, tibco alert, parsing e...",2390,726.727273,0,2390
3,Christophe Gurley,60985,Other,576000,Server,2018-07-15 18:14:43,5305519,FedEx Office center users at multiple centers ...,3,FedEx Office,...,24,3,Application,FedEx Services,INC010238011,"[fedex office center users, multiple centers]",2406878,435512.6,3373,2403505
4,Michael Kennemer,60147,Other,562179,Network,2018-06-29 07:15:36,483039,"Retail Phones, Payment switch and corporate pa...",3,FedEx Office,...,27,3,Application,FedEx Services,INC010073511,"[corporate payroll users impacted, retail phon...",2197479,505680.571429,0,2197479


### Created by

Looks like there may be some that have "admin" before the id number. I am just going to remove this prefix.

In [15]:
# Remove a literal leading 'admin' from the creator id. An anchored regex is
# used instead of str.lstrip('admin'): lstrip strips any leading run of the
# characters a/d/m/i/n, which would also eat into ids that merely start with
# those letters.
created_by = data['Created by'].str.replace(r'^admin', '', regex=True)

# Prefix with 'ID' so the values are clearly labels rather than numbers.
created_by = 'ID' + created_by

# Creators with fewer than 50 rows are grouped into a single 'Other' bucket.
creator_frequency = created_by.map(created_by.value_counts())
data['Created by'] = created_by.mask(creator_frequency < 50, 'Other')
data.head()

Unnamed: 0,Problem Manager,Number,Assignment group,Business duration,Business service,Created,Created by,Description,Impact,Impacted OpCos,...,Updates,Urgency,Category,Company,Parent,Keywords - Short Desc,Duration Max,Duration Mean,Duration Min,Duration Range
0,Naveen Kumar,62159,FSC_ITL3O2STIBCO,0,Computer,2018-12-03 12:44:49,Other,Description: Critical O-X---- 10:24:24 11/29/...,3,FedEx Supply Chain,...,19,3,Application,Other,,"[validation error occurred, alert message comi...",622772,39783.625,0,622772
1,Mark Duncan,60146,Other,1084035,Computer,2018-06-28 19:56:05,ID3667964,<EXECUTIVE NOTIFY: FEDEX-SEV3>\n06/28/18 16:07...,3,FedEx Freight Corporate,...,32,3,Application,FedEx Services,INC010074611,[multiple freight locations experienced issues...,4636059,545947.0,7717,4628342
2,Naveen Kumar,60758,FSC_ITL3O2STIBCO,0,Computer,2018-07-13 15:17:09,ID5069733,Issue : L2 support is receiving alerts which i...,3,FedEx Supply Chain,...,33,3,Application,Other,INC010530920,"[call receiveorder api, tibco alert, parsing e...",2390,726.727273,0,2390
3,Christophe Gurley,60985,Other,576000,Server,2018-07-15 18:14:43,ID5305519,FedEx Office center users at multiple centers ...,3,FedEx Office,...,24,3,Application,FedEx Services,INC010238011,"[fedex office center users, multiple centers]",2406878,435512.6,3373,2403505
4,Michael Kennemer,60147,Other,562179,Network,2018-06-29 07:15:36,Other,"Retail Phones, Payment switch and corporate pa...",3,FedEx Office,...,27,3,Application,FedEx Services,INC010073511,"[corporate payroll users impacted, retail phon...",2197479,505680.571429,0,2197479


In [31]:
# Number of rows per 'Category' value.
data['Category'].value_counts()

Application       1122
Hardware           345
Other              233
Network             59
Inquiry / Help      39
Infrastructure      33
Security            14
Environment          7
Telephony            5
Name: Category, dtype: int64

### Impacted OpCos

Split Impacted OpCos column into list of OpCos instead of string

In [17]:
# First attempt at splitting the comma-separated 'Impacted OpCos' strings.
# The error is caught and printed so the failure can still be shown without
# stopping a "Restart & Run All"; the column is left unchanged here and is
# split for real once the offending values have been handled.
try:
    data['Impacted OpCos'].map(lambda x: x.split(","))
except AttributeError as err:
    print('AttributeError: {}'.format(err))

AttributeError: 'float' object has no attribute 'split'

Weird... Let's look for that float.

In [18]:
# Select the rows whose 'Impacted OpCos' value np.isreal reports as a real
# number, i.e. the entries that are not strings.
is_real_value = data['Impacted OpCos'].apply(np.isreal)
data.loc[is_real_value]

Unnamed: 0,Problem Manager,Number,Assignment group,Business duration,Business service,Created,Created by,Description,Impact,Impacted OpCos,...,Updates,Urgency,Category,Company,Parent,Keywords - Short Desc,Duration Max,Duration Mean,Duration Min,Duration Range
117,Other,60058,Other,0,Server,2018-06-20 15:44:32,Other,Issue: FedEx Ground Publishing application is...,3,,...,2,3,Application,FedEx Services,INC010019190,"[fedex ground publishing application, invoices...",119363,59681.5,0,119363
136,Other,60173,Other,0,Server,2018-07-03 07:38:04,ID847287,FedEx.com Tracking Issue\n\n\n <EXECUTIVE NOT...,3,,...,2,3,Application,FedEx Services,INC010081260,"[experiencing tracking issues due, tnt custome...",20656,10328.0,0,20656
183,Other,61038,Other,0,Network,2018-07-19 03:26:56,ID5285658,"\n\nFrom: Navin Shetty \nSent: Wednesday, July...",3,,...,2,3,Hardware,FedEx Ground,INC010193087,[circuit upgrade],44677,22338.5,0,44677
336,Other,61405,Other,0,Server,2018-09-20 10:06:30,ID5285658,"From: Stephen Barch (OSV) \nSent: Friday, Augu...",3,,...,1,3,Hardware,FedEx Services,INC010329232,"[intermittent rate quote failures, proactive r...",4132609,2066304.5,0,4132609
358,Other,61447,Other,0,Network,2018-09-27 06:34:28,ID5163721,Users at ground location 0104/BRONX are report...,3,,...,1,3,Hardware,FedEx Services,INC010580900,"[utilizing offline inbound scan, reporting net...",8602,4301.0,0,8602
371,Other,61471,Other,0,Scanner,2018-10-02 12:10:48,Other,'-TM stated : SIM scanner is not working.\n\n\...,3,,...,1,3,Hardware,FedEx Office,INC010601708,[sim scanner],4265,2132.5,0,4265
404,Other,61581,Other,0,Computer,2018-10-18 13:53:03,ID973921,Pittsburg facilities received a pre action low...,3,,...,1,3,Hardware,FedEx Services,INC010667497,"[pre action low air pressure alarm, fire suppr...",797352,398676.0,0,797352
405,Other,61582,Other,0,Communication Device,2018-10-18 14:09:36,Other,CATEGORY : Application\n EAI ID : 3530712\n UU...,3,,...,1,3,Application,FedEx Ground,INC010695045,"[trailer monitoring unit, minor tmu tmumonitor]",40523,20261.5,0,40523
426,Other,61676,Other,0,Database,2018-10-26 03:59:48,Other,EMEA helpdesk reported some users from the ME...,3,,...,2,3,Application,FedEx Services,INC010727566,[facing issues receiving email || issueonly im...,107226,53613.0,0,107226
484,Other,61944,Other,0,Communication Device,2018-11-16 01:18:18,Other,VMWARE not working on Phone. Samsung A7 model....,3,,...,1,3,Application,FedEx Express,INC010750660,"[setup outlook inbox, samsung a7 model]",1832079,916039.5,0,1832079


Looks like the NaNs are the problem. Convert NaNs to empty strings.

In [19]:
# Swap missing 'Impacted OpCos' values for empty strings so every entry is a str.
data['Impacted OpCos'] = data['Impacted OpCos'].fillna('')

# Re-run the check: no row should be reported as holding a real number any more.
still_real = data['Impacted OpCos'].apply(np.isreal)
data.loc[still_real]

Unnamed: 0,Problem Manager,Number,Assignment group,Business duration,Business service,Created,Created by,Description,Impact,Impacted OpCos,...,Updates,Urgency,Category,Company,Parent,Keywords - Short Desc,Duration Max,Duration Mean,Duration Min,Duration Range


There are no longer any NaNs, so let's try the split again

In [20]:
# Turn each comma-separated 'Impacted OpCos' string into a list of its parts.
data['Impacted OpCos'] = data['Impacted OpCos'].str.split(",")
data.head()

Unnamed: 0,Problem Manager,Number,Assignment group,Business duration,Business service,Created,Created by,Description,Impact,Impacted OpCos,...,Updates,Urgency,Category,Company,Parent,Keywords - Short Desc,Duration Max,Duration Mean,Duration Min,Duration Range
0,Naveen Kumar,62159,FSC_ITL3O2STIBCO,0,Computer,2018-12-03 12:44:49,Other,Description: Critical O-X---- 10:24:24 11/29/...,3,[FedEx Supply Chain],...,19,3,Application,Other,,"[validation error occurred, alert message comi...",622772,39783.625,0,622772
1,Mark Duncan,60146,Other,1084035,Computer,2018-06-28 19:56:05,ID3667964,<EXECUTIVE NOTIFY: FEDEX-SEV3>\n06/28/18 16:07...,3,[FedEx Freight Corporate],...,32,3,Application,FedEx Services,INC010074611,[multiple freight locations experienced issues...,4636059,545947.0,7717,4628342
2,Naveen Kumar,60758,FSC_ITL3O2STIBCO,0,Computer,2018-07-13 15:17:09,ID5069733,Issue : L2 support is receiving alerts which i...,3,[FedEx Supply Chain],...,33,3,Application,Other,INC010530920,"[call receiveorder api, tibco alert, parsing e...",2390,726.727273,0,2390
3,Christophe Gurley,60985,Other,576000,Server,2018-07-15 18:14:43,ID5305519,FedEx Office center users at multiple centers ...,3,[FedEx Office],...,24,3,Application,FedEx Services,INC010238011,"[fedex office center users, multiple centers]",2406878,435512.6,3373,2403505
4,Michael Kennemer,60147,Other,562179,Network,2018-06-29 07:15:36,Other,"Retail Phones, Payment switch and corporate pa...",3,[FedEx Office],...,27,3,Application,FedEx Services,INC010073511,"[corporate payroll users impacted, retail phon...",2197479,505680.571429,0,2197479


Now we have the list, but we can't do anything with the list. Let's convert to dummy variables.

In [21]:
# Expand each list into one row per list entry (indexed by original row and
# position), one-hot encode the entries, then add the indicators back up per
# original row. groupby(level=0).sum() replaces sum(level=0), which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
opco_entries = data['Impacted OpCos'].apply(pd.Series).stack()
pd.get_dummies(opco_entries).groupby(level=0).sum().head()

Unnamed: 0,Unnamed: 1,FedEx,FedEx Custom Critical,FedEx Express,FedEx Express APAC,FedEx Express Canada,FedEx Express Domestic,FedEx Express International,FedEx Express LAC,FedEx Express MEISA,...,FedEx SmartPost,FedEx Supply Chain,Federal Express (Aruba) N.V.,TNT APAC,TNT Australia,TNT Belgium,TNT Corporate,TNT Express,TNT Express Global Networks,TNT Slovenia
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


That will take a lot of further processing, so I am going to drop that for now.

In [22]:
# Remove the list-valued 'Impacted OpCos' column from the working frame.
data = data.drop(columns=['Impacted OpCos'])
data.head()

Unnamed: 0,Problem Manager,Number,Assignment group,Business duration,Business service,Created,Created by,Description,Impact,Major Problem,...,Updates,Urgency,Category,Company,Parent,Keywords - Short Desc,Duration Max,Duration Mean,Duration Min,Duration Range
0,Naveen Kumar,62159,FSC_ITL3O2STIBCO,0,Computer,2018-12-03 12:44:49,Other,Description: Critical O-X---- 10:24:24 11/29/...,3,False,...,19,3,Application,Other,,"[validation error occurred, alert message comi...",622772,39783.625,0,622772
1,Mark Duncan,60146,Other,1084035,Computer,2018-06-28 19:56:05,ID3667964,<EXECUTIVE NOTIFY: FEDEX-SEV3>\n06/28/18 16:07...,3,True,...,32,3,Application,FedEx Services,INC010074611,[multiple freight locations experienced issues...,4636059,545947.0,7717,4628342
2,Naveen Kumar,60758,FSC_ITL3O2STIBCO,0,Computer,2018-07-13 15:17:09,ID5069733,Issue : L2 support is receiving alerts which i...,3,False,...,33,3,Application,Other,INC010530920,"[call receiveorder api, tibco alert, parsing e...",2390,726.727273,0,2390
3,Christophe Gurley,60985,Other,576000,Server,2018-07-15 18:14:43,ID5305519,FedEx Office center users at multiple centers ...,3,True,...,24,3,Application,FedEx Services,INC010238011,"[fedex office center users, multiple centers]",2406878,435512.6,3373,2403505
4,Michael Kennemer,60147,Other,562179,Network,2018-06-29 07:15:36,Other,"Retail Phones, Payment switch and corporate pa...",3,True,...,27,3,Application,FedEx Services,INC010073511,"[corporate payroll users impacted, retail phon...",2197479,505680.571429,0,2197479


In [None]:
# Re-check the column dtypes after dropping 'Impacted OpCos'.
data.dtypes

### Parent

Convert Parent column to a boolean Has Parent column

In [23]:
# A problem "has a parent" when its 'Parent' cell is filled in. notna() tests
# for a missing value directly instead of inferring it from whether the cell
# happens to hold a real number.
data['Has Parent'] = data['Parent'].notna()

# The raw parent identifier is not needed once the flag exists.
data = data.drop(columns='Parent')
data.head()

Unnamed: 0,Problem Manager,Number,Assignment group,Business duration,Business service,Created,Created by,Description,Impact,Major Problem,...,Updates,Urgency,Category,Company,Keywords - Short Desc,Duration Max,Duration Mean,Duration Min,Duration Range,Has Parent
0,Naveen Kumar,62159,FSC_ITL3O2STIBCO,0,Computer,2018-12-03 12:44:49,Other,Description: Critical O-X---- 10:24:24 11/29/...,3,False,...,19,3,Application,Other,"[validation error occurred, alert message comi...",622772,39783.625,0,622772,False
1,Mark Duncan,60146,Other,1084035,Computer,2018-06-28 19:56:05,ID3667964,<EXECUTIVE NOTIFY: FEDEX-SEV3>\n06/28/18 16:07...,3,True,...,32,3,Application,FedEx Services,[multiple freight locations experienced issues...,4636059,545947.0,7717,4628342,True
2,Naveen Kumar,60758,FSC_ITL3O2STIBCO,0,Computer,2018-07-13 15:17:09,ID5069733,Issue : L2 support is receiving alerts which i...,3,False,...,33,3,Application,Other,"[call receiveorder api, tibco alert, parsing e...",2390,726.727273,0,2390,True
3,Christophe Gurley,60985,Other,576000,Server,2018-07-15 18:14:43,ID5305519,FedEx Office center users at multiple centers ...,3,True,...,24,3,Application,FedEx Services,"[fedex office center users, multiple centers]",2406878,435512.6,3373,2403505,True
4,Michael Kennemer,60147,Other,562179,Network,2018-06-29 07:15:36,Other,"Retail Phones, Payment switch and corporate pa...",3,True,...,27,3,Application,FedEx Services,"[corporate payroll users impacted, retail phon...",2197479,505680.571429,0,2197479,True


### Convert categorical fields to be a category type

In [24]:
# Columns holding labels are stored with the pandas 'category' dtype.
categorical_cols = [
    'Assignment group',
    'Business service',
    'Category',
    'Company',
    'Created by',
    'Opened by',
    'Problem Manager',
    'Type',
    'Updated by',
]
for col in categorical_cols:
    data[col] = data[col].astype('category')
data.dtypes

Problem Manager                category
Number                            int64
Assignment group               category
Business duration                 int64
Business service               category
Created                  datetime64[ns]
Created by                     category
Description                      object
Impact                            int64
Major Problem                      bool
Opened                   datetime64[ns]
Opened by                      category
Priority                          int64
Reassignment count                int64
Related Incidents                 int64
Short description                object
Type                           category
Updated                  datetime64[ns]
Updated by                     category
Updates                           int64
Urgency                           int64
Category                       category
Company                        category
Keywords - Short Desc            object
Duration Max                      int64


In [25]:
# Report the positional index of every column that contains at least one
# missing value.
for position, contains_na in enumerate(data.isna().any()):
    if contains_na:
        print('col', position, 'has na')

col 7 has na
col 10 has na


In [26]:
# Remove the 'Description' and 'Opened' columns from the working frame.
data = data.drop(columns=['Description', 'Opened'])

Now all columns have a proper data type. Dropping any remaining NaNs is currently disabled (the cell below is commented out).

In [None]:
# data = data.dropna()
# data

### Adding categorical time of day feature

In [27]:
from datetime import *
def convert_datetime(ts):
    """Map a timestamp to a coarse time-of-day label based on its hour.

    Parameters
    ----------
    ts : object exposing an ``hour`` attribute (e.g. a datetime or Timestamp).

    Returns
    -------
    str
        'morning' for hours 5-11, 'afternoon' for 12-16, 'evening' for 17-20,
        and 'night' for everything else.
    """
    hour_of_day = ts.hour
    # Half-open [start, stop) hour windows, checked in order.
    day_parts = (
        (5, 12, 'morning'),
        (12, 17, 'afternoon'),
        (17, 21, 'evening'),
    )
    for start, stop, label in day_parts:
        if start <= hour_of_day < stop:
            return label
    return 'night'

# Label each record's 'Created' timestamp with its time of day, store the
# labels as a categorical column, and place that column at position 6.
created_time_labels = data['Created'].apply(convert_datetime).astype('category')
data.insert(loc=6, column='Created Time', value=created_time_labels)
data.head()

Unnamed: 0,Problem Manager,Number,Assignment group,Business duration,Business service,Created,Created Time,Created by,Impact,Major Problem,...,Updates,Urgency,Category,Company,Keywords - Short Desc,Duration Max,Duration Mean,Duration Min,Duration Range,Has Parent
0,Naveen Kumar,62159,FSC_ITL3O2STIBCO,0,Computer,2018-12-03 12:44:49,afternoon,Other,3,False,...,19,3,Application,Other,"[validation error occurred, alert message comi...",622772,39783.625,0,622772,False
1,Mark Duncan,60146,Other,1084035,Computer,2018-06-28 19:56:05,evening,ID3667964,3,True,...,32,3,Application,FedEx Services,[multiple freight locations experienced issues...,4636059,545947.0,7717,4628342,True
2,Naveen Kumar,60758,FSC_ITL3O2STIBCO,0,Computer,2018-07-13 15:17:09,afternoon,ID5069733,3,False,...,33,3,Application,Other,"[call receiveorder api, tibco alert, parsing e...",2390,726.727273,0,2390,True
3,Christophe Gurley,60985,Other,576000,Server,2018-07-15 18:14:43,evening,ID5305519,3,True,...,24,3,Application,FedEx Services,"[fedex office center users, multiple centers]",2406878,435512.6,3373,2403505,True
4,Michael Kennemer,60147,Other,562179,Network,2018-06-29 07:15:36,morning,Other,3,True,...,27,3,Application,FedEx Services,"[corporate payroll users impacted, retail phon...",2197479,505680.571429,0,2197479,True


In [28]:
# Remove both 'Updated' and 'Updated by' in a single drop.
data = data.drop(columns=['Updated', 'Updated by'])

Sort features alphabetically with problem number at the front

In [29]:
# Arrange the columns alphabetically, then move 'Number' to the front.
alphabetical = sorted(data.columns)
alphabetical.remove('Number')
cols = ['Number'] + alphabetical
data = data[cols]
data.head()

Unnamed: 0,Number,Assignment group,Business duration,Business service,Category,Company,Created,Created Time,Created by,Duration Max,...,Major Problem,Opened by,Priority,Problem Manager,Reassignment count,Related Incidents,Short description,Type,Updates,Urgency
0,62159,FSC_ITL3O2STIBCO,0,Computer,Application,Other,2018-12-03 12:44:49,afternoon,Other,622772,...,False,Naveen Kumar,5,Naveen Kumar,1,15,FSC- EIB- A validation error occurred for aler...,Standard,19,3
1,60146,Other,1084035,Computer,Application,FedEx Services,2018-06-28 19:56:05,evening,ID3667964,4636059,...,True,D'Zundra Green,5,Mark Duncan,2,10,Multiple freight locations experienced issues ...,ITCC,32,3
2,60758,FSC_ITL3O2STIBCO,0,Computer,Application,Other,2018-07-13 15:17:09,afternoon,ID5069733,2390,...,False,Admin-Arturo Reyes,5,Naveen Kumar,0,10,FSC - TIBCO : RC672 - Tibco Alert - Failed to...,Standard,33,3
3,60985,Other,576000,Server,Application,FedEx Services,2018-07-15 18:14:43,evening,ID5305519,2406878,...,True,Ashish Bisht,5,Christophe Gurley,2,9,FedEx Office center users at multiple centers ...,ITCC,24,3
4,60147,Other,562179,Network,Application,FedEx Services,2018-06-29 07:15:36,morning,Other,2197479,...,True,Brad Moore,5,Michael Kennemer,2,6,"Retail Phones, Payment switch and corporate pa...",ITCC,27,3


### Export most recently cleaned data to csv

In [30]:
# Write the cleaned frame to disk, omitting the row index.
cleaned_csv_path = 'problems_cleaned.csv'
data.to_csv(cleaned_csv_path, index=False)

In [33]:
# Build `data_preproc` as a copy of `data` without the columns listed below.
excluded_columns = [
    'Created',
    'Duration Max',
    'Duration Min',
    'Duration Range',
    'Short description',
    'Keywords - Short Desc',
]
data_preproc = data.drop(columns=excluded_columns)

In [34]:
# Drop the 'Number' column from the preprocessing frame.
data_preproc = data_preproc.drop(columns=['Number'])

In [35]:
# Strip apostrophes from manager names.  `Series.replace("'", '')` only
# matches cells whose ENTIRE value is "'", so it never touched names that
# merely contain one; the `.str` accessor performs the intended substring
# replacement (literal match, no regex).  The result is cast back to
# 'category' because the column was converted to that dtype earlier and
# `.str.replace` returns plain strings.
data_preproc['Problem Manager'] = (
    data_preproc['Problem Manager']
    .str.replace("'", '', regex=False)
    .astype('category')
)

In [45]:
# Remove apostrophes inside the 'Opened by' names via the string accessor.
opened_by_names = data_preproc['Opened by']
data_preproc['Opened by'] = opened_by_names.str.replace("\'", '')

In [65]:
# Down-sample the 'Application' category so it is at most 1.5x as large as
# the most frequent of the remaining categories.

# Every row that is NOT 'Application' is kept as-is.
subsampled = data_preproc[data_preproc['Category'] != 'Application']

# Row count of the most frequent remaining category.
second_freq = int(subsampled['Category'].value_counts().max())

# Draw the 'Application' rows.  The requested size is capped at the number of
# available rows (sampling without replacement raises ValueError otherwise),
# and a fixed seed makes the draw reproducible across kernel restarts.
application_rows = data_preproc[data_preproc['Category'] == 'Application']
n_application = min(int(second_freq * 1.5), len(application_rows))
subsampled2 = application_rows.sample(n_application, random_state=42)

# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# equivalent and keeps the original row index just as `append` did.
final_data = pd.concat([subsampled, subsampled2])
final_data

Unnamed: 0,Assignment group,Business duration,Business service,Category,Company,Created Time,Created by,Duration Mean,Has Parent,Impact,Major Problem,Opened by,Priority,Problem Manager,Reassignment count,Related Incidents,Type,Updates,Urgency
5,Other,0,Server,Infrastructure,FedEx Services,afternoon,ID973921,1.513453e+05,True,2,True,Stephen Barch,2,Michael Kennemer,2,6,ITCC,52,1
6,FXS_EIS_GLB_ProblemMgmt,0,Server,Hardware,FedEx Services,afternoon,ID5305519,4.755157e+04,True,2,False,Ashish Bisht,3,Christopher Barber,1,6,ITCC,4,2
10,Other,0,Computer,Infrastructure,FedEx Ground,morning,ID973921,7.628800e+03,True,3,True,Stephen Barch,4,Eli Smith,2,4,ITCC,15,2
13,Other,0,Database,Infrastructure,FedEx Services,evening,ID5305519,3.766460e+04,True,3,True,Ashish Bisht,4,Michael Kennemer,5,4,ITCC,34,2
18,Other,0,Server,Hardware,FedEx Express,morning,Other,1.277994e+05,True,2,True,Mandeep Khati,3,Terri Hamilton,2,4,ITCC,34,2
19,Other,1843200,Communication Device,Inquiry / Help,FedEx Office,morning,ID847287,1.957679e+06,True,3,True,Logan Simmons,5,Terri Hamilton,2,3,ITCC,30,3
21,Other,86400,Server,Infrastructure,FedEx Services,morning,ID5285658,6.721500e+04,True,3,True,Shafi Ahmed,5,Eli Smith,2,3,Standard,8,3
22,Other,144000,Server,Inquiry / Help,FedEx Express,night,ID5285658,1.655278e+05,True,3,True,Shafi Ahmed,5,Eli Smith,2,3,ITCC,9,3
35,Other,0,Server,Hardware,FedEx Services,morning,Other,7.965410e+05,True,2,True,Brad Moore,4,Christopher Barber,2,2,ITCC,17,3
43,Other,1199537,Other,Hardware,FedEx Services,evening,ID3667964,1.723105e+06,True,3,True,DZundra Green,5,Michael Kennemer,2,2,ITCC,14,3


In [66]:
# Shuffle all rows and renumber the index from 0.  A fixed seed keeps the row
# order — and therefore the exported file — identical on every run.
final_data = final_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Write the shuffled, preprocessed frame to disk without the index column.
final_data.to_csv('problems_preprocessed.csv', index=False)