In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_validate,cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics, svm
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix
import seaborn
from matplotlib import pyplot
import matplotlib.pyplot as plt
from matplotlib import rcParams
from sklearn.cluster import KMeans, DBSCAN
from sklearn.feature_selection import chi2, SelectKBest, SelectFdr
from sklearn.decomposition import PCA
from rake_nltk import Rake

### Preprocessing

Import data as data frame

In [2]:
# Load the problems workbook into a data frame.
problemfile = 'problems_2019-03-21.xlsx'
df = pd.read_excel(problemfile)
# Preview only the first rows instead of rendering the whole (very wide) frame.
df.head(10)

Unnamed: 0,Problem Manager,Number,Active,Activity due,Additional assignee list,Approval,Approval history,Approval set,Assigned to,Assignment group,...,Work notes,Work notes list,Workaround,Details,IT WAR Walk On,Quality Improvement Project,Director,Follow Up,Send Email,VP
0,Naveen Kumar,PRB0062159,True,,,Not Yet Requested,,,,FSC_ITL3O2STIBCO,...,,,,,NaT,NaT,,,,
1,Mark Duncan,PRB0060146,False,,,Not Yet Requested,,,V Sox,FXF_SPT_US_FIELDLINEHAUL,...,2018-08-22 05:40:24 - Eli Smith (Work notes)\n...,,2018-06-28 19:56:05 - D'Zundra Green (Workarou...,,NaT,NaT,,,,
2,Naveen Kumar,PRB0060758,False,,,Not Yet Requested,,,,FSC_ITL3O2STIBCO,...,,,,,NaT,NaT,,,,
3,Christophe Gurley,PRB0060985,False,,,Not Yet Requested,,,Scott Dubak,FXO_SA_US_WindowsServer,...,2018-08-12 14:49:21 - Christophe Gurley (Work ...,,2018-07-15 18:14:43 - Ashish Bisht (Workaround...,,NaT,NaT,,,,
4,Michael Kennemer,PRB0060147,False,,,Not Yet Requested,,,Robert Bumpus,FXS_FIREWALL,...,2018-07-27 15:09:39 - Michael Kennemer (Work n...,,2018-06-29 07:24:25 - Brad Moore (Workaround)\...,,NaT,NaT,,,,
5,Michael Kennemer,PRB0062412,True,,,Not Yet Requested,,,Mahesh Pillutla,FXF_SPT_US_FreightDataSvcs,...,,,2019-01-06 12:00:08 - Stephen Barch (Workaroun...,Connection pooling errors,NaT,NaT,Rita Moore,,,Ann Higgins
6,Christopher Barber,PRB0063553,False,,,Not Yet Requested,,,,FXS_EIS_GLB_ProblemMgmt,...,,,,,NaT,NaT,,,,
7,Terri Hamilton,PRB0060194,False,,,Not Yet Requested,,,William Redmond,FXS_SPT_US_DOTCOMSHIPPING,...,2018-09-13 09:57:53 - Michael Kennemer (Work n...,,2018-07-06 20:12:25 - Sean Green (Workaround)\...,,NaT,NaT,,,,
8,Eli Smith,PRB0061073,False,,,Not Yet Requested,,,Douglas Clinger,FXF_SPT_SEFS,...,2018-07-26 03:00:28 - Eli Smith (Work notes)\n...,,2018-07-24 03:21:28 - Eli Smith (Workaround)\n...,,NaT,NaT,,,,
9,Eli Smith,PRB0061373,False,,,Not Yet Requested,,,Matthew Schwab,FXS_SPT_GLB_SharePoint,...,2018-12-11 14:41:16 - Randall Painter (Work no...,,2018-09-15 07:26:49 - Divakar Durgapal (Workar...,,NaT,NaT,,,,


Drop features that have all NA values

Drop features that have more than 10% NA values (can modify as needed)

Drop features that have all the same value

Drop observations with category = NA

'Active', 'Known error', 'Problem state', and 'State' were all dropped as they did not seem to add meaningful information

'Category', 'Company', and 'Parent' are added back from the original data frame after the pruning steps; 'Parent' is kept because we create a meaningful feature from it later

In [11]:
# Feature pruning pipeline. Each step continues from the previous result and
# prints how many columns survive.
dfcols = set(df.columns)
print('feature count = ', len(df.columns))

# 1) Drop columns that contain no values at all.
data = df.dropna(axis=1, how='all')
dfcols2 = set(data.columns)
print('all NAs = ', dfcols - dfcols2)
print('features minus all NA features = ', len(data.columns))

# 2) Keep only columns with at least 90% non-NA values. `thresh` is documented
#    as an integer count, so round the fractional requirement up (a count can
#    only satisfy ">= x" if it is ">= ceil(x)").
min_non_na = int(np.ceil(df.shape[0] * 0.90))
data = data.dropna(axis=1, thresh=min_non_na)
dfcols3 = set(data.columns)
dfcols3.add('Category')
dfcols3.add('Company')
dfcols3.add('Parent')
print('features minus 10%+ NA features = ', len(data.columns))

# 3) Drop columns with fewer than two distinct non-null values. The columns are
#    collected first so the frame is not modified while it is being scanned.
constant_cols = [col for col in data.columns if data[col].nunique() < 2]
data = data.drop(constant_cols, axis=1)
print('features after same value drop = ', len(data.columns))

# 4) Drop the state-like columns judged uninformative, then (re-)attach the
#    label and the two columns needed later, taken from the original frame.
data = data.drop(['Active', 'Known error', 'Problem state', 'State'], axis=1)
data['Category'] = df['Category']
data['Company'] = df['Company']
data['Parent'] = df['Parent']

# 5) Rows without a Category cannot be used; drop them and renumber the rows
#    0..n-1 so later positional/index-aligned operations line up.
data = data.dropna(subset=['Category'], axis = 0, how ='any')
data = data.reset_index(drop=True)
print('num final features = ', len(data.columns))
print('num final observations = ', len(data))
print('final features = ', data.columns)

feature count =  111
all NAs =  {'Actual start', 'Respond Date', 'Delivery task', 'Group list', 'Actual end', 'Correlation ID', 'Problem Owner Comments', 'Contract', 'Approval history', 'Location', 'Approval set', 'Contact type', 'Correlation display', 'Additional comments', 'User input', 'Delivery plan', 'Skills', 'Additional assignee list', 'Due date', 'Expected start', 'SLA due', 'Order', 'Service', 'Service offering', 'Follow up', 'Reporting OpCo', 'Activity due'}
features minus all NA features =  84
features minus 10%+ NA features =  38
features after same value drop =  27
num final features =  26
num final observations =  1624
final features =  Index(['Problem Manager', 'Number', 'Assignment group', 'Business duration',
       'Business service', 'Created', 'Created by', 'Description', 'Duration',
       'Impact', 'Impacted OpCos', 'Major Problem', 'Opened', 'Opened by',
       'Priority', 'Reassignment count', 'Related Incidents',
       'Short description', 'Type', 'Updated', '

In [4]:
# Write the pruned problem table to CSV without the row index.
data.to_csv('problem_pdsm_simple.csv', index=False)

### Create word vectors using a TF-IDF weighted bag-of-words model
Each problem's short description is passed as a "document"

In [5]:
# Each short description is one document; rows are taken in frame order.
short_descriptions = data['Short description'].tolist()
tfv = TfidfVectorizer()
X = tfv.fit_transform(short_descriptions)
X.shape

(1624, 3271)

Drop numbers and nonwords from tfidf word vectors

In [6]:
# Vocabulary terms become the column names. Newer scikit-learn exposes them via
# get_feature_names_out(); fall back to the older accessor when it is missing.
if hasattr(tfv, 'get_feature_names_out'):
    vocabulary = tfv.get_feature_names_out()
else:
    vocabulary = tfv.get_feature_names()

# Densify the sparse TF-IDF matrix with the explicit toarray() call
# (the `.A` shorthand is not available on every SciPy version).
docvec = pd.DataFrame(X.toarray(), columns=vocabulary)

# Drop every term that contains a digit or an underscore. The pattern is a raw
# string so the backslashes reach the regex engine without escape warnings.
docvec = docvec.drop(docvec.filter(regex=r'\d+|\_+', axis=1).columns, axis=1)
docvec.head()

Unnamed: 0,abending,abends,ability,able,abnormal,abort,aborted,about,abt,acars,...,yard,yesterday,yms,your,zebra,zero,zips,zoma,zone,zp
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Attach the class label to each TF-IDF row. The assignment aligns on the row
# index, so it relies on `data` having been renumbered 0..n-1 earlier and on
# `docvec` carrying a default index.
docvec['AppCategory'] = data['Category']
# docvec.loc[docvec.AppCategory!='Application', 'AppCategory'] = 'Other'
# Save the labelled vectors (all categories) without the row index.
docvec.to_csv('problems_descriptionvectors.csv', index=False)

Drop infrequent classes

In [8]:
# Remove every row whose label belongs to one of the sparsely populated classes,
# then show the remaining class balance.
infrequent_classes = [
    'Network',
    'Inquiry / Help',
    'Infrastructure',
    'Security',
    'Environment',
    'Telephony',
]
docvec = docvec[~docvec['AppCategory'].isin(infrequent_classes)]
docvec['AppCategory'].value_counts()

Application    1122
Hardware        345
Name: AppCategory, dtype: int64

In [9]:
# Save the vectors restricted to the two remaining classes, without the row index.
docvec.to_csv('problems_descriptionvectors_2cats.csv', index=False)

### Extract keywords from descriptions in full dataset
Adds Short Description and Keywords features to above dataset

In [10]:
def extract_keywords(text):
    """Return up to four top-ranked RAKE key phrases for ``text``.

    Phrases are restricted to between 2 and 8 words. Missing values (``None``,
    ``NaN`` or any other non-string) and blank strings yield an empty list
    instead of raising inside the tokenizer.
    """
    if not isinstance(text, str) or not text.strip():
        return []
    r = Rake(min_length=2, max_length=8)
    r.extract_keywords_from_text(text)
    # Ranked phrases come back best-first; keep the leading four.
    return r.get_ranked_phrases()[0:4]
# Store, per problem, the (at most four) key phrases extracted from its short description.
data['Keywords - Short Desc'] = data['Short description'].apply(extract_keywords)
# data['Keywords - Desc'] = data['Description'].apply(extract_keywords)

## Clean the data

Find out default types for the columns

In [None]:
# Show each column's current dtype as a baseline before the cleaning steps below.
data.dtypes

### Number column

Remove the PRB prefix from the Number column and convert it to a number

In [None]:
# data['Number'] = data['Number'].map(lambda x: x.lstrip('PRB'))
# data['Number'] = pd.to_numeric(data['Number'])
# data

In [None]:
# Re-check the dtypes; 'Number' is unchanged while the conversion above stays commented out.
data.dtypes

### Priority, Impact, and Urgency

Take the first character (number) from each and convert to numeric

In [11]:
# Each of these columns holds a label whose first character is its numeric
# level; keep just that character and store it as a number.
for level_col in ('Priority', 'Impact', 'Urgency'):
    leading_char = data[level_col].map(lambda label: label[0])
    data[level_col] = pd.to_numeric(leading_char)

data.head()

Unnamed: 0,Problem Manager,Number,Assignment group,Business duration,Business service,Created,Created by,Description,Duration,Impact,...,Short description,Type,Updated,Updated by,Updates,Urgency,Category,Company,Parent,Keywords - Short Desc
0,Naveen Kumar,PRB0062159,FSC_ITL3O2STIBCO,0,Computer,2018-12-03 12:44:49,795580,Description: Critical O-X---- 10:24:24 11/29/...,0,3,...,FSC- EIB- A validation error occurred for aler...,Standard,2019-02-14 02:46:37,3616934,19,3,Application,,,"[validation error occurred, alert message comi..."
1,Mark Duncan,PRB0060146,FXF_SPT_US_FIELDLINEHAUL,1084035,Computer,2018-06-28 19:56:05,3667964,<EXECUTIVE NOTIFY: FEDEX-SEV3>\n06/28/18 16:07...,4636059,3,...,Multiple freight locations experienced issues ...,ITCC,2019-01-25 00:44:13,admin5041253,32,3,Application,FedEx Services,INC010074611,[multiple freight locations experienced issues...
2,Naveen Kumar,PRB0060758,FSC_ITL3O2STIBCO,0,Computer,2018-07-13 15:17:09,admin5069733,Issue : L2 support is receiving alerts which i...,0,3,...,FSC - TIBCO : RC672 - Tibco Alert - Failed to...,Standard,2019-01-25 00:44:34,admin5041253,33,3,Application,,INC010530920,"[call receiveorder api, tibco alert, parsing e..."
3,Christophe Gurley,PRB0060985,FXO_SA_US_WindowsServer,576000,Server,2018-07-15 18:14:43,5305519,FedEx Office center users at multiple centers ...,2406878,3,...,FedEx Office center users at multiple centers ...,ITCC,2019-01-25 00:44:42,admin5041253,24,3,Application,FedEx Services,INC010238011,"[fedex office center users, multiple centers]"
4,Michael Kennemer,PRB0060147,FXS_FIREWALL,562179,Network,2018-06-29 07:15:36,483039,"Retail Phones, Payment switch and corporate pa...",2197479,3,...,"Retail Phones, Payment switch and corporate pa...",ITCC,2019-01-25 00:44:13,admin5041253,27,3,Application,FedEx Services,INC010073511,"[corporate payroll users impacted, retail phon..."


In [None]:
# Priority, Impact and Urgency should now report a numeric dtype.
data.dtypes

### Categorical Fields

Check out category counts

In [None]:
# Frequency of each assignment group (input for deciding which values are rare).
data['Assignment group'].value_counts()

In [None]:
# Frequency of each business service (input for deciding which values are rare).
data['Business service'].value_counts()

In [None]:
# data['Category'].value_counts()

In [None]:
# data['Company'].value_counts()

In [None]:
# data['Problem Manager'].value_counts().nlargest(9)

In [None]:
# data['Type'].value_counts()

Cut categories with low counts and add together to make an "Other" category

In [12]:
def _lump_rare(column, min_count, other='Other'):
    """Replace values occurring fewer than ``min_count`` times with ``other``.

    Missing values are left untouched: they have no count, so the comparison
    is False for them.
    """
    occurrences = column.map(column.value_counts())
    return column.mask(occurrences < min_count, other)


# Minimum number of occurrences a value needs in order to keep its own level.
RARE_VALUE_THRESHOLDS = {
    'Business service': 25,
    'Company': 11,
    'Problem Manager': 40,
    'Type': 11,
    'Assignment group': 20,
    'Opened by': 11,
}

# Update only the targeted columns instead of pushing the whole frame through
# DataFrame.apply once per column.
for col_name, min_count in RARE_VALUE_THRESHOLDS.items():
    data[col_name] = _lump_rare(data[col_name], min_count)
data.head()

Unnamed: 0,Problem Manager,Number,Assignment group,Business duration,Business service,Created,Created by,Description,Duration,Impact,...,Short description,Type,Updated,Updated by,Updates,Urgency,Category,Company,Parent,Keywords - Short Desc
0,Naveen Kumar,PRB0062159,FSC_ITL3O2STIBCO,0,Computer,2018-12-03 12:44:49,795580,Description: Critical O-X---- 10:24:24 11/29/...,0,3,...,FSC- EIB- A validation error occurred for aler...,Standard,2019-02-14 02:46:37,3616934,19,3,Application,,,"[validation error occurred, alert message comi..."
1,Mark Duncan,PRB0060146,Other,1084035,Computer,2018-06-28 19:56:05,3667964,<EXECUTIVE NOTIFY: FEDEX-SEV3>\n06/28/18 16:07...,4636059,3,...,Multiple freight locations experienced issues ...,ITCC,2019-01-25 00:44:13,admin5041253,32,3,Application,FedEx Services,INC010074611,[multiple freight locations experienced issues...
2,Naveen Kumar,PRB0060758,FSC_ITL3O2STIBCO,0,Computer,2018-07-13 15:17:09,admin5069733,Issue : L2 support is receiving alerts which i...,0,3,...,FSC - TIBCO : RC672 - Tibco Alert - Failed to...,Standard,2019-01-25 00:44:34,admin5041253,33,3,Application,,INC010530920,"[call receiveorder api, tibco alert, parsing e..."
3,Christophe Gurley,PRB0060985,Other,576000,Server,2018-07-15 18:14:43,5305519,FedEx Office center users at multiple centers ...,2406878,3,...,FedEx Office center users at multiple centers ...,ITCC,2019-01-25 00:44:42,admin5041253,24,3,Application,FedEx Services,INC010238011,"[fedex office center users, multiple centers]"
4,Michael Kennemer,PRB0060147,Other,562179,Network,2018-06-29 07:15:36,483039,"Retail Phones, Payment switch and corporate pa...",2197479,3,...,"Retail Phones, Payment switch and corporate pa...",ITCC,2019-01-25 00:44:13,admin5041253,27,3,Application,FedEx Services,INC010073511,"[corporate payroll users impacted, retail phon..."


Check NA statistics

In [13]:
# Report the number of missing values in each categorical column.
categorical_cols = [
    'Assignment group',
    'Business service',
    'Category',
    'Company',
    'Opened by',
    'Problem Manager',
    'Type',
]
for col_name in categorical_cols:
    print(col_name + ' NAs = ', data[col_name].isna().sum())

Assignment group NAs =  15
Business service NAs =  0
Category NAs =  0
Company NAs =  690
Opened by NAs =  0
Problem Manager NAs =  1
Type NAs =  19


Fill NAs with 'Other'

In [14]:
# Missing categorical values are folded into the same 'Other' bucket used for rare values.
for col_name in ('Assignment group', 'Business service', 'Company',
                 'Opened by', 'Problem Manager', 'Type'):
    data[col_name] = data[col_name].fillna('Other')

In [None]:
# Problem Manager distribution after rare values and NAs were folded into 'Other'.
data['Problem Manager'].value_counts()

#### Load incidents dataset

In [15]:
# Load the incidents workbook (same export date as the problems workbook) and preview it.
incidents = pd.read_excel('incidents_2019-03-21.xlsx')
incidents.head()

Unnamed: 0,Number,Problem,Affected Company,Business service,ALERTING (MIN),Awareness (Min),Acknowledge (Min),Opened,Created,Closed,...,User Location,User Network,User input,VZ Correlation ID,Vendor Exception,Vendor Incident Number,Vendor Name,Watch list,Work notes,Work notes list
0,INC010068680,PRB0060003,FedEx Express,Communication Device,,,,2018-06-17 03:59:28,2018-06-28 00:53:59,2018-06-28 00:58:10,...,,,,,,,,,,
1,INC010000197,PRB0060003,FedEx Express,Communication Device,0.0,66.0,66.0,2018-06-17 03:59:28,2018-06-17 04:04:04,2018-08-04 14:00:48,...,,,,,,,,,2018-08-02 13:32:12 - Stephen Barch (Work note...,
2,INC010003952,PRB0060019,FedEx Services,Scanner,0.0,1.0,1.0,2018-06-18 08:24:50,2018-06-18 08:41:13,2018-06-20 14:00:42,...,,,,,,,,,2018-06-18 09:23:54 - Brad Moore (Work notes)\...,
3,INC010269074,PRB0060026,FedEx Services,Database,,,,2018-06-18 15:40:15,2018-07-31 16:55:15,2018-08-04 15:00:23,...,,,,,,,,,,
4,INC010269078,PRB0060026,FedEx Services,Database,,,,2018-06-18 15:40:15,2018-07-31 16:55:19,2018-08-04 15:00:38,...,,,,,,,,,,


### Created

Convert the Created column to a datetime type

In [20]:
# Cast 'Created' to nanosecond-resolution datetimes so time components can be read from it.
data['Created'] = data['Created'].astype('datetime64[ns]')
data.head()

Unnamed: 0,Problem Manager,Number,Assignment group,Business duration,Business service,Created,Created by,Description,Duration,Impact,...,Short description,Type,Updated,Updated by,Updates,Urgency,Category,Company,Parent,Keywords - Short Desc
0,Naveen Kumar,PRB0062159,FSC_ITL3O2STIBCO,0,Computer,2018-12-03 12:44:49,795580,Description: Critical O-X---- 10:24:24 11/29/...,0,3,...,FSC- EIB- A validation error occurred for aler...,Standard,2019-02-14 02:46:37,3616934,19,3,Application,Other,,"[validation error occurred, alert message comi..."
1,Mark Duncan,PRB0060146,Other,1084035,Computer,2018-06-28 19:56:05,3667964,<EXECUTIVE NOTIFY: FEDEX-SEV3>\n06/28/18 16:07...,4636059,3,...,Multiple freight locations experienced issues ...,ITCC,2019-01-25 00:44:13,admin5041253,32,3,Application,FedEx Services,INC010074611,[multiple freight locations experienced issues...
2,Naveen Kumar,PRB0060758,FSC_ITL3O2STIBCO,0,Computer,2018-07-13 15:17:09,admin5069733,Issue : L2 support is receiving alerts which i...,0,3,...,FSC - TIBCO : RC672 - Tibco Alert - Failed to...,Standard,2019-01-25 00:44:34,admin5041253,33,3,Application,Other,INC010530920,"[call receiveorder api, tibco alert, parsing e..."
3,Christophe Gurley,PRB0060985,Other,576000,Server,2018-07-15 18:14:43,5305519,FedEx Office center users at multiple centers ...,2406878,3,...,FedEx Office center users at multiple centers ...,ITCC,2019-01-25 00:44:42,admin5041253,24,3,Application,FedEx Services,INC010238011,"[fedex office center users, multiple centers]"
4,Michael Kennemer,PRB0060147,Other,562179,Network,2018-06-29 07:15:36,483039,"Retail Phones, Payment switch and corporate pa...",2197479,3,...,"Retail Phones, Payment switch and corporate pa...",ITCC,2019-01-25 00:44:13,admin5041253,27,3,Application,FedEx Services,INC010073511,"[corporate payroll users impacted, retail phon..."


### Created by

Looks like there may be some that have "admin" before the id number. I am just going to remove this prefix.

In [21]:
def _strip_prefix(value, prefix='admin'):
    """Return ``value`` without a leading ``prefix``; other values pass through.

    ``str.lstrip('admin')`` is not suitable here: it strips any run of the
    characters a/d/m/i/n, so it would also eat the start of ids that merely
    begin with those letters.
    """
    return value[len(prefix):] if value.startswith(prefix) else value


# Normalise the creator id (drop the optional "admin" prefix, tag with "ID") ...
created_by = data['Created by'].map(_strip_prefix).map(lambda user_id: 'ID' + user_id)
# ... then fold creators with fewer than 50 problems into 'Other'.
created_by_counts = created_by.map(created_by.value_counts())
data['Created by'] = created_by.mask(created_by_counts < 50, 'Other')
data.head()

Unnamed: 0,Problem Manager,Number,Assignment group,Business duration,Business service,Created,Created by,Description,Duration,Impact,...,Short description,Type,Updated,Updated by,Updates,Urgency,Category,Company,Parent,Keywords - Short Desc
0,Naveen Kumar,PRB0062159,FSC_ITL3O2STIBCO,0,Computer,2018-12-03 12:44:49,Other,Description: Critical O-X---- 10:24:24 11/29/...,0,3,...,FSC- EIB- A validation error occurred for aler...,Standard,2019-02-14 02:46:37,3616934,19,3,Application,Other,,"[validation error occurred, alert message comi..."
1,Mark Duncan,PRB0060146,Other,1084035,Computer,2018-06-28 19:56:05,ID3667964,<EXECUTIVE NOTIFY: FEDEX-SEV3>\n06/28/18 16:07...,4636059,3,...,Multiple freight locations experienced issues ...,ITCC,2019-01-25 00:44:13,admin5041253,32,3,Application,FedEx Services,INC010074611,[multiple freight locations experienced issues...
2,Naveen Kumar,PRB0060758,FSC_ITL3O2STIBCO,0,Computer,2018-07-13 15:17:09,ID5069733,Issue : L2 support is receiving alerts which i...,0,3,...,FSC - TIBCO : RC672 - Tibco Alert - Failed to...,Standard,2019-01-25 00:44:34,admin5041253,33,3,Application,Other,INC010530920,"[call receiveorder api, tibco alert, parsing e..."
3,Christophe Gurley,PRB0060985,Other,576000,Server,2018-07-15 18:14:43,ID5305519,FedEx Office center users at multiple centers ...,2406878,3,...,FedEx Office center users at multiple centers ...,ITCC,2019-01-25 00:44:42,admin5041253,24,3,Application,FedEx Services,INC010238011,"[fedex office center users, multiple centers]"
4,Michael Kennemer,PRB0060147,Other,562179,Network,2018-06-29 07:15:36,Other,"Retail Phones, Payment switch and corporate pa...",2197479,3,...,"Retail Phones, Payment switch and corporate pa...",ITCC,2019-01-25 00:44:13,admin5041253,27,3,Application,FedEx Services,INC010073511,"[corporate payroll users impacted, retail phon..."


In [22]:
# Distribution of the Category label over all remaining rows.
data['Category'].value_counts()

Application       1122
Hardware           345
Network             59
Inquiry / Help      39
Infrastructure      33
Security            14
Environment          7
Telephony            5
Name: Category, dtype: int64

### Impacted OpCos

Split Impacted OpCos column into list of OpCos instead of string

In [23]:
# First, naive attempt at splitting. It fails when a value is not a string, so
# the error is caught and shown instead of aborting a "Run All". The column is
# left unchanged on failure because the assignment never happens.
try:
    data['Impacted OpCos'] = data['Impacted OpCos'].map(lambda x: x.split(","))
except AttributeError as err:
    print('AttributeError:', err)

AttributeError: 'float' object has no attribute 'split'

Weird.. Let's look for that float 

In [24]:
# np.isreal flags numeric scalars (a NaN is a float) but not text, so this
# selects the rows whose 'Impacted OpCos' value is not a string.
data[data['Impacted OpCos'].apply(np.isreal)]

Unnamed: 0,Problem Manager,Number,Assignment group,Business duration,Business service,Created,Created by,Description,Duration,Impact,...,Short description,Type,Updated,Updated by,Updates,Urgency,Category,Company,Parent,Keywords - Short Desc
117,Other,PRB0060058,Other,0,Server,2018-06-20 15:44:32,Other,Issue: FedEx Ground Publishing application is...,0,3,...,FedEx Ground Publishing application is impacte...,Other,2018-06-23 16:00:52,system,2,3,Application,FedEx Services,INC010019190,"[fedex ground publishing application, invoices..."
136,Other,PRB0060173,Other,0,Server,2018-07-03 07:38:04,ID847287,FedEx.com Tracking Issue\n\n\n <EXECUTIVE NOT...,0,3,...,TNT customers are experiencing tracking issues...,ITCC,2018-07-03 07:38:36,847287,2,3,Application,FedEx Services,INC010081260,"[experiencing tracking issues due, tnt custome..."
183,Other,PRB0061038,Other,0,Network,2018-07-19 03:26:56,ID5285658,"\n\nFrom: Navin Shetty \nSent: Wednesday, July...",0,3,...,fxg769route | Circuit upgrade,Other,2018-07-21 04:00:31,system,2,3,Hardware,FedEx Ground,INC010193087,[circuit upgrade]
336,Other,PRB0061405,Other,0,Server,2018-09-20 10:06:30,ID5285658,"From: Stephen Barch (OSV) \nSent: Friday, Augu...",0,3,...,Intermittent rate quote failures (proactive re...,Other,2018-09-20 10:06:30,5285658,1,3,Hardware,FedEx Services,INC010329232,"[intermittent rate quote failures, proactive r..."
358,Other,PRB0061447,Other,0,Network,2018-09-27 06:34:28,ID5163721,Users at ground location 0104/BRONX are report...,0,3,...,Users at ground location 0104/BRONX are report...,Other,2018-09-27 06:34:28,5163721,1,3,Hardware,FedEx Services,INC010580900,"[utilizing offline inbound scan, reporting net..."
371,Other,PRB0061471,Other,0,Scanner,2018-10-02 12:10:48,Other,'-TM stated : SIM scanner is not working.\n\n\...,0,3,...,FXO : SIM scanner is not working.,Other,2018-10-02 12:10:48,752021,1,3,Hardware,FedEx Office,INC010601708,[sim scanner]
404,Other,PRB0061581,Other,0,Computer,2018-10-18 13:53:03,ID973921,Pittsburg facilities received a pre action low...,0,3,...,Pittsburg facilities received a pre action low...,Other,2018-10-18 13:53:03,973921,1,3,Hardware,FedEx Services,INC010667497,"[pre action low air pressure alarm, fire suppr..."
405,Other,PRB0061582,Other,0,Communication Device,2018-10-18 14:09:36,Other,CATEGORY : Application\n EAI ID : 3530712\n UU...,0,3,...,3530712 (TRAILER MONITORING UNIT) Minor TMU TM...,Other,2018-10-18 14:09:37,2668981,1,3,Application,FedEx Ground,INC010695045,"[trailer monitoring unit, minor tmu tmumonitor]"
426,Other,PRB0061676,Other,0,Database,2018-10-26 03:59:48,Other,EMEA helpdesk reported some users from the ME...,0,3,...,EMEA helpdesk reported some users from the ME...,Other,2018-10-26 04:10:23,192015,2,3,Application,FedEx Services,INC010727566,[facing issues receiving email || issueonly im...
484,Other,PRB0061944,Other,0,Communication Device,2018-11-16 01:18:18,Other,VMWARE not working on Phone. Samsung A7 model....,0,3,...,VMWARE not working on Phone. Samsung A7 model....,Other,2018-11-16 01:18:18,882238,1,3,Application,FedEx Express,INC010750660,"[setup outlook inbox, samsung a7 model]"


Looks like the NaNs are the problem. Convert NaNs to empty strings.

In [25]:
# Replace missing values with an empty string; fillna states that intent
# directly, without routing a NaN through the regex-replacement machinery.
data['Impacted OpCos'] = data['Impacted OpCos'].fillna('')
# Re-run the non-text check: it should now select no rows.
data[data['Impacted OpCos'].apply(np.isreal)]

Unnamed: 0,Problem Manager,Number,Assignment group,Business duration,Business service,Created,Created by,Description,Duration,Impact,...,Short description,Type,Updated,Updated by,Updates,Urgency,Category,Company,Parent,Keywords - Short Desc


There are no longer any NaNs, so let's try the split again

In [26]:
# Turn each comma-separated string into a list of OpCo names.
data['Impacted OpCos'] = data['Impacted OpCos'].str.split(",")
data.head()

Unnamed: 0,Problem Manager,Number,Assignment group,Business duration,Business service,Created,Created by,Description,Duration,Impact,...,Short description,Type,Updated,Updated by,Updates,Urgency,Category,Company,Parent,Keywords - Short Desc
0,Naveen Kumar,PRB0062159,FSC_ITL3O2STIBCO,0,Computer,2018-12-03 12:44:49,Other,Description: Critical O-X---- 10:24:24 11/29/...,0,3,...,FSC- EIB- A validation error occurred for aler...,Standard,2019-02-14 02:46:37,3616934,19,3,Application,Other,,"[validation error occurred, alert message comi..."
1,Mark Duncan,PRB0060146,Other,1084035,Computer,2018-06-28 19:56:05,ID3667964,<EXECUTIVE NOTIFY: FEDEX-SEV3>\n06/28/18 16:07...,4636059,3,...,Multiple freight locations experienced issues ...,ITCC,2019-01-25 00:44:13,admin5041253,32,3,Application,FedEx Services,INC010074611,[multiple freight locations experienced issues...
2,Naveen Kumar,PRB0060758,FSC_ITL3O2STIBCO,0,Computer,2018-07-13 15:17:09,ID5069733,Issue : L2 support is receiving alerts which i...,0,3,...,FSC - TIBCO : RC672 - Tibco Alert - Failed to...,Standard,2019-01-25 00:44:34,admin5041253,33,3,Application,Other,INC010530920,"[call receiveorder api, tibco alert, parsing e..."
3,Christophe Gurley,PRB0060985,Other,576000,Server,2018-07-15 18:14:43,ID5305519,FedEx Office center users at multiple centers ...,2406878,3,...,FedEx Office center users at multiple centers ...,ITCC,2019-01-25 00:44:42,admin5041253,24,3,Application,FedEx Services,INC010238011,"[fedex office center users, multiple centers]"
4,Michael Kennemer,PRB0060147,Other,562179,Network,2018-06-29 07:15:36,Other,"Retail Phones, Payment switch and corporate pa...",2197479,3,...,"Retail Phones, Payment switch and corporate pa...",ITCC,2019-01-25 00:44:13,admin5041253,27,3,Application,FedEx Services,INC010073511,"[corporate payroll users impacted, retail phon..."


Now we have the list, but we can't do anything with the list. Let's convert to dummy variables.

In [27]:
# Expand each list into one row per OpCo, one-hot encode the names, then add
# the indicator rows back together per original problem (index level 0).
# groupby(level=0).sum() replaces the deprecated sum(level=0) spelling.
opco_long = data['Impacted OpCos'].apply(pd.Series).stack()
pd.get_dummies(opco_long).groupby(level=0).sum().head()

Unnamed: 0,Unnamed: 1,FedEx,FedEx Custom Critical,FedEx Express,FedEx Express APAC,FedEx Express Canada,FedEx Express Domestic,FedEx Express International,FedEx Express LAC,FedEx Express MEISA,...,FedEx SmartPost,FedEx Supply Chain,Federal Express (Aruba) N.V.,TNT APAC,TNT Australia,TNT Belgium,TNT Corporate,TNT Express,TNT Express Global Networks,TNT Slovenia
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


That will take a lot of further processing, so I am going to drop that for now.

In [28]:
# The list-valued column is set aside for now.
data = data.drop(columns=['Impacted OpCos'])
data.head()

Unnamed: 0,Problem Manager,Number,Assignment group,Business duration,Business service,Created,Created by,Description,Duration,Impact,...,Short description,Type,Updated,Updated by,Updates,Urgency,Category,Company,Parent,Keywords - Short Desc
0,Naveen Kumar,PRB0062159,FSC_ITL3O2STIBCO,0,Computer,2018-12-03 12:44:49,Other,Description: Critical O-X---- 10:24:24 11/29/...,0,3,...,FSC- EIB- A validation error occurred for aler...,Standard,2019-02-14 02:46:37,3616934,19,3,Application,Other,,"[validation error occurred, alert message comi..."
1,Mark Duncan,PRB0060146,Other,1084035,Computer,2018-06-28 19:56:05,ID3667964,<EXECUTIVE NOTIFY: FEDEX-SEV3>\n06/28/18 16:07...,4636059,3,...,Multiple freight locations experienced issues ...,ITCC,2019-01-25 00:44:13,admin5041253,32,3,Application,FedEx Services,INC010074611,[multiple freight locations experienced issues...
2,Naveen Kumar,PRB0060758,FSC_ITL3O2STIBCO,0,Computer,2018-07-13 15:17:09,ID5069733,Issue : L2 support is receiving alerts which i...,0,3,...,FSC - TIBCO : RC672 - Tibco Alert - Failed to...,Standard,2019-01-25 00:44:34,admin5041253,33,3,Application,Other,INC010530920,"[call receiveorder api, tibco alert, parsing e..."
3,Christophe Gurley,PRB0060985,Other,576000,Server,2018-07-15 18:14:43,ID5305519,FedEx Office center users at multiple centers ...,2406878,3,...,FedEx Office center users at multiple centers ...,ITCC,2019-01-25 00:44:42,admin5041253,24,3,Application,FedEx Services,INC010238011,"[fedex office center users, multiple centers]"
4,Michael Kennemer,PRB0060147,Other,562179,Network,2018-06-29 07:15:36,Other,"Retail Phones, Payment switch and corporate pa...",2197479,3,...,"Retail Phones, Payment switch and corporate pa...",ITCC,2019-01-25 00:44:13,admin5041253,27,3,Application,FedEx Services,INC010073511,"[corporate payroll users impacted, retail phon..."


In [None]:
# Review the dtypes of the columns that remain after dropping 'Impacted OpCos'.
data.dtypes

### Parent

Convert Parent column to a boolean Has Parent column

In [29]:
# A problem has a parent exactly when the 'Parent' cell is populated. notna()
# tests that directly instead of inferring it from "value is not a real number".
data['Has Parent'] = data['Parent'].notna()
data = data.drop('Parent', axis=1)
data.head()

Unnamed: 0,Problem Manager,Number,Assignment group,Business duration,Business service,Created,Created by,Description,Duration,Impact,...,Short description,Type,Updated,Updated by,Updates,Urgency,Category,Company,Keywords - Short Desc,Has Parent
0,Naveen Kumar,PRB0062159,FSC_ITL3O2STIBCO,0,Computer,2018-12-03 12:44:49,Other,Description: Critical O-X---- 10:24:24 11/29/...,0,3,...,FSC- EIB- A validation error occurred for aler...,Standard,2019-02-14 02:46:37,3616934,19,3,Application,Other,"[validation error occurred, alert message comi...",False
1,Mark Duncan,PRB0060146,Other,1084035,Computer,2018-06-28 19:56:05,ID3667964,<EXECUTIVE NOTIFY: FEDEX-SEV3>\n06/28/18 16:07...,4636059,3,...,Multiple freight locations experienced issues ...,ITCC,2019-01-25 00:44:13,admin5041253,32,3,Application,FedEx Services,[multiple freight locations experienced issues...,True
2,Naveen Kumar,PRB0060758,FSC_ITL3O2STIBCO,0,Computer,2018-07-13 15:17:09,ID5069733,Issue : L2 support is receiving alerts which i...,0,3,...,FSC - TIBCO : RC672 - Tibco Alert - Failed to...,Standard,2019-01-25 00:44:34,admin5041253,33,3,Application,Other,"[call receiveorder api, tibco alert, parsing e...",True
3,Christophe Gurley,PRB0060985,Other,576000,Server,2018-07-15 18:14:43,ID5305519,FedEx Office center users at multiple centers ...,2406878,3,...,FedEx Office center users at multiple centers ...,ITCC,2019-01-25 00:44:42,admin5041253,24,3,Application,FedEx Services,"[fedex office center users, multiple centers]",True
4,Michael Kennemer,PRB0060147,Other,562179,Network,2018-06-29 07:15:36,Other,"Retail Phones, Payment switch and corporate pa...",2197479,3,...,"Retail Phones, Payment switch and corporate pa...",ITCC,2019-01-25 00:44:13,admin5041253,27,3,Application,FedEx Services,"[corporate payroll users impacted, retail phon...",True


### Convert categorical fields to be a category type

In [30]:
# Store every categorical field with the pandas 'category' dtype, then list the result.
category_cols = [
    'Assignment group',
    'Business service',
    'Category',
    'Company',
    'Created by',
    'Opened by',
    'Problem Manager',
    'Type',
    'Updated by',
]
for col_name in category_cols:
    data[col_name] = data[col_name].astype('category')
data.dtypes

Problem Manager                category
Number                           object
Assignment group               category
Business duration                 int64
Business service               category
Created                  datetime64[ns]
Created by                     category
Description                      object
Duration                          int64
Impact                            int64
Major Problem                      bool
Opened                   datetime64[ns]
Opened by                      category
Priority                          int64
Reassignment count                int64
Related Incidents                 int64
Short description                object
Type                           category
Updated                  datetime64[ns]
Updated by                     category
Updates                           int64
Urgency                           int64
Category                       category
Company                        category
Keywords - Short Desc            object


In [31]:
# Report the position of every column that still contains a missing value.
for position in range(len(data.columns)):
    if data.iloc[:, position].isna().any():
        print('col', position, 'has na')

col 7 has na


In [32]:
# Discard the free-text description and the 'Opened' timestamp.
data = data.drop(columns=['Description', 'Opened'])

Now all columns have a proper data type; let's drop the NaNs and look at our clean data set

In [None]:
# data = data.dropna()
# data

### Adding categorical time of day feature

In [33]:
from datetime import *
def convert_datetime(ts):
    """Bucket a timestamp into a coarse time-of-day label.

    Parameters
    ----------
    ts : pandas.Timestamp, datetime.datetime, or a missing value
        Any object exposing an integer ``hour`` attribute, or ``None`` / ``NaT``.

    Returns
    -------
    str or float
        ``'morning'`` for hours 5-11, ``'afternoon'`` for 12-16,
        ``'evening'`` for 17-20 and ``'night'`` for everything else (21-4).
        A missing timestamp yields ``np.nan``.
    """
    # NaT.hour is NaN, which fails every range test below and would otherwise
    # be reported as 'night'; None has no .hour at all. Surface both as missing.
    if pd.isna(ts):
        return np.nan

    h = ts.hour
    if 5 <= h < 12:
        return 'morning'
    if 12 <= h < 17:
        return 'afternoon'
    if 17 <= h < 21:
        return 'evening'
    return 'night'

# Derive the time-of-day label from 'Created', store it as a categorical and
# place it at column position 6.
created_time_of_day = data['Created'].apply(convert_datetime).astype('category')
data.insert(loc=6, column='Created Time', value=created_time_of_day)
data.head()

Unnamed: 0,Problem Manager,Number,Assignment group,Business duration,Business service,Created,Created Time,Created by,Duration,Impact,...,Short description,Type,Updated,Updated by,Updates,Urgency,Category,Company,Keywords - Short Desc,Has Parent
0,Naveen Kumar,PRB0062159,FSC_ITL3O2STIBCO,0,Computer,2018-12-03 12:44:49,afternoon,Other,0,3,...,FSC- EIB- A validation error occurred for aler...,Standard,2019-02-14 02:46:37,3616934,19,3,Application,Other,"[validation error occurred, alert message comi...",False
1,Mark Duncan,PRB0060146,Other,1084035,Computer,2018-06-28 19:56:05,evening,ID3667964,4636059,3,...,Multiple freight locations experienced issues ...,ITCC,2019-01-25 00:44:13,admin5041253,32,3,Application,FedEx Services,[multiple freight locations experienced issues...,True
2,Naveen Kumar,PRB0060758,FSC_ITL3O2STIBCO,0,Computer,2018-07-13 15:17:09,afternoon,ID5069733,0,3,...,FSC - TIBCO : RC672 - Tibco Alert - Failed to...,Standard,2019-01-25 00:44:34,admin5041253,33,3,Application,Other,"[call receiveorder api, tibco alert, parsing e...",True
3,Christophe Gurley,PRB0060985,Other,576000,Server,2018-07-15 18:14:43,evening,ID5305519,2406878,3,...,FedEx Office center users at multiple centers ...,ITCC,2019-01-25 00:44:42,admin5041253,24,3,Application,FedEx Services,"[fedex office center users, multiple centers]",True
4,Michael Kennemer,PRB0060147,Other,562179,Network,2018-06-29 07:15:36,morning,Other,2197479,3,...,"Retail Phones, Payment switch and corporate pa...",ITCC,2019-01-25 00:44:13,admin5041253,27,3,Application,FedEx Services,"[corporate payroll users impacted, retail phon...",True


In [34]:
# Discard both update-tracking columns in a single call.
data = data.drop(columns=['Updated', 'Updated by'])

Sort features alphabetically with problem number at the front

In [35]:
# 'Number' leads; every other column follows in alphabetical order.
remaining_columns = sorted(name for name in data.columns if name != 'Number')
data = data[['Number'] + remaining_columns]
data.head()

Unnamed: 0,Number,Assignment group,Business duration,Business service,Category,Company,Created,Created Time,Created by,Duration,...,Major Problem,Opened by,Priority,Problem Manager,Reassignment count,Related Incidents,Short description,Type,Updates,Urgency
0,PRB0062159,FSC_ITL3O2STIBCO,0,Computer,Application,Other,2018-12-03 12:44:49,afternoon,Other,0,...,False,Naveen Kumar,5,Naveen Kumar,1,15,FSC- EIB- A validation error occurred for aler...,Standard,19,3
1,PRB0060146,Other,1084035,Computer,Application,FedEx Services,2018-06-28 19:56:05,evening,ID3667964,4636059,...,True,D'Zundra Green,5,Mark Duncan,2,10,Multiple freight locations experienced issues ...,ITCC,32,3
2,PRB0060758,FSC_ITL3O2STIBCO,0,Computer,Application,Other,2018-07-13 15:17:09,afternoon,ID5069733,0,...,False,Admin-Arturo Reyes,5,Naveen Kumar,0,10,FSC - TIBCO : RC672 - Tibco Alert - Failed to...,Standard,33,3
3,PRB0060985,Other,576000,Server,Application,FedEx Services,2018-07-15 18:14:43,evening,ID5305519,2406878,...,True,Ashish Bisht,5,Christophe Gurley,2,9,FedEx Office center users at multiple centers ...,ITCC,24,3
4,PRB0060147,Other,562179,Network,Application,FedEx Services,2018-06-29 07:15:36,morning,Other,2197479,...,True,Brad Moore,5,Michael Kennemer,2,6,"Retail Phones, Payment switch and corporate pa...",ITCC,27,3


### Export most recently cleaned data to csv

In [36]:
# Write the cleaned frame to disk; index=False keeps the row index out of the file.
data.to_csv('problems_cleaned.csv', index=False)

In [37]:
# Start the preprocessed frame from the cleaned data without the creation
# timestamp and the two short-description text columns.
text_and_time_columns = ['Created', 'Short description', 'Keywords - Short Desc']
data_preproc = data.drop(columns=text_and_time_columns)

In [38]:
# Remove the problem identifier column from the preprocessed frame.
data_preproc = data_preproc.drop(columns='Number')

In [39]:
# Strip apostrophes from manager names. Series.replace() only swaps cells whose
# ENTIRE value equals the pattern, so the previous call changed nothing; the
# .str accessor performs the intended substring replacement (the same approach
# the notebook uses for 'Opened by'). regex=False makes the literal match
# explicit, and the final cast keeps the column categorical as it was before.
data_preproc['Problem Manager'] = (
    data_preproc['Problem Manager']
    .str.replace("'", '', regex=False)
    .astype('category')
)

In [40]:
# Remove apostrophes from the 'Opened by' names via the string accessor.
opened_by_without_quotes = data_preproc['Opened by'].str.replace("\'", '')
data_preproc['Opened by'] = opened_by_without_quotes

In [43]:
# Order matters: the CSV is written first, so it contains no 'Number' column.
data_preproc.to_csv('problems_preprocessed.csv', index=False)
# Re-attach the problem ID afterwards; the duration cells below select and merge on it.
# The assignment aligns on the row index - presumably still identical to data's,
# since data_preproc was derived from data by dropping columns only (TODO confirm:
# cells 41-42 are not visible here).
data_preproc['Number'] = data['Number']

### Get the mean duration for each problem, averaged over the problem itself and its related incidents (min/max are currently disabled)

In [44]:
# Long-format table of (problem number, duration) pairs: one row per related
# incident plus one row for the problem itself. Rows with a duration of 0 are
# left out on both sides (presumably "no duration recorded" - TODO confirm).
durations = (
    incidents.loc[incidents['Duration'] != 0, ['Problem', 'Duration']]
    .rename(columns={'Problem': 'Number'})
)
durations2 = data_preproc.loc[data_preproc['Duration'] != 0, ['Number', 'Duration']]

# pd.concat replaces DataFrame.append, which is deprecated and was removed in
# pandas 2.0; reset_index(drop=True) discards the old index directly instead of
# materialising it as an 'index' column and dropping that again.
durations = (
    pd.concat([durations, durations2], sort=False)
    .sort_values(by='Number', axis=0)
    .reset_index(drop=True)
)
durations.head()

Unnamed: 0,Number,Duration
0,PRB0060003,939522
1,PRB0060003,4009074
2,PRB0060003,691713
3,PRB0060006,590877
4,PRB0060014,16338265


In [45]:
# Average duration per problem number, broadcast back onto every row of that
# problem. (Max / Min / Range variants built the same way are currently disabled.)
mean_duration_by_number = durations.groupby(['Number'])['Duration'].mean()
durations['Duration Mean'] = durations['Number'].map(mean_duration_by_number)

# Keep one row per problem (the first occurrence) and discard the raw durations.
durations = durations.drop_duplicates(subset='Number').drop(columns='Duration')
durations.head()

Unnamed: 0,Number,Duration Mean
0,PRB0060003,1880103.0
3,PRB0060006,590877.0
4,PRB0060014,16338260.0
5,PRB0060019,3150236.0
7,PRB0060026,3820638.0


In [52]:
# Attach the per-problem mean duration, keep only problems that have one, and
# drop the identifier together with the raw duration columns before exporting.
durationsdata = (
    data_preproc
    .merge(durations, how='left', on='Number')
    .dropna(subset=['Duration Mean'])
    .drop(columns=['Number', 'Business duration', 'Duration'])
)
durationsdata.to_csv('problems_durations.csv', index=False)

In [53]:
# Categories removed from the duration data set before the second export.
excluded_categories = [
    'Network',
    'Inquiry / Help',
    'Infrastructure',
    'Security',
    'Environment',
    'Telephony',
]
in_excluded_category = durationsdata['Category'].isin(excluded_categories)
durationsdata = durationsdata[~in_excluded_category]
durationsdata.to_csv('problems_durations_2cats.csv', index=False)

In [None]:
# Down-sample the 'Application' class to 1.5x the size of the largest other
# category; every non-'Application' row is kept.
# NOTE(review): data_preproc carries the 'Number' identifier again at this point
# (it is re-attached right after the preprocessed export), so it flows into the
# subsampled CSVs - confirm that is intended.
is_application = data_preproc['Category'] == 'Application'
subsampled = data_preproc[~is_application]

# Row count of the most frequent remaining category.
second_freq = subsampled['Category'].value_counts().max()

application_rows = data_preproc[is_application]
# sample() raises when asked for more rows than exist, so cap the request.
n_application = min(int(second_freq * 1.5), len(application_rows))
# Fixed seed so the same rows are drawn on every run.
subsampled2 = application_rows.sample(n_application, random_state=42)

# pd.concat replaces DataFrame.append (deprecated, removed in pandas 2.0).
final_data = pd.concat([subsampled, subsampled2])
final_data

In [None]:
# Shuffle all rows (frac=1) and renumber them from 0. The fixed seed makes the
# row order repeatable for a given final_data instead of changing on every run.
final_data = final_data.sample(frac=1, random_state=42).reset_index(drop=True)
final_data.to_csv('problems_subsampled.csv', index=False)

In [55]:
# Categories filtered out of the reduced-category export.
removed_categories = [
    'Network',
    'Inquiry / Help',
    'Infrastructure',
    'Security',
    'Environment',
    'Telephony',
]

twocats = data_preproc.drop(columns=['Number', 'Business duration', 'Duration'])
# Round-trip through object dtype so the rebuilt categorical lists only the
# labels that survive the filter.
twocats['Category'] = twocats['Category'].astype('object')
twocats = twocats[~twocats['Category'].isin(removed_categories)]
twocats['Category'] = twocats['Category'].astype('category')
twocats.to_csv('problems_2categories.csv', index=False)

In [None]:
# Binary relabelling of the subsampled data: 'Application' stays as is, every
# other value becomes 'Other'.
twocats_sub = final_data.copy()
category_labels = twocats_sub['Category'].astype('object')
binary_labels = category_labels.where(category_labels == 'Application', 'Other')
twocats_sub['Category'] = binary_labels.astype('category')
twocats_sub.to_csv('problems_2categories_subsampled.csv', index=False)

In [None]:
# Display the relabelled, subsampled frame as the cell output for a visual check.
twocats_sub