In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_validate,cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics, svm
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import seaborn
from matplotlib import pyplot
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN
from sklearn.feature_selection import chi2, SelectKBest, SelectFdr
from sklearn.decomposition import PCA
from rake_nltk import Rake

### Preprocessing

Import data as data frame

In [None]:
# Read the problems export into a DataFrame and display it.
problemfile = 'problems_2019-03-21.xlsx'
df = pd.read_excel(io=problemfile)
df

In [None]:
# Keep only columns that are at least 90% populated. This also removes columns
# that are entirely empty, so no separate how='all' pass is needed.
min_non_null = int(np.ceil(df.shape[0] * 0.90))
data = df.dropna(axis=1, thresh=min_non_null)

# Drop columns that carry no information (fewer than two distinct non-null values).
constant_cols = [col for col in data.columns if data[col].nunique() < 2]
data = data.drop(columns=constant_cols)

# Drop the state-related columns; errors='ignore' avoids a KeyError when one of
# them has already been removed by the pruning steps above.
data = data.drop(columns=['Active', 'Known error', 'Problem state', 'State'], errors='ignore')

# Bring 'Parent' back from the raw frame in case the sparsity filter removed it.
data['Parent'] = df['Parent']
data.to_csv('problem_pdsm_simple.csv', index=False)

### Extract keywords from descriptions in full dataset
Adds a Keywords feature, extracted from each problem's Short description, to the dataset above

In [None]:
def extract_keywords(text):
    """Return up to four top-ranked RAKE key phrases found in ``text``.

    Parameters
    ----------
    text : str
        Free-text description to analyse.

    Returns
    -------
    list
        The first four phrases from ``Rake.get_ranked_phrases()``, using
        phrases of 2 to 8 words. An empty list is returned when ``text`` is
        not a string (e.g. NaN for a missing description) or is blank.
    """
    # Missing descriptions arrive as float NaN; RAKE cannot tokenise them.
    if not isinstance(text, str) or not text.strip():
        return []
    r = Rake(min_length=2, max_length=8)
    r.extract_keywords_from_text(text)
    return r.get_ranked_phrases()[0:4]
# Store the ranked key phrases of every short description in a new column.
data['Keywords'] = data['Short description'].map(extract_keywords)

In [None]:
# Order the columns alphabetically by name, then preview the first rows.
data = data.reindex(columns=sorted(data.columns))
data.head()

#### Load incidents dataset

In [None]:
# Read the incidents export and preview the first rows.
incidents = pd.read_excel(io='incidents_2019-03-21.xlsx')
incidents.head(5)

### Add assignment group from incidents to problems dataset

In [None]:
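# NOTE(review): this merge is disabled, but later cells read data['Assignment group'] — confirm the column already exists in the problems export, or re-enable this cell.
# assign_groups = incidents[['Problem', 'Assignment group']]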
# assign_groups = assign_groups.rename(columns={'Problem':'Number'}).drop_duplicates('Number')
# # assign_groups.head()
# data = data.merge(assign_groups, how='left', on='Number')
# data.head()

### Get avg, min, and max duration of related incidents for each problem

In [None]:
# Incident durations keyed by the problem each incident is linked to. The key
# column is renamed to 'Number' so it lines up with the problems frame.
#
# The problem-level 'Duration' is deliberately not merged in here: both frames
# have a 'Duration' column, so a merge would rename them to 'Duration_x' and
# 'Duration_y', and the per-problem statistics computed on 'Duration' in the
# next cell would raise a KeyError.
durations = (
    incidents[['Problem', 'Duration']]
    .rename(columns={'Problem': 'Number'})
    .dropna(subset=['Number'])  # incidents not linked to any problem cannot be matched
)
durations.head()

In [None]:
# Per-problem max / mean / min of the incident durations, broadcast back onto
# every incident row through the problem number.
duration_by_problem = durations.groupby(['Number'])['Duration']
per_problem_stats = [
    ('Duration Max', duration_by_problem.max()),
    ('Duration Mean', duration_by_problem.mean()),
    ('Duration Min', duration_by_problem.min()),
]
for stat_name, stat_values in per_problem_stats:
    durations[stat_name] = durations.Number.map(stat_values)

# Spread between the longest and shortest related incident.
durations['Duration Range'] = durations['Duration Max'] - durations['Duration Min']

# Collapse to one row per problem and discard the per-incident duration.
durations = durations.drop_duplicates('Number').drop('Duration', axis=1)
durations.head()

In [None]:
# Problem number paired with the problem's own 'Duration' value.
durations2 = data.loc[:, ['Number', 'Duration']]

In [None]:
# Attach the per-problem incident duration statistics; problems without a
# matching row in `durations` are kept (left join) with NaN statistics.
data = pd.merge(data, durations, how='left', on='Number')
data.head()

### Find similar keywords within related incidents

## Clean the data

Find out default types for the columns

In [None]:
# List the dtype pandas inferred for each column of `data`.
data.dtypes

### Number column

Remove the PRB prefix from the Number column and convert it to a number

In [None]:
# Remove the literal 'PRB' prefix and convert the remainder to a number.
# An anchored regex is used because str.lstrip('PRB') strips any leading run of
# the characters P, R and B rather than the prefix itself.
number_raw = data['Number']
number_text = number_raw.astype(str).str.replace('^PRB', '', regex=True)
# astype(str) turns missing values into the text 'nan'; restore them as NaN.
# Going through str also makes the cell safe to re-run on an already numeric column.
data['Number'] = pd.to_numeric(number_text.where(number_raw.notna()))
data.head()

In [None]:
# Re-check the column dtypes after converting 'Number'.
data.dtypes

### Priority, Impact, and Urgency

Take the first character (number) from each and convert to numeric

In [None]:
# Keep only the leading character of each value and convert it to a number.
for level_col in ['Priority', 'Impact', 'Urgency']:
    # .str[0] leaves missing values as NaN, whereas indexing x[0] on a float
    # NaN raises TypeError.
    data[level_col] = pd.to_numeric(data[level_col].str[0])

data.head()

In [None]:
# Re-check the column dtypes after converting Priority, Impact and Urgency.
data.dtypes

### Categorical Fields

Check out category counts

In [None]:
# Number of rows per distinct 'Business service' value (NaN is excluded by default).
data['Business service'].value_counts()

In [None]:
# Number of rows per distinct 'Category' value (NaN is excluded by default).
data['Category'].value_counts()

In [None]:
# Number of rows per distinct 'Company' value (NaN is excluded by default).
data['Company'].value_counts()

In [None]:
# Number of rows per distinct 'Problem Manager' value (NaN is excluded by default).
data['Problem Manager'].value_counts()

In [None]:
# Number of rows per distinct 'Type' value (NaN is excluded by default).
data['Type'].value_counts()

In [None]:
# Number of rows per distinct 'Assignment group' value (NaN is excluded by default).
# NOTE(review): the cell that would merge 'Assignment group' in from the incidents
# data is commented out earlier in this notebook — confirm the column exists in
# the problems export, otherwise this raises KeyError.
data['Assignment group'].value_counts()

Replace values that occur fewer than 10 times in a column with a single "Other" category

In [None]:
# Values seen fewer than this many times in a column are relabelled 'Other'.
MIN_CATEGORY_COUNT = 10
RARE_LABEL_COLUMNS = ['Business service', 'Company', 'Problem Manager', 'Type', 'Assignment group']

for cat_col in RARE_LABEL_COLUMNS:
    if cat_col not in data.columns:
        continue  # absent columns are skipped rather than raising
    # Frequency of each row's own value; NaN rows map to NaN, so they are never masked.
    value_frequency = data[cat_col].map(data[cat_col].value_counts())
    data[cat_col] = data[cat_col].mask(value_frequency < MIN_CATEGORY_COUNT, 'Other')

data.head()

Convert categorical fields to be a category type

In [None]:
# Store the label-valued columns with pandas' categorical dtype.
for label_col in ('Business service', 'Category', 'Company',
                  'Problem Manager', 'Type', 'Assignment group'):
    data[label_col] = data[label_col].astype('category')
data.head()

In [None]:
# Re-check the column dtypes after the category conversion.
data.dtypes

### Created

Convert the Created column to a datetime type

In [None]:
# Replace 'Created' with its nanosecond-precision datetime representation.
created_as_datetime = data['Created'].astype('datetime64[ns]')
data = data.assign(Created=created_as_datetime)
data.head()

In [None]:
# Re-check the column dtypes after converting 'Created'.
data.dtypes

### Created by

Convert the Created by column to a number

In [None]:
# Probe which 'Created by' values are not plain numbers. errors='coerce' turns
# unparseable values into NaN instead of raising, so the notebook keeps running,
# and the column itself is left untouched for the clean-up in the next cell.
created_by_probe = pd.to_numeric(data['Created by'], errors='coerce')
not_numeric = created_by_probe.isna() & data['Created by'].notna()
data.loc[not_numeric, 'Created by'].value_counts().head()

Looks like there may be some that have "admin" before the id number. I am just going to remove this prefix.

In [None]:
# Remove a literal leading 'admin' and convert the remainder to a number.
# An anchored regex is used because str.lstrip('admin') strips any leading run
# of the characters a, d, m, i and n rather than the prefix itself.
created_by_raw = data['Created by']
created_by_text = created_by_raw.astype(str).str.replace('^admin', '', regex=True)
# astype(str) copes with cells that are already numeric but turns missing
# values into the text 'nan'; restore those as NaN before converting.
data['Created by'] = pd.to_numeric(created_by_text.where(created_by_raw.notna()))
data.head()

In [None]:
# Re-check the column dtypes after converting 'Created by'.
data.dtypes

### Impacted OpCos

Split Impacted OpCos column into list of OpCos instead of string

In [None]:
# First attempt at splitting the comma-separated string into a list. Values
# that are not strings have no .split, so the failure is caught and shown
# instead of stopping a Run All; the column is only replaced when every value
# could be split.
try:
    data['Impacted OpCos'] = data['Impacted OpCos'].map(lambda x: x.split(","))
except AttributeError as split_error:
    print('Split failed:', split_error)

Weird... The split failed because at least one value is a float rather than a string. Let's look for that float.

In [None]:
# Show the rows whose 'Impacted OpCos' value np.isreal reports as a real number.
data.loc[data['Impacted OpCos'].map(np.isreal)]

Looks like the NaNs are the problem. Convert NaNs to empty strings.

In [None]:
# Replace missing values with an empty string. fillna is the direct API for
# this; a regex-mode replace has no meaning for a NaN pattern.
data['Impacted OpCos'] = data['Impacted OpCos'].fillna('')
# Any row listed here still holds a non-string value and would break the split.
data[~data['Impacted OpCos'].map(lambda value: isinstance(value, str))]

There are no longer any NaNs, so let's try the split again

In [None]:
# Turn each comma-separated string into a list of its parts.
data['Impacted OpCos'] = data['Impacted OpCos'].apply(lambda opcos: opcos.split(","))
data.head()

Now we have the list, but we can't do anything with the list. Let's convert to dummy variables.

In [None]:
# One indicator column per list entry, summed back to one row per problem.
# groupby(level=0).sum() replaces .sum(level=0), which pandas deprecated and
# removed in 2.0; the result is the same.
opco_entries = data['Impacted OpCos'].apply(pd.Series).stack()
pd.get_dummies(opco_entries).groupby(level=0).sum().head()

That will take a lot of further processing, so I am going to drop that for now.

In [None]:
# Remove the list-valued column and preview the remaining frame.
data = data.drop(columns=['Impacted OpCos'])
data.head()

In [None]:
# Re-check the column dtypes after dropping 'Impacted OpCos'.
data.dtypes

### Parent

Convert Parent column to a boolean Has Parent column

In [None]:
# A problem has a parent when its 'Parent' cell is populated. notna() states
# this directly, instead of inferring it from whether np.isreal accepts the value.
data['Has Parent'] = data['Parent'].notna()
data = data.drop('Parent', axis=1)
data.head()

In [None]:
# Re-check the column dtypes after replacing 'Parent' with 'Has Parent'.
data.dtypes

Now all columns have a proper data type; let's drop the NaNs and look at our clean data set

In [None]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how='any')
data

### Adding categorical time of day feature

In [None]:
from datetime import *
def convert_datetime(ts):
    """Map a timestamp to a coarse time-of-day label based on its ``hour``.

    Hours 5-11 give 'morning', 12-16 'afternoon', 17-20 'evening';
    anything else gives 'night'.
    """
    hour_of_day = ts.hour
    if 5 <= hour_of_day < 12:
        return 'morning'
    if 12 <= hour_of_day < 17:
        return 'afternoon'
    if 17 <= hour_of_day < 21:
        return 'evening'
    return 'night'

# Label every 'Created' timestamp with its time of day and place the resulting
# categorical column at position 6.
data.insert(
    loc=6,
    column='Created Time',
    value=data['Created'].apply(convert_datetime).astype('category'),
)
data.head()

### Export most recently cleaned data to csv

In [None]:
# Write the cleaned frame to disk without the index column.
data.to_csv(path_or_buf='problems_cleaned.csv', index=False)

### Clean up dataset with NAs

In [None]:
# Load the variant of the problems data that still contains missing values.
# NOTE(review): no cell shown above writes 'problems_cleaned_withna.csv' (only
# 'problem_pdsm_simple.csv' and 'problems_cleaned.csv' are written) — confirm
# where this file comes from.
prob_data = pd.read_csv(filepath_or_buffer='problems_cleaned_withna.csv')
prob_data

In [None]:
# Boolean frame marking which cells of `prob_data` are missing.
pd.isna(prob_data)