In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_validate,cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics, svm
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import seaborn
from matplotlib import pyplot
import matplotlib.pyplot as plt
from matplotlib import rcParams
from sklearn.cluster import KMeans, DBSCAN
from sklearn.feature_selection import chi2, SelectKBest, SelectFdr
from sklearn.decomposition import PCA
from rake_nltk import Rake

Load the data into a pandas DataFrame

In [None]:
# The input CSV is read relative to the notebook's working directory.
simplefile = "problems_preprocessed.csv"

# Parse the file into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer=simplefile)
data.head(n=5)

In [None]:
# Display the column labels of the loaded frame.
data.columns

## Exploratory Data Analysis

In [None]:
# Bar-style categorical plot: 'Type' on x, 'Impact' on y, one bar color per 'Category'.
# The trailing semicolon suppresses the returned object's repr in the cell output.
seaborn.catplot(
    data=data,
    kind="bar",
    x="Type",
    y="Impact",
    hue="Category",
);

In [None]:
# Global plot settings: enable matplotlib's automatic layout and enlarge seaborn fonts.
rcParams.update({'figure.autolayout': True})
seaborn.set(font_scale=3)

# Two stacked panels: duration per Category (top) and per Business service (bottom).
fig, ax = plt.subplots(2, 1, figsize=(30, 20))
fig.subplots_adjust(wspace=.5, hspace=.2)

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
g1 = seaborn.boxplot(x='Duration Mean', y='Category', data=data, ax=ax[0])
g1.set_xscale('log')  # durations are shown on a logarithmic axis
g1.set_title('Problem Duration by Category and Business service')
g2 = seaborn.boxplot(x='Duration Mean', y='Business service', data=data, ax=ax[1])
g2.set_xscale('log')

fig.savefig("DurationByCat-Service-logscale.png")

In [None]:
# Box plot of 'Duration Mean' for each 'Problem Manager', saved as a PNG.
seaborn.set(font_scale=2)
fig, ax = plt.subplots(figsize=(28, 12))
fig.subplots_adjust(wspace=.5, hspace=.2)

# x/y are passed by keyword (seaborn >= 0.12 rejects positional x/y) and the
# target axes is named explicitly instead of relying on the current axes.
g = seaborn.boxplot(x='Duration Mean', y='Problem Manager', data=data, ax=ax)
ax.set_title('Durations by Problem Manager')
fig.savefig("DurationByManager.png")

In [None]:
# Two stacked panels: 'Updates' per Category (top) and per Business service (bottom).
seaborn.set(font_scale=3)
fig, ax = plt.subplots(2, 1, figsize=(30, 20))
fig.subplots_adjust(wspace=.5, hspace=.2)

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
g1 = seaborn.boxplot(x='Updates', y='Category', data=data, ax=ax[0])
g1.set_title('Update Count by Category and Business service')
g2 = seaborn.boxplot(x='Updates', y='Business service', data=data, ax=ax[1])

fig.savefig("UpdatesByCat-Service.png")

In [None]:
# Box plot of 'Updates' for each 'Problem Manager', saved as a PNG.
seaborn.set(font_scale=2)
fig, ax = plt.subplots(figsize=(28, 12))
fig.subplots_adjust(wspace=.5, hspace=.2)

# x/y are passed by keyword (seaborn >= 0.12 rejects positional x/y) and the
# target axes is named explicitly instead of relying on the current axes.
g = seaborn.boxplot(x='Updates', y='Problem Manager', data=data, ax=ax)
ax.set_title('Updates by Problem Manager')
fig.savefig("UpdatesByManager.png")

In [None]:
# Two stacked panels: 'Reassignment count' per Category (top) and per Business service (bottom).
seaborn.set(font_scale=3)
fig, ax = plt.subplots(2, 1, figsize=(30, 20))
fig.subplots_adjust(wspace=.5, hspace=.2)

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
g1 = seaborn.boxplot(x='Reassignment count', y='Category', data=data, ax=ax[0])
g1.set_title('Reassignment Count by Category and Business service')
g2 = seaborn.boxplot(x='Reassignment count', y='Business service', data=data, ax=ax[1])

fig.savefig("ReassignCountByCat-Service.png")

In [None]:
# Number of rows recorded for each distinct 'Problem Manager' value.
data.loc[:, 'Problem Manager'].value_counts()

In [None]:
# Re-apply seaborn's theme with a font scale of 2 for the figures drawn below.
seaborn.set(font_scale=2)

In [None]:
# Two stacked bar charts of 'Duration Mean': per Business service (top) and per Category (bottom).
fig, ax = plt.subplots(2, 1, figsize=(20, 15))
fig.subplots_adjust(wspace=.5, hspace=.2)

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
# errwidth=0 draws the error bars with zero line width, which hides them.
seaborn.barplot(x='Duration Mean', y='Business service', errwidth=0, data=data, ax=ax[0])
seaborn.barplot(x='Duration Mean', y='Category', errwidth=0, data=data, ax=ax[1]);

In [None]:
# Two stacked bar charts of 'Duration Mean': per Problem Manager (top) and per Assignment group (bottom).
fig, ax = plt.subplots(2, 1, figsize=(20, 15))
fig.subplots_adjust(wspace=.5, hspace=.2)

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
# errwidth=0 draws the error bars with zero line width, which hides them.
seaborn.barplot(x='Duration Mean', y='Problem Manager', errwidth=0, data=data, ax=ax[0])
seaborn.barplot(x='Duration Mean', y='Assignment group', errwidth=0, data=data, ax=ax[1]);

In [None]:
# Grouped histogram of 'Duration Mean', one bar series per problem category.

# The style is set before the figure is created: seaborn styles only apply to
# axes created after the call.
seaborn.set_style("dark")
fig, ax = plt.subplots(1, 1, figsize=(20, 15))
fig.subplots_adjust(wspace=.5, hspace=.2)

# Categories to compare and the color assigned to each (same order).
names = ['Application', 'Network', 'Hardware', 'Environment', 'Security',
         'Infrastructure', 'Inquiry / Help', 'Telephony', 'Other']
colors = ['#0039A6', '#FF6319', '#6CBE45', '#996633', '#FCCC0A', '#EE352E', '#B933AD', '#00A1DE', '#00933C']

# One list of durations per category, in the order of `names`.
# Missing values are dropped because NaNs make the histogram binning fail.
durations_by_category = [
    data.loc[data['Category'] == name, 'Duration Mean'].dropna().tolist()
    for name in names
]

# Six bins, raw counts per bin. `density` replaces the `normed` keyword,
# which matplotlib removed in version 3.1.
ax.hist(durations_by_category, bins=6, density=False,
        color=colors, label=names)

ax.legend()
ax.set_xlabel('Duration Mean')
# Labels describe raw counts, matching density=False.
ax.set_ylabel('Count')
ax.set_title('Duration by Category')
fig.savefig("DurationByCategory.png")

In [None]:
# Grouped histogram of 'Reassignment count', one bar series per problem category.

# The style is set before the figure is created: seaborn styles only apply to
# axes created after the call.
seaborn.set_style("dark")
fig, ax = plt.subplots(1, 1, figsize=(20, 15))
fig.subplots_adjust(wspace=.5, hspace=.2)

# Categories to compare and the color assigned to each (same order).
names = ['Application', 'Network', 'Hardware', 'Environment', 'Security',
         'Infrastructure', 'Inquiry / Help', 'Telephony', 'Other']
colors = ['#0039A6', '#FF6319', '#6CBE45', '#996633', '#FCCC0A', '#EE352E', '#B933AD', '#00A1DE', '#00933C']

# One list of reassignment counts per category, in the order of `names`.
# Missing values are dropped because NaNs make the histogram binning fail.
reassignments_by_category = [
    data.loc[data['Category'] == name, 'Reassignment count'].dropna().tolist()
    for name in names
]

# Six bins, raw counts per bin. `density` replaces the `normed` keyword,
# which matplotlib removed in version 3.1.
ax.hist(reassignments_by_category, bins=6, density=False,
        color=colors, label=names)

ax.legend()
ax.set_xlabel('Reassignment Count')
ax.set_ylabel('Count')
ax.set_title('Reassignment Count')
fig.savefig("ReassignmentCountByCategory.png")

In [None]:
# Kernel density estimate of 'Updates', one curve per problem category.

# The style is set before the figure is created: seaborn styles only apply to
# axes created after the call.
seaborn.set_style("dark")
fig, ax = plt.subplots(1, 1, figsize=(20, 15))
fig.subplots_adjust(wspace=.5, hspace=.2)

# 'Other' is not part of this list, so no curve is drawn for it.
categories = ['Application', 'Network', 'Hardware', 'Environment', 'Security',
         'Infrastructure', 'Inquiry / Help', 'Telephony']

colors = ['#0039A6', '#FF6319', '#6CBE45', '#996633', '#FCCC0A', '#EE352E', '#B933AD', '#00A1DE', '#00933C']

# zip() stops at the shorter list, so the ninth color stays unused here.
for cat, color in zip(categories, colors):
    # Rows belonging to the current category.
    subset = data[data['Category'] == cat]

    # kdeplot replaces the deprecated distplot(hist=False, kde=True) call.
    seaborn.kdeplot(subset['Updates'], linewidth=3, color=color,
                    label=cat, ax=ax)

ax.legend(prop={'size': 16}, title='Category')
ax.set_xlabel('Num Updates')
# The curves are density estimates, not counts.
ax.set_ylabel('Density')
ax.set_title('Count of Updates by Category')
fig.savefig("UpdatesByCategory.png")

In [None]:
# Kernel density estimate of 'Reassignment count', one curve per problem category.

# The style is set before the figure is created: seaborn styles only apply to
# axes created after the call.
seaborn.set_style("dark")
fig, ax = plt.subplots(1, 1, figsize=(20, 15))
fig.subplots_adjust(wspace=.5, hspace=.2)

# 'Other' is not part of this list, so no curve is drawn for it.
categories = ['Application', 'Network', 'Hardware', 'Environment', 'Security',
         'Infrastructure', 'Inquiry / Help', 'Telephony']

colors = ['#0039A6', '#FF6319', '#6CBE45', '#996633', '#FCCC0A', '#EE352E', '#B933AD', '#00A1DE', '#00933C']

# zip() stops at the shorter list, so the ninth color stays unused here.
for cat, color in zip(categories, colors):
    # Rows belonging to the current category.
    subset = data[data['Category'] == cat]

    # kdeplot replaces the deprecated distplot(hist=False, kde=True) call.
    seaborn.kdeplot(subset['Reassignment count'], linewidth=3, color=color,
                    label=cat, ax=ax)

ax.legend(prop={'size': 16}, title='Category')
ax.set_xlabel('Num Reassignments')
# The curves are density estimates, not counts.
ax.set_ylabel('Density')
ax.set_title('Reassignment Count by Category')
fig.savefig("ReassignCountByCategory.png")

In [None]:
# 3x2 grid of violin plots: 'Duration Mean' distribution per grouping column.

# The style is set before the figure is created: seaborn styles only apply to
# axes created after the call.
seaborn.set_style("dark")
fig, ax = plt.subplots(3, 2, figsize=(20, 15))
fig.subplots_adjust(wspace=.5, hspace=.2)

# Grouping columns in row-major panel order: (0,0), (0,1), (1,0), (1,1), (2,0), (2,1).
group_columns = ['Business service', 'Category', 'Problem Manager',
                 'Assignment group', 'Company', 'Created Time']

for panel, column in zip(ax.flat, group_columns):
    # x/y are passed by keyword (seaborn >= 0.12 rejects positional x/y).
    # The former errwidth=0 argument is dropped: it is a barplot option that
    # violinplot does not define.
    seaborn.violinplot(x='Duration Mean', y=column, data=data, ax=panel)

In [None]:
# Pairwise correlation of the numeric columns, rendered as a heatmap.
# Non-numeric columns are excluded explicitly: pandas >= 2.0 no longer drops
# them silently in DataFrame.corr() and raises on string data instead.
corr = data.select_dtypes(include=['number', 'bool']).corr()
seaborn.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values);