In [89]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_validate,cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics, svm
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import seaborn
from matplotlib import pyplot
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN
from sklearn.feature_selection import chi2, SelectKBest, SelectFdr
from sklearn.decomposition import PCA
from rake_nltk import Rake

## Import the data as a DataFrame

In [2]:
# Input CSV, resolved relative to the notebook's working directory.
simplefile = "problem_pdsm_simple.csv"

# Read the export into a DataFrame and preview the first five rows.
data = pd.read_csv(filepath_or_buffer=simplefile)
data.head(5)

Unnamed: 0,Number,Priority,Business service,Category,Company,Created,Created by,Impact,Impacted OpCos,Major Problem,Parent,Problem Manager,Related Incidents,Type,Urgency
0,PRB0063212,5 - Planning,Server,Hardware,FedEx Services,2019-02-13 08:47:23,5163721,3 - Low,FedEx Services,True,INC011242667,Christopher Barber,1,ITCC,3 - Low
1,PRB0063211,5 - Planning,Database,Application,,2019-02-13 08:49:50,3619116,3 - Low,FedEx Express APAC,False,,Sathya Easwaran,0,Standard,3 - Low
2,PRB0063210,5 - Planning,Computer,Hardware,FedEx Services,2019-02-13 06:51:16,5163721,3 - Low,FedEx Ground,True,INC011241950,Divakar Durgapal,1,ITCC,3 - Low
3,PRB0063205,5 - Planning,Switch,Network,FedEx Services,2019-02-12 22:00:36,3667964,3 - Low,FedEx Ground,True,INC011240331,Christopher Barber,1,ITCC,3 - Low
4,PRB0063204,5 - Planning,Server,Hardware,FedEx Services,2019-02-12 21:03:05,836059,3 - Low,"FedEx Freight Inc., FedEx Office",True,INC011239460,Christopher Barber,1,ITCC,3 - Low


## Set up for modeling

### Split up dependent and independent variables

In [None]:
# Separate the candidate features from the target column ('Category').
x = data.drop(columns='Category')
y = data.loc[:, 'Category']

### Dummy variables for categories

In [None]:
# One-hot encode the listed nominal columns; every other column is passed through unchanged.
# NOTE: this rebinds `x` from itself, so the cell cannot be re-run without first
# re-running the split cell above (the listed columns no longer exist afterwards).
x = pd.get_dummies(
    data=x,
    columns=['Business service', 'Company', 'Problem Manager', 'Type'],
)
x.head(5)

## Model using K-means

In [None]:
# Baseline K-means clustering on the full encoded feature frame.
# random_state pins the centroid initialisation so the cluster labels (and any score
# derived from them) are reproducible; 100 matches the seed used for the trees below.
# NOTE(review): `x` still contains the raw 'Created' column at this point (it is only
# dropped in the next cell) and KMeans requires numeric input — confirm this cell is
# expected to run as-is.
model = KMeans(n_clusters=8, random_state=100)
model.fit(x)

In [None]:
# Feature frame without the 'Created' column, used for the clustering and PCA below.
smallX = x.drop(columns=['Created'])

In [None]:
# K-means clustering on the reduced feature frame (without 'Created').
# random_state pins the centroid initialisation so the labels stored in the next cell,
# and the adjusted Rand score computed from them, are reproducible across runs.
# NOTE(review): the data preview earlier in the notebook shows other string-valued
# columns (e.g. Priority, Impact, Urgency) that are not dummy-encoded — confirm they
# are numeric in the frame actually passed here.
model = KMeans(n_clusters=8, random_state=100)
model.fit(smallX)

In [None]:
# Attach the cluster id assigned by the most recently fitted `model` to each row.
# This mutates `data` in place by adding a 'kmeans' column.
data['kmeans'] = model.labels_

In [None]:
# Agreement between the true categories and the K-means cluster ids
# (adjusted Rand index; invariant to how the cluster ids are numbered).
metrics.adjusted_rand_score(
    labels_true=data['Category'],
    labels_pred=data['kmeans'],
)

In [None]:
# Fit PCA with the library-default settings on the reduced feature frame so the
# variance captured per component can be inspected in the next cell.
pca = PCA()
pca.fit(smallX)

In [None]:
# Fraction of the total variance explained by each component of the fitted `pca`.
pca.explained_variance_ratio_

In [None]:
# Refit PCA keeping only two components; this rebinds `pca`, replacing the
# default-settings fit from the earlier cell.
pca = PCA(n_components=2)
pca.fit(smallX)

In [None]:
# Fraction of the total variance explained by each component of the currently bound `pca`.
pca.explained_variance_ratio_

In [None]:
# Project the reduced feature frame onto the components of the currently fitted `pca`.
transformed = pca.transform(smallX)

In [None]:
# K-means clustering in the PCA-projected space.
# random_state pins the centroid initialisation so the stored labels are reproducible
# across runs; 100 matches the seed used for the decision trees in this notebook.
km = KMeans(n_clusters=8, random_state=100)
km.fit(transformed)

# Keep the PCA-space cluster ids next to the raw-space ones (mutates `data` in place).
data['pca'] = km.labels_

In [None]:
# Agreement between the true categories and the clusters found in PCA space.
# The original cell re-scored data['kmeans'], so the 'pca' labels computed just
# above were never evaluated; score the 'pca' column instead.
metrics.adjusted_rand_score(data['Category'], data['pca'])

## Classify using a decision tree

In [83]:
# Discard the columns that are not used as classifier features, then list the
# remaining column dtypes.
# NOTE(review): several of these columns ('Duration Max', 'Keywords', ...) do not appear
# in the CSV preview at the top of the notebook — `data` presumably comes from a
# preparation step not shown here; confirm. The cell is also not re-runnable once the
# columns are gone (drop raises KeyError for missing labels).
data = data.drop(
    columns=['Created', 'Duration Max', 'Duration Min', 'Duration Range',
             'Short description', 'Keywords'],
)
data.dtypes

Number                  int64
Priority                int64
Business service     category
Category             category
Company              category
Created Time         category
Created by              int64
Impact                  int64
Major Problem            bool
Problem Manager      category
Related Incidents       int64
Type                 category
Urgency                 int64
Incidents Count         int64
Assignment group     category
Duration Mean         float64
Has Parent               bool
dtype: object

In [90]:
# Move 'Number' into the row index so it is excluded from the feature matrix built
# from `.values` below.
# NOTE: not idempotent — re-running raises KeyError once 'Number' is already the index.
data = data.set_index('Number')

In [96]:
# Feature matrix: every remaining column except the target, as a NumPy array.
X = data.drop(columns='Category').values
X

array([[5, 'Server', 'FedEx Services', ..., 'Other', 3346.0, True],
       [5, 'Computer', 'FedEx Services', ..., 'Other', 7250.0, True],
       [5, 'Switch', 'FedEx Services', ..., 'FXS_SPT_GLB_NOCL1', 6856.0,
        True],
       ...,
       [5, 'Database', 'FedEx Services', ..., 'Other', 3885203.846153846,
        True],
       [5, 'Scanner', 'FedEx Services', ..., 'Other', 9075.0, True],
       [5, 'Communication Device', 'FedEx Express', ..., 'Other',
        2474298.0, True]], dtype=object)

In [99]:
# Replace every feature column with integer codes, overwriting X in place.
# A single encoder is refit per column, so only the mapping of the last fitted
# column remains available on `le` afterwards.
le = preprocessing.LabelEncoder()
for col in range(X[0].size):
    X[:, col] = le.fit_transform(X[:, col])

In [100]:
# Target vector: the 'Category' column's underlying values (displayed below).
y = data['Category'].values
y

[Hardware, Hardware, Network, Hardware, Network, ..., Application, Application, Application, Hardware, Hardware]
Length: 688
Categories (8, object): [Application, Environment, Hardware, Infrastructure, Inquiry / Help, Network, Security, Telephony]

In [115]:
# Encode the target categories as integers. This refits the same `le` instance that
# was used for the feature columns, so after this cell `le` holds the target mapping.
y = le.fit_transform(y)

In [117]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=100,
)

# Shallow, regularised tree (depth <= 5, at least 5 samples per leaf) using Gini impurity.
tree_gini = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_leaf=5,
    random_state=100,
)
tree_gini.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')

In [119]:
# Same tree configuration as the Gini model, but splitting on information gain (entropy).
tree_entropy = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    min_samples_leaf=5,
    random_state=100,
)
tree_entropy.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')

In [122]:
# Predicted (integer-encoded) categories for the held-out rows from the Gini tree.
y_pred = tree_gini.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
       0, 5, 5, 0, 5, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 5, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2,
       2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
       0, 5, 0, 0, 2, 0, 2, 0, 0])

In [123]:
# Predicted (integer-encoded) categories for the held-out rows from the entropy tree.
# NOTE: these predictions are not scored anywhere in the visible cells.
y_pred_entropy = tree_entropy.predict(X_test)
y_pred_entropy

array([0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0, 0, 0, 0, 0, 0,
       0, 5, 5, 0, 5, 0, 5, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 2, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 2, 2, 5, 0, 0, 0, 5, 0,
       0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
       2, 5, 0, 0, 0, 0, 0, 0, 0])

In [124]:
# Fraction of held-out rows whose category the Gini tree predicted correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6859903381642513