In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_validate,cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics, svm
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans, DBSCAN
from sklearn.feature_selection import chi2, SelectKBest, SelectFdr
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram, linkage
from rake_nltk import Rake

Import data as data frame

In [41]:
data = pd.read_csv('problem_pdsm_simpleclean.csv')
data

Unnamed: 0,Number,Priority,Business service,Category,Company,Created,Created Time,Created by,Impact,Major Problem,...,Urgency,Short description,Keywords,Incidents Count,Assignment group,Duration Max,Duration Mean,Duration Min,Duration Range,Has Parent
0,63212,5,Server,Hardware,FedEx Services,2019-02-13 08:47:23,morning,5163721,3,True,...,3,CANCELLED - Lync issue that cleared with no kn...,"['short duration', 'lync issue', 'know interve...",1,Other,3346,3.346000e+03,3346,0,True
1,63210,5,Computer,Hardware,FedEx Services,2019-02-13 06:51:16,morning,5163721,3,True,...,3,Ground locations are no longer reporting issue...,['longer reporting issues seeing vsa records w...,1,Other,7250,7.250000e+03,7250,0,True
2,63205,5,Switch,Network,FedEx Services,2019-02-12 22:00:36,night,3667964,3,True,...,3,CANCELLED - local power outage .,['local power outage'],1,FXS_SPT_GLB_NOCL1,6856,6.856000e+03,6856,0,True
3,63204,5,Server,Hardware,FedEx Services,2019-02-12 21:03:05,night,836059,3,True,...,3,CANCELLED - issue cleared following a DB proce...,"['issue cleared following', 'db process comple...",1,Other,11000,1.100000e+04,11000,0,True
4,63201,5,Switch,Network,FedEx Ground,2019-02-12 07:58:23,morning,5163721,3,True,...,3,Cancelled -Local power issue,['local power issue'],1,FXS_SPT_GLB_NOCL1,3943,3.943000e+03,3943,0,True
5,63200,3,Computer,Application,FedEx Express,2019-02-12 06:07:53,morning,5163721,2,True,...,2,Multiple FXE locations experienced slow respon...,"['ground delivery routes ).', 'hung roads gdr'...",1,Other,12811,1.281100e+04,12811,0,True
6,63195,5,Switch,Network,FedEx Freight Inc.,2019-02-12 04:42:01,night,5163721,3,True,...,3,Cancelled -Local power issue,['local power issue'],1,FXS_SPT_GLB_NOCL2,3895,3.895000e+03,3895,0,True
7,63194,3,Switch,Network,FedEx Ground,2019-02-12 03:16:19,night,3667964,3,True,...,1,Cancelled -Local power issue,['local power issue'],1,FXS_SPT_GLB_NOCL1,2160,2.160000e+03,2160,0,True
8,63187,2,Computer,Hardware,FedEx Express,2019-02-11 22:03:47,night,5305519,2,True,...,1,CANCELLED - TNT issue,['tnt issue'],1,Other,8794,8.794000e+03,8794,0,True
9,63173,5,Scanner,Hardware,FedEx Ground,2019-02-11 14:23:59,afternoon,973921,3,True,...,3,CANCELLED - single site issue,['single site issue'],1,Other,346919,3.469190e+05,346919,0,True


## Set up for modeling

### Split up dependent and independent variables and get dummy variables for categorical features

In [3]:
data = data.drop(['Created', 'Duration Max', 'Duration Min', 'Duration Range',
                  'Short description', 'Keywords'], axis=1)
data.dtypes

Number                 int64
Priority               int64
Business service      object
Category              object
Company               object
Created Time          object
Created by             int64
Impact                 int64
Major Problem           bool
Problem Manager       object
Related Incidents      int64
Type                  object
Urgency                int64
Incidents Count        int64
Assignment group      object
Duration Mean        float64
Has Parent              bool
dtype: object

In [4]:
data = data.set_index('Number')
data.shape

(688, 16)

In [23]:
X = pd.get_dummies(data.drop('Category', axis=1))
X_df = pd.get_dummies(data.drop('Category', axis=1))
feature_list = list(X.columns)
X = X.values
X

array([[5, 5163721, 3, ..., 0, 0, 1],
       [5, 5163721, 3, ..., 0, 0, 1],
       [5, 3667964, 3, ..., 0, 0, 0],
       ...,
       [5, 5305519, 3, ..., 0, 0, 1],
       [5, 483039, 3, ..., 0, 0, 1],
       [5, 572652, 3, ..., 0, 0, 1]], dtype=object)

In [6]:
le = preprocessing.LabelEncoder()
# for col in range(len(X[0])):
#     X[:,col] = le.fit_transform(X[:,col])

In [7]:
y = data['Category'].values
y

array(['Hardware', 'Hardware', 'Network', 'Hardware', 'Network',
       'Application', 'Network', 'Network', 'Hardware', 'Hardware',
       'Hardware', 'Network', 'Hardware', 'Application', 'Network',
       'Network', 'Hardware', 'Network', 'Network', 'Network',
       'Application', 'Application', 'Hardware', 'Hardware', 'Hardware',
       'Application', 'Application', 'Application', 'Hardware',
       'Application', 'Application', 'Application', 'Application',
       'Hardware', 'Application', 'Telephony', 'Network', 'Network',
       'Application', 'Hardware', 'Hardware', 'Network', 'Hardware',
       'Hardware', 'Application', 'Hardware', 'Hardware', 'Application',
       'Application', 'Application', 'Hardware', 'Application', 'Network',
       'Application', 'Hardware', 'Application', 'Hardware',
       'Application', 'Application', 'Hardware', 'Hardware',
       'Application', 'Application', 'Application', 'Hardware',
       'Hardware', 'Hardware', 'Application', 'Application',

In [8]:
labeldict = dict(zip(le.fit_transform(y), y))
# labeldict
y = le.fit_transform(y)
y

array([2, 2, 5, 2, 5, 0, 5, 5, 2, 2, 2, 5, 2, 0, 5, 5, 2, 5, 5, 5, 0, 0,
       2, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 7, 5, 5, 0, 2, 2, 5, 2, 2,
       0, 2, 2, 0, 0, 0, 2, 0, 5, 0, 2, 0, 2, 0, 0, 2, 2, 0, 0, 0, 2, 2,
       2, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 2, 5,
       5, 0, 0, 2, 0, 0, 3, 5, 0, 5, 0, 0, 0, 0, 0, 0, 5, 3, 5, 3, 3, 0,
       2, 0, 0, 0, 2, 0, 5, 3, 5, 0, 0, 0, 6, 3, 0, 0, 3, 2, 0, 2, 2, 0,
       2, 0, 2, 0, 3, 0, 0, 0, 5, 0, 2, 5, 0, 2, 0, 2, 5, 2, 0, 0, 5, 0,
       3, 0, 5, 5, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 2, 3, 0, 0, 0, 0, 2, 2,
       5, 3, 2, 0, 0, 0, 3, 0, 0, 0, 2, 0, 2, 0, 2, 2, 3, 2, 0, 0, 0, 2,
       0, 7, 0, 0, 0, 0, 0, 0, 0, 5, 3, 0, 0, 3, 5, 7, 0, 0, 0, 5, 2, 0,
       0, 0, 0, 0, 0, 5, 0, 0, 4, 2, 0, 5, 0, 5, 2, 0, 0, 6, 6, 7, 0, 0,
       3, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0,
       0, 2, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 2, 0, 0, 6, 0, 4, 0, 0, 0,
       0, 6, 0, 0, 0, 0, 6, 0, 0, 0, 6, 0, 4, 6, 0,

## Model using K-means

In [29]:
model = KMeans(n_clusters=8)
model.fit(X_df)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [None]:
# smallX = X.drop('Created', axis=1)

In [None]:
model = KMeans(n_clusters=8)
model.fit(smallX)

In [30]:
data['kmeans'] = model.labels_

In [33]:
metrics.adjusted_rand_score(data['Category'],data['kmeans'])

-0.02662341361869613

In [34]:
pca = PCA()
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [35]:
pca.explained_variance_ratio_

array([7.72702592e-01, 2.27297408e-01, 7.60421708e-13, 3.87265621e-13,
       2.75892879e-13, 2.61860223e-13, 7.91508927e-14, 6.82742726e-14,
       6.02135331e-14, 5.48786110e-14, 5.31515674e-14, 5.02539575e-14,
       4.49665394e-14, 4.09177693e-14, 3.43518806e-14, 3.12427751e-14,
       2.71722751e-14, 2.46325157e-14, 2.26646733e-14, 2.18352142e-14,
       1.98607886e-14, 1.88604316e-14, 1.68159711e-14, 1.62565198e-14,
       1.51024215e-14, 1.47511190e-14, 1.25082958e-14, 1.24127337e-14,
       1.18378433e-14, 9.53320188e-15, 8.03356200e-15, 7.73231817e-15,
       6.87291262e-15, 6.31965335e-15, 5.54390311e-15, 4.79564213e-15,
       4.63148089e-15, 4.18326853e-15, 3.68869411e-15, 3.55650818e-15,
       3.32181036e-15, 3.19688741e-15, 3.07322192e-15, 2.85820304e-15,
       2.64383023e-15, 2.08172553e-15, 1.87682762e-15, 1.59251896e-15,
       4.73512907e-16, 7.64971724e-33, 7.64971724e-33, 7.64971724e-33,
       7.64971724e-33, 7.64971724e-33, 7.64971724e-33, 8.43623662e-37])

In [36]:
pca = PCA(n_components=2)
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [37]:
pca.explained_variance_ratio_

array([0.77270259, 0.22729741])

In [27]:
transformed = pca.transform(X_df)

ValueError: operands could not be broadcast together with shapes (688,56) (2,) 

In [40]:
pca = PCA(n_components=2)
pca.fit_transform(X_df)

# Dump components relations with features:
pd.DataFrame(pca.components_,columns=X_df.columns,index = ['PC-1','PC-2']).iloc[0]

Priority                                      -3.430419e-09
Created by                                    -9.985766e-01
Impact                                        -2.173450e-08
Major Problem                                 -2.227854e-08
Related Incidents                              4.119736e-09
Urgency                                        1.830408e-08
Incidents Count                                6.950308e-09
Duration Mean                                  5.333714e-02
Has Parent                                    -3.455071e-09
kmeans                                         5.409055e-07
pca                                            2.177902e-07
Business service_Communication Device         -1.729649e-08
Business service_Computer                      6.812673e-09
Business service_Database                      2.353892e-09
Business service_Network                       1.817635e-08
Business service_Other                         4.596233e-09
Business service_Scanner                

In [19]:
km = KMeans(n_clusters=8)
km.fit(transformed)
data['pca'] = km.labels_

In [24]:
metrics.adjusted_rand_score(y,data['pca'])

-0.02662341361869613

### Classify using Decision Tree

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)
tree_gini = DecisionTreeClassifier(criterion='gini',random_state=100, max_depth=5, min_samples_leaf=5)
tree_gini.fit(X_train, y_train)

In [None]:
tree_entropy = DecisionTreeClassifier(criterion='entropy', random_state=100, max_depth=5, min_samples_leaf=5)
tree_entropy.fit(X_train, y_train)

In [None]:
y_pred = tree_gini.predict(X_test)
y_pred

In [None]:
y_pred_entropy = tree_entropy.predict(X_test)
y_pred_entropy

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
accuracy_score(y_test, y_pred_entropy)

In [None]:
rf  = RandomForestClassifier(n_estimators = 1000, random_state=42)
rf.fit(X_train, y_train)

In [None]:
pred = rf.predict(X_test)
pred

In [None]:
accuracy_score(y_test, pred)