In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_validate,cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics, svm
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import seaborn
from matplotlib import pyplot
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN
from sklearn.feature_selection import chi2, SelectKBest, SelectFdr
from sklearn.decomposition import PCA
from rake_nltk import Rake

Import data as data frame

In [2]:
# dataPath = "/mnt/c/Users/LJUDY/OneDrive/MyStuff/Work/FedEx/Shift Lead/Clustering Project/"
simplefile = "problems_cleaned.csv"

data = pd.read_csv(simplefile)
data.head()

Unnamed: 0,Number,Assignment group,Business duration,Business service,Category,Company,Created,Created Time,Created by,Duration Max,...,Major Problem,Opened by,Priority,Problem Manager,Reassignment count,Related Incidents,Short description,Type,Updates,Urgency
0,62159,FSC_ITL3O2STIBCO,0,Computer,Application,Other,2018-12-03 12:44:49,afternoon,Other,622772,...,False,Naveen Kumar,5,Naveen Kumar,1,15,FSC- EIB- A validation error occurred for aler...,Standard,19,3
1,60146,Other,1084035,Computer,Application,FedEx Services,2018-06-28 19:56:05,evening,ID3667964,4636059,...,True,D'Zundra Green,5,Mark Duncan,2,10,Multiple freight locations experienced issues ...,ITCC,32,3
2,60758,FSC_ITL3O2STIBCO,0,Computer,Application,Other,2018-07-13 15:17:09,afternoon,ID5069733,2390,...,False,Admin-Arturo Reyes,5,Naveen Kumar,0,10,FSC - TIBCO : RC672 - Tibco Alert - Failed to...,Standard,33,3
3,60985,Other,576000,Server,Application,FedEx Services,2018-07-15 18:14:43,evening,ID5305519,2406878,...,True,Ashish Bisht,5,Christophe Gurley,2,9,FedEx Office center users at multiple centers ...,ITCC,24,3
4,60147,Other,562179,Network,Application,FedEx Services,2018-06-29 07:15:36,morning,Other,2197479,...,True,Brad Moore,5,Michael Kennemer,2,6,"Retail Phones, Payment switch and corporate pa...",ITCC,27,3


## Set up for modeling

### Split up dependent and independent variables

In [None]:
y = data['Category']
x = data.drop('Category', axis=1)

### Dummy variables for categories

In [None]:
x = pd.get_dummies(x, columns=['Business service', 'Company', 'Problem Manager', 'Type'])
x.head()

## Model using K-means

In [18]:
model = KMeans(n_clusters=9)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=9, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [None]:
smallX = x.drop('Created', axis=1)

In [None]:
model = KMeans(n_clusters=8)
model.fit(smallX)

In [19]:
data['kmeans'] = model.labels_

In [32]:
model.labels_

array([2, 1, 0, ..., 0, 0, 0], dtype=int32)

In [33]:
metrics.adjusted_rand_score(y, model.labels_)

-0.012224344225801903

In [23]:
pca = PCA()
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [24]:
pca.explained_variance_ratio_

array([9.74904931e-01, 2.28033004e-02, 6.70559680e-04, 5.67028826e-04,
       3.91984869e-04, 2.79413023e-04, 2.08646332e-04, 1.20456973e-04,
       1.78493847e-05, 1.27163270e-05, 8.50393994e-06, 6.36428280e-06,
       3.36945110e-06, 2.43436435e-06, 1.58157306e-06, 6.12045556e-07,
       2.47343094e-07, 8.02892330e-36])

In [None]:
pca = PCA(n_components=2)
pca.fit(smallX)

In [None]:
pca.explained_variance_ratio_

In [None]:
transformed = pca.transform(smallX)

In [None]:
km = KMeans(n_clusters=8)
km.fit(transformed)
data['pca'] = km.labels_

In [None]:
metrics.adjusted_rand_score(data['Category'],data['kmeans'])

### Classify using Decision Tree

In [3]:
data = data.drop(['Created', 'Duration Max', 'Duration Min', 'Duration Range',
                  'Short description', 'Keywords - Short Desc'], axis=1)
data.dtypes

Number                  int64
Assignment group       object
Business duration       int64
Business service       object
Category               object
Company                object
Created Time           object
Created by             object
Duration Mean         float64
Has Parent               bool
Impact                  int64
Major Problem            bool
Opened by              object
Priority                int64
Problem Manager        object
Reassignment count      int64
Related Incidents       int64
Type                   object
Updates                 int64
Urgency                 int64
dtype: object

In [4]:
for col in data:
    if data[col].dtype == 'object':
        data[col] = data[col].astype('category')

In [5]:
data = data.drop('Number', axis=1)

In [6]:
data.to_csv('problems_preprocessed.csv', index=False)

In [7]:
data = data.set_index('Number')

KeyError: 'Number'

In [8]:
X = data.drop('Category', axis=1).values
X

array([['FSC_ITL3O2STIBCO', 0, 'Computer', ..., 'Standard', 19, 3],
       ['Other', 1084035, 'Computer', ..., 'ITCC', 32, 3],
       ['FSC_ITL3O2STIBCO', 0, 'Computer', ..., 'Standard', 33, 3],
       ...,
       ['FXO_SDESK_GLB_ProblemMgmt', 0, 'Other', ..., 'Standard', 11, 3],
       ['FXS_EIS_GLB_ProblemMgmt', 0, 'Other', ..., 'ITCC', 7, 3],
       ['Other', 0, 'Database', ..., 'Standard', 1, 3]], dtype=object)

In [9]:
le = preprocessing.LabelEncoder()
for col in range(len(X[0])):
    X[:,col] = le.fit_transform(X[:,col])

In [10]:
y = data['Category'].values
y

[Application, Application, Application, Application, Application, ..., Application, Application, Application, Application, Application]
Length: 1857
Categories (9, object): [Application, Environment, Hardware, Infrastructure, ..., Network, Other, Security, Telephony]

In [11]:
y = le.fit_transform(y)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)
tree_gini = DecisionTreeClassifier(criterion='gini',random_state=100, max_depth=5, min_samples_leaf=5)
tree_gini.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')

In [13]:
tree_entropy = DecisionTreeClassifier(criterion='entropy', random_state=100, max_depth=5, min_samples_leaf=5)
tree_entropy.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')

In [14]:
y_pred = tree_gini.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 2, 0, 0,
       2, 0, 0, 0, 0, 0, 0, 0, 6, 6, 2, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 6, 0, 2, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 6, 0, 6, 0, 0, 0, 0, 0, 6, 6, 2, 0, 0, 6, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 6, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 2, 6, 0, 0,
       0, 0, 0, 0, 0, 0, 6, 0, 6, 0, 2, 0, 0, 0, 2, 0, 6, 6, 0, 2, 0, 0,
       0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 6, 2, 0, 0, 6, 0, 0, 0,
       0, 0, 0, 0, 6, 0, 2, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 6, 0,
       0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 6, 0, 0, 0, 0,
       0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 6, 0, 6, 0, 0,

In [15]:
y_pred_entropy = tree_entropy.predict(X_test)
y_pred_entropy

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 2, 0, 0,
       2, 0, 0, 2, 0, 0, 0, 2, 6, 6, 2, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 6, 0, 6, 0, 0, 0, 0, 0, 0, 6, 2, 0, 0, 6, 2, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 6, 2, 0, 0, 6, 6, 0, 0, 2, 0, 0, 6, 0, 0, 0, 0, 0, 0,
       0, 6, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 2, 6, 2, 0,
       0, 0, 2, 0, 0, 0, 6, 0, 6, 0, 2, 0, 0, 0, 2, 0, 6, 6, 0, 2, 0, 0,
       0, 0, 2, 0, 0, 0, 0, 6, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0,
       0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 6, 0, 2, 2, 0, 0, 0, 0, 6, 0, 0, 2, 6, 0, 0, 0,
       0, 0, 0, 0, 6, 0, 2, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 6, 0,
       2, 0, 2, 0, 6, 0, 0, 2, 2, 0, 0, 0, 6, 0, 0, 0, 0, 6, 0, 0, 0, 0,
       0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 6, 0, 6, 0, 0,

In [16]:
accuracy_score(y_test, y_pred)

0.7078853046594982

In [17]:
accuracy_score(y_test, y_pred_entropy)

0.7043010752688172