In [81]:
# Copyright (c) 2018 CA.  All rights reserved.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

%matplotlib notebook 

# Load the x'07' log records from the HDF5 store into a DataFrame.
# The store is opened read-only inside a context manager so the underlying
# file handle is released as soon as the frame has been read (also when the
# 'df' key lookup fails), and so a missing file raises instead of being
# silently created by the default append mode.
with pd.HDFStore('hdf5/logs07_only.h5', mode='r') as hdf:
    log07 = hdf['df']

In [149]:
# Derive total VSAM and total OSAM I/O per record (reads + writes).
log07['totvsamio'] = log07['vsamrio'].add(log07['vsamwio'])
log07['totosamio'] = log07['osamrio'].add(log07['osamwio'])

# Boolean masks: did the record perform any VSAM / any OSAM I/O?
uses_vsam = log07['totvsamio'].gt(0)
uses_osam = log07['totosamio'].gt(0)

# Encode the access method as a numeric code for later calculations:
#   0 = neither, 1 = VSAM only, 2 = OSAM only, 3 = both VSAM and OSAM.
# Assignment order matters: the "both" case is written last so it overrides
# the single-method codes set just before it.
log07['access'] = 0
log07.loc[uses_vsam, 'access'] = 1
log07.loc[uses_osam, 'access'] = 2
log07.loc[uses_vsam & uses_osam, 'access'] = 3

# Keep only DLI applications, i.e. records with at least one DLI call.
dliapps = log07.loc[log07['dlicnt'].gt(0)]

# Frequency of each access code among the DLI applications.
dliapps['access'].value_counts()

In [150]:
# Scatter matrix of the numeric features, coloured by access code.
from matplotlib import cm

# Feature columns, cast to floating point.
# NOTE: the builtin `float` is used instead of `np.float`; the latter was only
# an alias for the builtin, was deprecated in NumPy 1.20 and removed in 1.24,
# where it raises AttributeError. The resulting dtype (float64) is the same.
X = dliapps[['extime', 'dlicnt', 'totio', 'lktime']].astype(float)

# Access code per row; used as the point colour here and as the class label
# by the train/test split further down.
y = dliapps['access']

scatter = pd.plotting.scatter_matrix(
    X,
    c=y,
    marker='o',
    s=20,
    hist_kwds={'bins': 25},
    figsize=(9, 9),
)

In [152]:
# 3-D scatter of lktime (x) vs. extime (y) vs. dlicnt (z), coloured by `y`.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')

# Label the axes with the column each one shows.
ax.set_xlabel('lktime')
ax.set_ylabel('extime')
ax.set_zlabel('dlicnt')

# Trailing semicolon suppresses the collection repr in the cell output.
ax.scatter(X['lktime'], X['extime'], X['dlicnt'], c=y);

In [153]:
# Show the first DLI application whose lktime equals the column maximum.
peak_lktime = dliapps['lktime'].max()
dliapps.loc[dliapps['lktime'].eq(peak_lktime)].iloc[0]

In [131]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing (this is also the library default when
# neither size is given); the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [141]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier using the 5 closest training samples.
knn = KNeighborsClassifier(n_neighbors=5)

In [154]:
# Fit the classifier on the training portion of the split.
knn.fit(X=X_train, y=y_train)

In [143]:
# Mean accuracy of the fitted classifier on the held-out test portion.
knn.score(X=X_test, y=y_test)

0.8111587982832618