# Setup

Let's import some useful libraries and configure the basic parameters.

Then, we need to import the CSV files into datasets.

In [None]:
import pandas as pd  # to create the datasets
import matplotlib.pyplot as plt  # to plot graphics

In [None]:
# Defining the default options for our plots: inline rendering and an (18, 6) default figure size
%matplotlib inline
plt.rcParams['figure.figsize'] = (18,6)

Importing the CSV files into DataFrames and checking the first lines:

In [None]:
# Read the vmstat capture from the working directory and preview its first five rows.
vmstat = pd.read_csv('./vmstat.csv')
vmstat.head(n=5)

In [None]:
# Read the pidstat capture from the working directory and preview its first five rows.
pidstat = pd.read_csv('./pidstat.csv')
pidstat.head(n=5)

# Exploring the datasets

We have to take a look at both datasets to identify possible missing values, import errors or other strange behaviors, and to understand each feature.

The pidstat dataset has a Time column in Unix Epoch format. It needs to be converted to a standard date-time representation.

In [None]:
# Report the (rows, columns) shape of each dataset.
# The frames are paired with their display names explicitly instead of being
# resolved through eval(), so no string is ever executed as code.
print('Datasets Shapes\n' + '-' * 20)
for ds, frame in (('pidstat', pidstat), ('vmstat', vmstat)):
    print(ds, frame.shape)

In [None]:
# Combine the separate `date` and `time` columns into one timezone-aware timestamp.
#
# The wall-clock readings are attached to America/Sao_Paulo directly. The previous
# approach (localize as UTC -> convert to Sao Paulo -> add a fixed 3 hours) only
# yields the same result while the zone offset is exactly -03:00; localizing in
# place is correct for any offset.
# NOTE(review): this assumes vmstat logged local Sao Paulo wall-clock time, which
# the former +3h compensation implies - confirm against the capture host.
# NOTE(review): tz_localize raises on ambiguous/nonexistent local times (DST
# transitions); pass `ambiguous=` / `nonexistent=` if the capture spans one.
vmstat['datetime'] = pd.to_datetime(vmstat['date'].astype(str) + ' ' + vmstat['time'])
vmstat['datetime'] = vmstat['datetime'].dt.tz_localize('America/Sao_Paulo')
print(vmstat['datetime'].dtypes)
vmstat.head()

In [None]:
# Turn the epoch-second `Time` values into timestamps, mark them as UTC instants,
# and express them in the America/Sao_Paulo zone.
# NOTE: the column is overwritten in place, so this cell is meant to run once per load.
pidstat['Time'] = (
    pd.to_datetime(pidstat['Time'], unit='s', origin='unix')
    .dt.tz_localize('UTC')
    .dt.tz_convert('America/Sao_Paulo')
)
print(pidstat['Time'].dtypes)
pidstat.head()

# Studying Pidstat

In [None]:
# Summary statistics of the Command column
pidstat['Command'].describe()

In [None]:
# Top 15 most frequent commands.
# value_counts() is already sorted by descending count, so head(15) takes the
# 15 most frequent entries without indexing the Series with a 1-tuple (`[:15,]`).
pidstat['Command'].value_counts().head(15)

In [None]:
# Which commands are the most intensive in kernel space?
# Average the kernel CPU usage (%system) per command, keep the five highest
# averages, print them and draw them as a bar chart.
g_pidstat = pidstat.groupby('Command')

top5_kernel = g_pidstat['%system'].mean().sort_values(ascending=False).head(5)
print(top5_kernel)

fig, ax = plt.subplots()
# Bar positions are plain integers sized from the data: `pd.np` is gone in
# pandas >= 2.0, and a hard-coded 5 would mismatch the bar heights whenever
# fewer than five distinct commands exist.
x_pos = list(range(len(top5_kernel)))
ax.bar(x_pos, top5_kernel.values)
ax.set_xticks(x_pos)
ax.set_xticklabels(list(top5_kernel.index))
ax.set_ylabel('Mean %system')
ax.set_title('Top 5 commands by average kernel CPU usage')
plt.show()

In [None]:
# And the top 5 processes consuming resources in user space:
# average the %usr column per command (reusing the `g_pidstat` groupby built in
# an earlier cell), keep the five highest averages, print and plot them.

top5_user = g_pidstat['%usr'].mean().sort_values(ascending=False).head(5)
print(top5_user)

fig, ax = plt.subplots()
# Bar positions are plain integers sized from the data: `pd.np` is gone in
# pandas >= 2.0, and a hard-coded 5 would mismatch the bar heights whenever
# fewer than five distinct commands exist.
x_pos = list(range(len(top5_user)))
ax.bar(x_pos, top5_user.values)
ax.set_xticks(x_pos)
ax.set_xticklabels(list(top5_user.index))
ax.set_ylabel('Mean %usr')
ax.set_title('Top 5 commands by average user CPU usage')
plt.show()

# Studying Vmstat

In [None]:
# Quick reminder of the available vmstat features (first five rows)
vmstat.head(n=5)

In [None]:
# Take a closer look at disk I/O: isolate the read/write counters together with
# the timestamp column, then draw both counters on one chart.
io_info = vmstat.loc[:, ['dsk_read', 'dsk_write', 'datetime']]
n_rows = len(io_info)  # not used in this cell; kept in case other cells reference it

fig, ax = plt.subplots()

# Each line gets an explicit label - without labels ax.legend() has nothing to
# display and matplotlib only emits a warning.
ax.plot(io_info['dsk_write'], color='darkred', label='dsk_write')
ax.plot(io_info['dsk_read'], color='blue', alpha=0.5, label='dsk_read')

# The Series are plotted against their index, so the x axis is the row position.
ax.set_xlabel('Sample (row index)')
ax.set_title('Disk reads and writes')
ax.legend()

plt.show()

# Cross Data Checking

The last graph shows some peaks in reads and writes.

It would be a good idea to verify the times at which they occurred and look up the processes that were running.

To accomplish this task we will need to compare data in two different datasets.

In [None]:
# Disk I/O peaks: order the samples by each counter (largest first) and keep
# the first five rows of each ordering.
top_5_read = io_info.sort_values('dsk_read', ascending=False).iloc[:5]
top_5_write = io_info.sort_values('dsk_write', ascending=False).iloc[:5]
print(top_5_read, '\n\n', top_5_write)

In [None]:
# Keep only the pidstat rows whose timestamp equals one of the peak timestamps.
reads = pidstat[pidstat['Time'].isin(top_5_read['datetime'])]
writes = pidstat[pidstat['Time'].isin(top_5_write['datetime'])]

In [None]:
# Processes seen at the read peaks, ordered by %wait and then %CPU (descending); first five
reads.sort_values(['%wait', '%CPU'], ascending=False).head(5)

In [None]:
# Processes seen at the write peaks, ordered by %wait and then %CPU (descending); first five
writes.sort_values(['%wait', '%CPU'], ascending=False).head(5)