# INFN-MILANO-ATLASC_LOCALGROUPDISK

In [None]:
#! pip install plotly --upgrade
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import plotly.express as px
init_notebook_mode()

In [None]:
import numpy as np
import pandas as pd
# WARNING for pandas >=0.17: https://github.com/pydata/pandas/issues/11786
import logging
import datetime
import urllib

from IPython.display import display, HTML
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.rcParams['image.cmap'] = 'coolwarm'  # quite good colormap, should avoid rainbow problems

In [None]:
# TODO: solve this
# NOTE(review): this replaces the stdlib factory for default HTTPS contexts
# with the unverified one, so TLS certificates are no longer checked for HTTPS
# requests made through the standard library in this process. Presumably a
# workaround for the certificate of the host queried in get_data -- confirm,
# and prefer trusting the proper CA bundle instead.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [None]:
# Binary size units expressed in bytes; floats so that divisions by them
# always happen in floating point.
Gb = float(1024 ** 3)
Tb = float(1024 ** 4)

## Import data

In [None]:
def get_data(rse, date, **kwargs):
    """Fetch the consistency-dataset dump of one RSE for one day.

    The request URL is built from `rse` and from `date` rendered as
    ``dd-mm-YYYY``; it is printed and then passed to `to_pandas`, whose
    result is returned. Extra keyword arguments are accepted but unused.
    """
    query = (rse, date.strftime('%d-%m-%Y'))
    url = "https://rucio-hadoop.cern.ch/consistency_datasets?rse=%s&date=%s" % query
    print(url)
    return to_pandas(url)

In [None]:
def to_pandas(filename):
    """Read a tab-separated dataset dump into a DataFrame.

    Parameters
    ----------
    filename
        Anything `pandas.read_csv` accepts: a path, a URL or an open buffer.
        The content must have no header row and ten tab-separated fields.

    Returns
    -------
    pandas.DataFrame
        One row per line, with columns RSE, scope, name, owner, size,
        creation_date, last_accessed_date, rule_id, n_replicas, update_date.
        The three date columns are converted from epoch milliseconds to
        datetimes (missing values become NaT). In `owner`, the accounts
        "panda" and "root" are dropped and duplicates removed, keeping the
        order in which the remaining owners appear.

    Raises
    ------
    Exception
        Whatever reading or converting raises is re-raised unchanged. For
        anything other than an HTTP error a short diagnostic is printed
        first.
    """
    # Imported here so that `urllib.error` is guaranteed to be loaded; a bare
    # `import urllib` does not promise that the submodule is available.
    import urllib.error

    names = (
        "RSE",
        "scope",
        "name",
        "owner",
        "size",
        "creation_date",
        "last_accessed_date",
        "rule_id",
        "n_replicas",
        "update_date",
    )
    date_columns = ("creation_date", "last_accessed_date", "update_date")
    excluded_owners = ("panda", "root")

    def conv(s):
        # Keep first-seen order so the resulting string is deterministic
        # (joining a set would give an arbitrary order between runs).
        kept = []
        for owner in s.split(","):
            if owner not in excluded_owners and owner not in kept:
                kept.append(owner)
        return ",".join(kept)

    try:
        data = pd.read_csv(
            filename,
            sep="\t",
            header=None,
            converters={"owner": conv},
            names=list(names),
        )
        # Convert after reading instead of through read_csv's `date_parser`,
        # which recent pandas versions deprecate.
        for column in date_columns:
            data[column] = pd.to_datetime(data[column], unit="ms")

    except Exception as ex:
        if not isinstance(ex, urllib.error.HTTPError):
            # str(ex) works for every exception type; most have no `.msg`.
            print("cannot parse file from %s: %s" % (filename, ex))

        raise

    return data


In [None]:
# Snapshot under study: the dump of the Milano local group disk taken
# one day before now.
milano_rse = "INFN-MILANO-ATLASC_LOCALGROUPDISK"
yesterday = datetime.datetime.now() + datetime.timedelta(days=-1)
date = yesterday

data = get_data(rse=milano_rse, date=date)

In [None]:
# Second snapshot, taken 200 days before now, for comparison with `data`.
data2 = get_data(
    milano_rse,
    datetime.datetime.now() + datetime.timedelta(days=-200),
)

# Key both snapshots by (RSE, scope, name) so they can be matched row by row.
data, data2 = (
    frame.set_index(["RSE", "scope", "name"]) for frame in (data, data2)
)


In [None]:
# Start from an empty frame with the dump's columns, keyed like the snapshots.
full_df = pd.DataFrame({
    column: []
    for column in (
        "RSE", "scope", "name", "owner", "size", "creation_date",
        "last_accessed_date", "rule_id", "n_replicas", "update_date",
    )
}).set_index(["RSE", "scope", "name"])

# Fold the snapshots in one after the other: rows not yet present are
# appended, rows already present are overwritten with the snapshot's values.
# `index_missing` / `index_common` keep the values computed for the last
# snapshot once the loop is over.
for _snapshot in (data, data2):
    index_missing = _snapshot.index.difference(full_df.index)
    index_common = _snapshot.index.intersection(full_df.index)
    full_df = pd.concat([full_df, _snapshot.loc[index_missing]]).sort_index()
    full_df.loc[index_common] = _snapshot.loc[index_common]

In [None]:
# Rows of `data2` (restricted to `index_common`) whose creation_date differs
# from the one in `data`; only the first two are shown.
data2.loc[index_common].loc[
    (data.loc[index_common] != data2.loc[index_common])['creation_date'].values
].head(2)


In [None]:
# Datasets present in both snapshots.
index_common = data2.index.intersection(data.index)

# Rows of `data` whose creation_date differs from the one in `data2`.
data.loc[index_common].loc[
    (data.loc[index_common] != data2.loc[index_common])['creation_date'].values
]


### Print the first datasets

In [None]:
# Number of rows, then the first five of them.
print(data.shape[0])
data.head(5)

In [None]:
# Dump every row as a JSON record, with dates written in ISO format.
data.to_json(
    path_or_buf="data_all.json",
    orient='records',
    date_format='iso',
)

In [None]:
from pandas.io.pytables import HDFStore

# Save the current snapshot in the HDF5 store under a key derived from its
# date. The context manager closes the file even if the write raises, so a
# failed write does not leave 'store.h5' open.
# NOTE(review): the History cell parses each key with k.split("_")[1] and the
# format '%d%m%Y'; a key built from str(date) contains no underscore, so
# entries written here look unreadable there -- confirm the intended key.
with HDFStore('store.h5') as store:
    store[str(date)] = data
store

In [None]:
# Total size (bytes) of the datasets owned by 'resconi'.
data.loc[data['owner'] == 'resconi', 'size'].sum()

In [None]:
def build_hierarchical_dataframe(df, levels, value_column, color_columns=None):
    """
    Build a hierarchy of levels for Sunburst or Treemap charts.

    Levels are given starting from the bottom to the top of the hierarchy,
    ie the last level corresponds to the root.

    Parameters
    ----------
    df : pandas.DataFrame
        Flat table holding one column per level plus `value_column`.
    levels : sequence of str
        Column names, leaf level first.
    value_column : str
        Column summed within every group.
    color_columns : optional
        Accepted for compatibility; currently unused, the 'color' column of
        the result is left empty.

    Returns
    -------
    pandas.DataFrame
        Columns 'id', 'parent', 'value', 'color'. For each level there is one
        row per group (parent = the next level, or 'total' for the last
        level), followed by a single 'total' row with an empty parent.
    """
    columns = ['id', 'parent', 'value', 'color']
    levels = list(levels)
    trees = []
    for i, level in enumerate(levels):
        # Sum only the value column: summing every column would also try to
        # add up strings and dates, which fails or yields nonsense.
        dfg = df.groupby(levels[i:])[value_column].sum().reset_index()
        if i < len(levels) - 1:
            parent = dfg[levels[i + 1]]
        else:
            parent = 'total'
        trees.append(pd.DataFrame(
            {
                'id': dfg[level],
                'parent': parent,
                'value': dfg[value_column],
                'color': np.nan,
            },
            columns=columns,
        ))
    # The root is added as one row; concatenating a Series here would instead
    # append one row per Series entry under a stray column.
    trees.append(pd.DataFrame(
        [{'id': 'total', 'parent': '', 'value': df[value_column].sum(), 'color': np.nan}],
        columns=columns,
    ))
    return pd.concat(trees, ignore_index=True)

# Size summed per (scope, owner) pair, as a flat table.
df_2d = (
    data.groupby(['scope', 'owner'])['size']
    .sum()
    .sort_index()
    .reset_index()
)

# Sunburst drawn from the explicit id/parent/value table.
fig = px.sunburst(
    data_frame=build_hierarchical_dataframe(
        data.reset_index(), ['scope', 'owner'], 'size'
    ),
    names='id',
    parents='parent',
    values='value',
)
fig.show()

In [None]:
# Same chart built directly from the grouped sizes, owner ring first.
df_2d = (
    data.groupby(['scope', 'owner'])['size']
    .sum()
    .sort_index()
)
fig = px.sunburst(
    df_2d.reset_index(),
    path=['owner', 'scope'],
    values='size',
)
fig.update_layout(margin={'t': 0, 'l': 0, 'r': 0, 'b': 0})

fig.show()


## Total size

In [None]:
# Sum of all dataset sizes, converted from bytes with the Tb constant.
total = data['size'].sum() / Tb
print("total usage = %.2f Tb" % (total,))

## Users sorted by usage

In [None]:
def group_by_owner(data, by='owner'):
    """Aggregate `data` per group of `by`.

    Only the columns actually present in `data` are aggregated: 'owner' is
    counted (and reported as 'ndatasets'), 'life_days', 'age_days' and
    'last_accessed_days' are averaged, and 'size' is summed and divided by
    the module-level `Tb` constant.
    """
    def size_in_tb(sizes):
        return np.sum(sizes) / Tb

    default_actions = dict(
        owner='count',
        life_days='mean',
        age_days='mean',
        last_accessed_days='mean',
        size=size_in_tb,
    )

    actions = {}
    for column in data.columns:
        if column in default_actions:
            actions[column] = default_actions[column]

    return data.groupby(by).agg(actions).rename(columns={'owner': 'ndatasets'})

# Per-owner summary, largest total size first.
group_owner = group_by_owner(data).sort_values(by=['size'], ascending=False)

group_owner

In [None]:
# Per-scope summary, largest total size first.
group_scope = group_by_owner(data, by='scope').sort_values(by=['size'], ascending=False)

group_scope

## Size of the datasets

There are many small datasets

In [None]:
fig, ax = plt.subplots(3, 1, figsize=(20, 7))

# Series.median ignores missing sizes; np.median would return NaN as soon as
# one size is missing (missing sizes are filtered out for the plots below).
median = data['size'].median() / Gb

data_size_not_null_Gb = data[data['size'].notnull()]['size'] / Gb

# Three views of the same distribution: full range on a log axis, the
# 0-10 Gb range, and only the datasets below 0.005 Gb.
data_size_not_null_Gb.hist(bins=np.logspace(-6, 5, 50), ax=ax[0], log=True, histtype='stepfilled')
data_size_not_null_Gb.hist(bins=200, range=(0, 10), ax=ax[1], histtype='stepfilled')
data_size_not_null_Gb[data_size_not_null_Gb < .005].hist(bins=200, ax=ax[2], histtype='stepfilled')
ax[0].set_xscale('log')
for a in ax:
    a.set_xlabel('dataset size [Gb]')


fig.tight_layout()

display((data['size'] / Gb).describe())

print("median dataset = %.2f GB" % (median))
# Mean of a boolean mask = matching rows / all rows (missing sizes count as
# not matching, as before).
print("fraction dataset < 1 GB = %.1f%%" % ((data['size'] / (1024.**3) < 1).mean() * 100.))
print("fraction dataset < 100 MB = %.1f%%" % ((data['size'] / (1024.**2) < 100).mean() * 100.))

# 'RSE' and 'name' are index levels once the frame has been keyed by
# (RSE, scope, name); reset_index turns them back into selectable columns.
sized_datasets = data[data['size'].notnull()].sort_values('size').reset_index()
summary_columns = ['RSE', 'owner', 'name', 'size']

print("5 smallest samples")
display(sized_datasets[:5][summary_columns])
print("5 largest samples")
display(sized_datasets[-5:][summary_columns])


In [None]:
f, axs = plt.subplots(1, 2, figsize=(18, 4))

g = data.groupby('owner')['size']
owners = data['owner'].unique()

# The same stacked, normalised histogram is drawn twice: the left panel with
# a logarithmic y axis, the right one with a linear y axis.
for axis, y_scale in zip(axs, ('log', None)):
    axis.hist(
        [g.get_group(user).values / Gb for user in owners],
        bins=(np.logspace(-5, 3.1, 50)),
        stacked=True,
        fill=True,
        histtype='stepfilled',
        label=tuple(owners),
        density=True,
    )
    axis.legend()
    if y_scale is not None:
        axis.set_yscale(y_scale)
    axis.set_xscale('log')
    axis.set_xlabel('size [Gb]')

plt.show()

### Dataset age (time since creation)

In [None]:
# Show the columns currently available on `data`.
# NOTE(review): the cells below read 'age_days', 'last_accessed_days' and
# 'life_days' (and one reads 'state'); none of these is among the columns
# named in to_pandas and no visible cell creates them -- confirm where they
# are computed before running the following sections.
data.columns

In [None]:
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))

# Left: logarithmically spaced bins; right: 30 equal-width bins.
for a, bins in zip(ax, (np.logspace(1, 4, 30), 30)):
    data['age_days'].hist(bins=bins, ax=a, histtype='stepfilled')
    a.set_xlabel('age [days from today]')
ax[0].set_xscale('log')

print("average age: %d days" % data['age_days'].mean())

In [None]:
# For every owner, list the datasets from the highest to the lowest age_days.
# NOTE(review): 'state' is not among the columns named in to_pandas --
# confirm it exists before running this cell.
for owner, df_owner in data.groupby('owner'):
    print("owner: %s" % owner)
    by_age = df_owner.sort_values(by='age_days', ascending=False)
    display(by_age[['name', 'creation_date', 'state']])

### Dataset last access

In [None]:
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))

# Left: logarithmically spaced bins; right: 30 equal-width bins.
for a, bins in zip(ax, (np.logspace(1, 4, 30), 30)):
    data['last_accessed_days'].hist(bins=bins, ax=a, histtype='stepfilled')
    a.set_xlabel('last accessed [days from today]')
ax[0].set_xscale('log')

print("average last access: %d days" % data['last_accessed_days'].mean())

### Life (time between creation and last access)

In [None]:
# Distribution of 'life_days': logarithmic bins on the left, 50 equal-width
# bins restricted to 0-1000 on the right.
fig, ax = plt.subplots(1, 2, figsize=(20, 7))
data['life_days'].hist(bins=np.logspace(1, 3, 30), ax=ax[0], histtype='stepfilled')
data['life_days'].hist(bins=50, ax=ax[1], range=(0, 1000), histtype='stepfilled')
ax[0].set_xscale('log')
for a in ax:
    a.set_xlabel('last accessed - date creation [days]')

# print is a function in Python 3; the statement form is a syntax error.
print("average life: %d days" % data.life_days.mean())

### Correlations

No correlation between size and age or last access

In [None]:
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))

# Size (log scale) against days since last access, then against age.
for a, x_column in zip(ax, ('last_accessed_days', 'age_days')):
    data.plot(kind='scatter', x=x_column, y='size', ax=a, alpha=0.05)
    a.set_yscale('log')

In [None]:
# Correlate only numeric/boolean columns: recent pandas versions no longer
# drop text and date columns silently in DataFrame.corr and raise instead.
corr = data.select_dtypes(include=['number', 'bool']).corr()
corr

In [None]:
# Heat map of the correlation matrix (NaN shown as 0), colour range -1..1.
fig, ax = plt.subplots()
p = ax.pcolormesh(np.nan_to_num(corr.values), vmin=-1, vmax=1)

# Fix the tick positions (cell centres) first and only then attach the
# labels; labels set before the positions end up on the default ticks.
tick_positions = np.arange(len(corr.index)) + 0.5
ax.set_yticks(tick_positions)
ax.set_xticks(tick_positions)
ax.set_yticklabels(corr.index)
ax.set_xticklabels(corr.index, rotation=90)
plt.colorbar(p)

plt.show()

## User analysis

In [None]:
# Share of the total size per owner: pie on the left, bars on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))
group_owner['size'].plot.pie(autopct='%.0f', ax=ax[0])
group_owner['size'].plot.bar(ax=ax[1])
ax[1].set_ylabel('size [Tb]')
plt.show()

In [None]:
# Number of datasets per owner: pie on the left, bars on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))
group_owner['ndatasets'].plot.pie(autopct='%.0f', ax=ax[0])
group_owner['ndatasets'].plot.bar(ax=ax[1])
ax[1].set_ylabel('# datasets')
plt.show()

### Size per dataset

Some users have huge datasets.

In [None]:
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))

# Mean dataset size per owner, ascending (per-owner totals are in Tb, hence
# the factor 1024 to get Gb).
ave_size = (group_owner['size'] / group_owner['ndatasets'] * 1024).sort_values()

# Same bars twice; the right panel uses a logarithmic y axis.
for a in ax:
    ave_size.plot.bar(ax=a)
ax[1].set_yscale('log')

# Overall mean dataset size, drawn as a horizontal line on both panels.
ave = data['size'].sum() / float(data.shape[0]) / (1024 ** 3)
ax[0].hlines(ave, *ax[0].get_xlim())
ax[1].hlines(ave, *ax[0].get_xlim())
print("mean dataset size = %.2f Gb" % ave)

### Average age

In [None]:
# Mean age per owner, ascending, with the overall mean as a horizontal line.
fig, ax = plt.subplots(figsize=(15, 6))
# Series.sort was removed from pandas; sort_values is its replacement.
grouped_age_days = data.groupby(["owner"])["age_days"].mean().sort_values()
grouped_age_days.plot(kind='bar', ax=ax)
ax.hlines(np.mean(data['age_days']), *ax.get_xlim())
ax.set_ylabel('age [days]')

### Average last access

In [None]:
# Mean days since last access per owner, ascending, with the overall mean as
# a horizontal line.
fig, ax = plt.subplots(figsize=(15, 6))
# Series.sort was removed from pandas; sort_values is its replacement.
grouped_lastacc_days = data.groupby(["owner"])["last_accessed_days"].mean().sort_values()
grouped_lastacc_days.plot(kind='bar', ax=ax)
ax.hlines(np.mean(data['last_accessed_days']), *ax.get_xlim())
ax.set_ylabel('last accessed [days]')

### Average life

In [None]:
# Mean life per owner, ascending, with the overall mean as a horizontal line.
fig, ax = plt.subplots(figsize=(15, 6))
# Series.sort was removed from pandas; sort_values is its replacement.
grouped_life_days = data.groupby(["owner"])["life_days"].mean().sort_values()
grouped_life_days.plot(kind='bar', ax=ax)
ax.hlines(np.mean(data['life_days']), *ax.get_xlim())
ax.set_ylabel('last acc - age [days]')

## History

In [None]:
import cufflinks as cf
cf.go_offline()

In [None]:
from pandas.io.pytables import HDFStore

# Load every snapshot saved in the store and stack them into one frame
# indexed by (timestamp, owner). The timestamp comes from the store key: the
# part after the first underscore is parsed with the format '%d%m%Y'.
# Entries that cannot be read or whose key does not parse are reported and
# skipped. The context manager closes the store even if iterating it fails.
frames = []
with HDFStore('store.h5') as store:
    print(store)
    for k in store.keys():
        try:
            d = store.get(k)
            d['timestamp'] = pd.to_datetime(k.split("_")[1], format='%d%m%Y')
            frames.append(d)
        except Exception as e:
            print("Problem reading", k)
            print(e)
print(len(frames))
data = pd.concat(frames)
data = data.set_index(['timestamp', 'owner'])

In [None]:
import json
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that also serialises NumPy arrays and NumPy scalars."""

    def default(self, obj):
        """Return a JSON-serialisable stand-in for `obj`.

        NumPy arrays become (nested) lists via ``tolist()`` and NumPy scalars
        become the matching Python scalar via ``item()``. Any other object is
        handed to the base class, which raises ``TypeError``.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            return obj.item()
        # Delegate to the base implementation so unsupported objects raise
        # the standard "not JSON serializable" TypeError.
        return super().default(obj)

In [None]:
# Size per owner over time as a stacked area chart; each trace gets a hover
# text built from its values.
data_to_plot = data['size'].unstack().fillna(0)
dataplot = data_to_plot.iplot(kind='area', fill=True, asFigure=True)
for d in dataplot['data']:
    d['hoverinfo'] = 'text+x+name'
    d['text'] = ["%.2f Gb" % xx for xx in data_to_plot[d['name']].tolist()]
data.iplot(data=dataplot['data'])

# Write the traces to disk; the with-block closes the file even if the
# serialisation fails. (json.dump returns None, so nothing is kept from it.)
with open('data.json', 'w') as f_json_data:
    json.dump(dataplot['data'], f_json_data, cls=NumpyEncoder)

In [None]:
# Size history (missing entries as 0), transposed, rendered as JSON text in
# 'split' orientation with ISO dates.
(
    data['size']
    .unstack()
    .fillna(0)
    .transpose()
    .to_json(orient='split', date_format='iso')
)

In [None]:
# Size per owner over time as filled scatter traces.
data_to_plot = data['size'].unstack().fillna(0)
dataplot = data_to_plot.iplot(kind='scatter', fill=True, asFigure=True)
data.iplot(data=dataplot['data'])

# Write the traces to disk; the with-block closes the file even if the
# serialisation fails. (json.dump returns None, so nothing is kept from it.)
with open('data_scatter.json', 'w') as f_json_data:
    json.dump(dataplot['data'], f_json_data, cls=NumpyEncoder)


In [None]:
# Keep only the rows of the most recent timestamp and draw their sizes as a
# pie chart, one slice per owner.
timestamps = data.index.get_level_values('timestamp')
latest_data = data[timestamps == timestamps.max()]
dataplot = latest_data.reset_index().iplot(kind='pie', labels='owner', values='size', hole='0.4', sort=True, textinfo='percent', asFigure=True)
dataplot['data'][0]['text'] = ["%.2f Gb" % xx for xx in dataplot['data'][0]['values']]
dataplot['data'][0]['hoverinfo'] = 'text+label'
data.iplot(data=dataplot['data'])

# Write the traces to disk; the with-block closes the file even if the
# serialisation fails. (json.dump returns None, so nothing is kept from it.)
with open('data_pie.json', 'w') as f_json_data:
    json.dump(dataplot['data'], f_json_data, cls=NumpyEncoder)