In [None]:
import numpy as np
import pandas as pd
# import json
# from pathlib import Path
import re
import datetime
from datetime import date

## Load log data and tag data

In [None]:
# Read the two input tables from disk: the historic log data and the tag data.
log_csv_path = './dataframes/historicLogData.csv'
tag_csv_path = './dataframes/tagData.csv'
logDF = pd.read_csv(log_csv_path)
tagDF = pd.read_csv(tag_csv_path)

In [None]:
# Replace every missing value in the log frame with 0.
# NOTE(review): this zero-fills all columns, not only the usage metrics, and any
# missing usage sample will count as 0 in the means computed later — confirm
# that "missing" really means "no usage" for this data.
logDF = logDF.fillna(value=0)

In [None]:
# Preview the first five rows of the log data.
logDF.head(n=5)

In [None]:
# Preview the first five rows of the tag data.
tagDF.head(n=5)

## Prepare data for the aggregation
#### (extract necessary date info)

In [None]:
# Derive the calendar date of every sample from its `timestamp` column.
# Bracket assignment is required: `logDF.date = ...` only sets a Python
# attribute (pandas emits a UserWarning) when no `date` column exists yet, and
# the later groupby on 'date' would then fail with a KeyError.
# NOTE(review): `date.fromtimestamp` interprets the value as POSIX seconds in
# the machine's local time zone — confirm that matches how the logs were taken.
logDF['date'] = logDF['timestamp'].apply(datetime.date.fromtimestamp)

In [None]:
# Sanity check: inspect the Python type of one converted date value (label 1).
sample_date = logDF.date[1]
type(sample_date)

In [None]:
# isocalendar() returns the tuple (ISO year, ISO week number, ISO weekday);
# e.g. inspect logDF.calendar[1].
logDF['calendar'] = logDF.date.apply(lambda d: d.isocalendar())
logDF['week'] = logDF.calendar.apply(lambda iso: iso[1])
# The month is NOT part of the ISO calendar tuple: index 2 is the weekday (1-7),
# so reading it from there made the "monthly" averages per-weekday averages.
# Take the month (1-12) from the date itself instead.
logDF['month'] = logDF.date.apply(lambda d: d.month)
# NOTE(review): `week` and `month` carry no year, so if the logs span more than
# one year the same week/month number from different years is pooled together.

logDF.head()

In [None]:
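# Earlier attempt, kept for reference only; the per-instance CPU maximum is
# computed as `max_cpu` in the aggregation further down.
# logDF['max_cpu']=(logDF.groupby('instance', as_index=False).agg({"usage_cpu": "max"}))['usage_cpu']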

## Maximum usage and number of samples above x % over the whole time period

In [None]:
# Per-instance summary statistics, laid out as
# {source column: {output column name: aggregation}}.
# The threshold counters use the vectorized Series `.sum()` instead of the
# Python builtin `sum`, which would iterate over the group element by element.
aggregations = {
    'usage_cpu': {
        'overall_avg': 'mean',
        'max_cpu': 'max',
        # number of samples whose CPU usage is above 80
        'n_over80': lambda x: (x > 80).sum(),
        # percentage of samples whose CPU usage is above 40
        'p_over40': lambda x: (x > 40).sum() / len(x) * 100,
    },
    'usage_nwin': {
        'max_nwin': 'max',
    },
    'usage_nwout': {
        'max_nwout': 'max',
    },
}

In [None]:
# Build one row of summary statistics per instance.
# The nested {column: {name: func}} "renaming" form of `.agg` was deprecated in
# pandas 0.20 and raises SpecificationError since pandas 1.0. Flatten the spec
# into named aggregations (pandas >= 0.25) instead; these produce flat column
# names directly, so no droplevel/rename clean-up is needed afterwards.
named_aggs = {
    out_name: pd.NamedAgg(column=src_col, aggfunc=func)
    for src_col, spec in aggregations.items()
    for out_name, func in spec.items()
}
# as_index=False keeps `instance` as a regular column rather than the index.
grouped = logDF.groupby('instance', as_index=False).agg(**named_aggs)
grouped

## Daily averages

In [None]:
# Step 1 of the daily average: mean of each usage metric per (instance, date).
per_day_spec = dict.fromkeys(['usage_cpu', 'usage_nwin', 'usage_nwout'], 'mean')
by_instance_and_date = logDF.groupby(['instance', 'date'], as_index=False)
grouped2 = by_instance_and_date.agg(per_day_spec)
grouped2.head()

In [None]:
# Step 2: per instance, the daily average is the mean of its per-day means.
daily_avg_spec = dict.fromkeys(['usage_cpu', 'usage_nwin', 'usage_nwout'], 'mean')
by_instance_daily = grouped2.groupby(['instance'], as_index=False)
grouped3 = by_instance_daily.agg(daily_avg_spec)

In [None]:
# Relabel the aggregated columns as daily averages (positional: key first, then metrics).
daily_avg_labels = [
    'instance',
    'cpu_daily_avg',
    'nwin_daily_avg',
    'nwout_daily_avg',
]
grouped3.columns = daily_avg_labels
grouped3.head()

## Weekly averages

In [None]:
# Step 1 of the weekly average: mean of each usage metric per (instance, `week` value).
per_week_spec = dict.fromkeys(['usage_cpu', 'usage_nwin', 'usage_nwout'], 'mean')
by_instance_and_week = logDF.groupby(['instance', 'week'], as_index=False)
grouped4 = by_instance_and_week.agg(per_week_spec)
grouped4.head()

In [None]:
# Step 2: per instance, the weekly average is the mean of its per-week means.
weekly_avg_spec = dict.fromkeys(['usage_cpu', 'usage_nwin', 'usage_nwout'], 'mean')
by_instance_weekly = grouped4.groupby(['instance'], as_index=False)
grouped5 = by_instance_weekly.agg(weekly_avg_spec)

In [None]:
# Relabel the aggregated columns as weekly averages (positional: key first, then metrics).
weekly_avg_labels = [
    'instance',
    'cpu_weekly_avg',
    'nwin_weekly_avg',
    'nwout_weekly_avg',
]
grouped5.columns = weekly_avg_labels
grouped5.head()

## Monthly averages

In [None]:
# Step 1 of the monthly average: mean of each usage metric per (instance, `month` value).
per_month_spec = dict.fromkeys(['usage_cpu', 'usage_nwin', 'usage_nwout'], 'mean')
by_instance_and_month = logDF.groupby(['instance', 'month'], as_index=False)
grouped6 = by_instance_and_month.agg(per_month_spec)
grouped6.head()

In [None]:
# Step 2: per instance, the monthly average is the mean of its per-`month` means.
monthly_avg_spec = dict.fromkeys(['usage_cpu', 'usage_nwin', 'usage_nwout'], 'mean')
by_instance_monthly = grouped6.groupby(['instance'], as_index=False)
grouped7 = by_instance_monthly.agg(monthly_avg_spec)

In [None]:
# Relabel the aggregated columns as monthly averages (positional: key first, then metrics).
monthly_avg_labels = [
    'instance',
    'cpu_monthly_avg',
    'nwin_monthly_avg',
    'nwout_monthly_avg',
]
grouped7.columns = monthly_avg_labels
grouped7.head()

## Merge log data with new statistics

In [None]:
# Combine the overall statistics with the daily, weekly and monthly averages,
# joining one table at a time on `instance` (inner join).
logs = grouped
for period_stats in (grouped3, grouped5, grouped7):
    logs = pd.merge(logs, period_stats, how='inner', on=['instance'])
logs.head()


In [None]:
# Look at the raw log frame again (first five rows) for comparison with the statistics.
logDF.head(n=5)

## Merge log statistics with tag data

In [None]:
# Attach the per-instance log statistics to the tag data by matching
# tagDF.Host against logs.instance; the inner join keeps only matched rows.
allData = tagDF.merge(logs, how='inner', left_on='Host', right_on='instance')
allData.head()

In [None]:
# `instance` duplicates `Host` after the merge, so drop it (mutates allData in place).
allData.drop(labels=['instance'], axis='columns', inplace=True)

In [None]:
# Admin / non-admin indicator (factor variable): 1 when the host name
# contains the substring 'adm', otherwise 0.
allData['adm'] = allData['Host'].apply(lambda host: int('adm' in host))

In [None]:
# Preview the first five rows of the merged dataset, including the new `adm` column.
allData.head(n=5)

In [None]:
# Show, for every host, the percentage of samples above 40 % CPU next to its `type`.
columns_of_interest = ['Host', 'p_over40', 'type']
allData[columns_of_interest]

In [None]:
# Summary statistics of the numeric columns in the merged dataset.
summary_stats = allData.describe()
summary_stats

In [None]:
# Write the merged dataset to disk as a comma-separated UTF-8 file, without the row index.
allData.to_csv(
    './dataframes/allData.csv',
    sep=',',
    index=False,
    encoding="utf-8",
)