In [1]:
import pandas as pd
from functools import reduce
import os

In [2]:
class Processing():
  """Turn one multi-sheet metrics workbook into a single processed DataFrame.

  Each sheet is resampled to one-minute means, the sheets are inner-joined
  on their shared timestamp column, and the result is passed through a
  fixed chain of column transforms (see ``run``).

  Every transform method returns a new DataFrame and leaves its argument
  untouched, so re-running a notebook cell on the same input is safe.
  """

  # Raw workbook headers -> short names used by the rest of the pipeline.
  # The backslash in the instance names is written as an escaped "\\" so the
  # literals stay valid Python; the runtime strings still contain a single
  # backslash, exactly as in the workbook headers.
  # NOTE(review): the LOCAL and ACCT processor-time headers share one target
  # name - presumably they never appear in the same workbook; confirm.
  _COLUMN_NAMES = {
      'Date (CST)': 'DateTime',
      'Batch requests/sec for PRODSQL\\LOCAL': 'Batch requests/sec',
      'User connections for PRODSQL\\LOCAL': 'User connections',
      'SQL Server: processor time for PRODSQL\\LOCAL': 'Processor Time',
      'SQL Server: processor time for PRODSQL\\ACCT': 'Processor Time',
      'SQL Server: free memory for PRODSQL\\LOCAL': 'Free memory',
      'Page reads/sec for PRODSQL\\LOCAL': 'Page reads/sec',
      'Disk avg. read time for prodsql1-vm.wwwoodproducts.com > M: (SQL Data 1)': 'Disk avg. read time > M: Data 1',
      'Disk avg. read time for prodsql1-vm.wwwoodproducts.com > N: (SQL Data 2)': 'Disk avg. read time > N: Data 2',
      'Disk avg. read time for prodsql1-vm.wwwoodproducts.com > I: (SQL Index)': 'Disk avg. read time > I: Index',
      'Disk avg. write time for prodsql1-vm.wwwoodproducts.com > M: (SQL Data 1)': 'Disk avg. write time > M: Data 1',
      'Disk avg. write time for prodsql1-vm.wwwoodproducts.com > N: (SQL Data 2)': 'Disk avg. write time > N: Data 2',
      'Disk avg. write time for prodsql1-vm.wwwoodproducts.com > I: (SQL Index)': 'Disk avg. write time > I: Index',
      'Compilations/batch for PRODSQL\\LOCAL': 'Compilations/batch',
      'Latch wait time for PRODSQL\\LOCAL': 'Latch wait time',
  }

  # Full English day names -> three-letter abbreviations.
  _DAY_ABBREVIATIONS = {
      'Monday': 'Mon',
      'Tuesday': 'Tue',
      'Wednesday': 'Wed',
      'Thursday': 'Thu',
      'Friday': 'Fri',
      'Saturday': 'Sat',
      'Sunday': 'Sun',
  }

  def __init__(self, file_name):
    """Remember the path of the workbook that ``run`` will process."""
    self.file_name = file_name

  @staticmethod
  def _with_leading_column(data, name, values):
    """Return a copy of ``data`` with ``values`` as its first column ``name``.

    ``assign`` works on a copy, so the caller's frame is not modified. The
    set_index/reset_index round trip moves the column to the front and
    leaves a fresh default integer index.
    """
    return data.assign(**{name: values}).set_index(name).reset_index()

  def merged_func(self, file_name):
    """Read every sheet of ``file_name`` and inner-join them on 'Date (UTC)'.

    Each sheet is indexed by 'Date (UTC)', resampled to one-minute bins and
    averaged before merging, so all sheets share the same time grid. Only
    timestamps present in every sheet survive the inner join.

    Returns:
      DataFrame with a 'Date (UTC)' column plus the columns of all sheets.
    """
    # The context manager closes the workbook handle even if a sheet fails
    # to parse.
    with pd.ExcelFile(file_name) as workbook:
      per_sheet = []
      for sheet_name in workbook.sheet_names:
        sheet = pd.read_excel(workbook, sheet_name=sheet_name)
        minute_means = (sheet.set_index('Date (UTC)')
                             .resample('min')
                             .mean()
                             .reset_index())
        per_sheet.append(minute_means)

    return reduce(
        lambda left, right: pd.merge(left, right, on='Date (UTC)', how='inner'),
        per_sheet,
    )

  def to_CST(self, data):
    """Replace 'Date (UTC)' with a leading, timezone-naive 'Date (CST)' column.

    The naive UTC timestamps are localized to UTC, converted to US/Central
    and then stripped of timezone information again.
    """
    central = (data['Date (UTC)'].dt.tz_localize('utc')
                                 .dt.tz_convert('US/Central')
                                 .dt.tz_localize(None))
    converted = self._with_leading_column(data, 'Date (CST)', central)
    return converted.drop(columns=['Date (UTC)'])

  def change_name(self, data):
    """Rename the raw workbook headers to the short pipeline column names.

    Columns without an entry in the rename table are left unchanged.
    """
    return data.rename(columns=self._COLUMN_NAMES)

  def set_time(self, data):
    """Add a leading 'Time' column holding the time of day of 'DateTime'."""
    times = pd.to_datetime(data['DateTime']).dt.time
    return self._with_leading_column(data, 'Time', times)

  def set_date(self, data):
    """Add a leading 'Date' column holding the calendar date of 'DateTime'."""
    dates = pd.to_datetime(data['DateTime']).dt.date
    return self._with_leading_column(data, 'Date', dates)

  def set_day(self, data):
    """Add a leading 'Day_name' column with the abbreviated weekday name."""
    day_names = pd.to_datetime(data['DateTime']).dt.day_name()
    return self._with_leading_column(
        data, 'Day_name', day_names.map(self._DAY_ABBREVIATIONS))

  def to_GB(self, data):
    """Divide 'Free memory' by 1024**3.

    NOTE(review): presumably converts bytes to GiB - confirm the source unit.
    """
    return data.assign(**{'Free memory': data['Free memory'] / (1024**3)})

  def multiply_100(self, data):
    """Multiply 'Compilations/batch' by 100.

    NOTE(review): presumably turns a ratio into a percentage - confirm.
    """
    return data.assign(
        **{'Compilations/batch': data['Compilations/batch'] * 100})

  def combine_read_time(self, data):
    """Add combined read and write time columns for the two data disks.

    The M: and N: data-disk columns are summed into
    'Disk avg. read time data' and 'Disk avg. write time data'.
    """
    return data.assign(**{
        'Disk avg. read time data': (data['Disk avg. read time > M: Data 1']
                                     + data['Disk avg. read time > N: Data 2']),
        'Disk avg. write time data': (data['Disk avg. write time > M: Data 1']
                                      + data['Disk avg. write time > N: Data 2']),
    })

  def run(self):
    """Load ``self.file_name`` and apply every transform in pipeline order.

    Returns:
      The fully processed DataFrame.
    """
    data = self.merged_func(self.file_name)
    # Order matters: renaming must follow the timezone conversion, and the
    # remaining steps rely on the renamed columns.
    pipeline = (
        self.to_CST,
        self.change_name,
        self.set_time,
        self.set_date,
        self.set_day,
        self.to_GB,
        self.multiply_100,
        self.combine_read_time,
    )
    for step in pipeline:
      data = step(data)
    return data


In [3]:
def filter_data(data, start='05:00:00', end='16:00:00'):
    """Keep only the rows whose 'DateTime' time of day lies in [start, end].

    Args:
        data: DataFrame with a datetime-typed 'DateTime' column.
        start: Earliest time of day to keep (inclusive), as a string that
            ``pd.to_datetime`` can parse. Defaults to '05:00:00'.
        end: Latest time of day to keep (inclusive), in the same format.
            Defaults to '16:00:00'.

    Returns:
        The matching rows of ``data`` with their original index; ``data``
        itself is not modified.
    """
    start_time = pd.to_datetime(start).time()
    end_time = pd.to_datetime(end).time()
    # Extract the time of day once and reuse it for both bounds.
    time_of_day = data['DateTime'].dt.time
    in_window = (time_of_day >= start_time) & (time_of_day <= end_time)
    return data[in_window]

In [4]:
def data_processed(A_or_B,filter='full'):
  """Process every workbook of one dataset folder and write a combined CSV.

  Reads each workbook in ``datasets/<A_or_B>``, runs it through
  ``Processing.run``, optionally restricts the rows with ``filter_data``,
  concatenates the results and writes them to
  ``processed_datasets/<filter>data/<A_or_B>.csv``.

  Args:
    A_or_B: Name of the dataset sub-folder (also used as the CSV file name).
    filter: 'filtered' applies ``filter_data`` to every workbook; any other
      value (default 'full') writes the unfiltered data. The value is also
      part of the output directory name.

  Raises:
    ValueError: if the dataset folder contains no workbook to process.
  """
  source_dir = f'datasets/{A_or_B}'
  output_dir = f'processed_datasets/{filter}data'
  os.makedirs(output_dir, exist_ok=True)

  # os.listdir returns every entry in arbitrary order. Skip hidden entries
  # (e.g. .ipynb_checkpoints, .DS_Store), Excel lock files ("~$...") and
  # sub-directories, and sort so the CSV row order is reproducible.
  workbook_names = sorted(
      name for name in os.listdir(source_dir)
      if not name.startswith(('.', '~$'))
      and os.path.isfile(f'{source_dir}/{name}')
  )
  if not workbook_names:
    raise ValueError(f'No workbooks found in {source_dir}')

  apply_filter = filter == 'filtered'
  frames = []
  for workbook_name in workbook_names:
    data = Processing(f'{source_dir}/{workbook_name}').run()
    # Keep only the frame that will actually be written.
    frames.append(filter_data(data) if apply_filter else data)

  pd.concat(frames).to_csv(f'{output_dir}/{A_or_B}.csv', index=False)
  


In [5]:
# Export both datasets, first unfiltered and then restricted to the
# filter_data time window (same call order as before: A, B, A, B).
for mode in ('full', 'filtered'):
  for dataset in ('A', 'B'):
    data_processed(dataset, mode)