In [35]:
import pandas as pd
from functools import reduce
import os

In [146]:
class Processing():
  """Turn one multi-sheet Excel workbook into a single cleaned DataFrame.

  Each sheet is expected to carry a 'Date (UTC)' column. `run()` chains the
  individual steps: merge the sheets, add a US/Central timestamp, shorten
  the column names, rescale two metrics, add a combined read-time column
  and keep only rows inside a time-of-day window.

  Every transformation method returns a new DataFrame and leaves the frame
  it was given untouched, so steps can be re-run safely in a notebook.
  """

  def __init__(self, file_name):
    """Store the path of the workbook that `run()` will process."""
    self.file_name = file_name

  def merged_func(self, file_name=None):
    """Read every sheet, resample to one-minute means and inner-join them.

    Parameters
    ----------
    file_name : path-like, optional
        Workbook to read. Defaults to the path given to the constructor.

    Returns
    -------
    pandas.DataFrame
        One row per minute present in *all* sheets (inner join on
        'Date (UTC)'), with the columns of every sheet side by side.
    """
    if file_name is None:
      file_name = self.file_name

    # The context manager closes the workbook handle once all sheets are read.
    with pd.ExcelFile(file_name) as xlsx_file:
      dfs = [
          pd.read_excel(xlsx_file, sheet_name=sheet_name)
            .set_index('Date (UTC)')
            .resample('min')
            .mean()
            .reset_index()
          for sheet_name in xlsx_file.sheet_names
      ]

    return reduce(
        lambda left, right: pd.merge(left, right, on='Date (UTC)', how='inner'),
        dfs,
    )

  def to_CST(self, data):
    """Return a copy of `data` with a leading, tz-naive 'Date (CST)' column.

    'Date (UTC)' is treated as UTC, converted to the 'US/Central' zone and
    then stripped of its timezone. NOTE(review): 'US/Central' observes
    daylight saving time, so the column is local Central time rather than
    fixed CST - confirm this is intended.
    """
    central = (data['Date (UTC)'].dt.tz_localize('utc')
                                 .dt.tz_convert('US/Central')
                                 .dt.tz_localize(None))
    with_central = data.assign(**{'Date (CST)': central})
    # set_index + reset_index moves the new column to the first position.
    return with_central.set_index('Date (CST)').reset_index()

  def change_name(self, data):
    """Return `data` with the long exported column names shortened.

    Columns that are not listed in the mapping are left as they are.
    """
    # Raw strings keep the literal backslash in "PRODSQL\LOCAL" without
    # relying on Python's handling of the invalid escape sequence "\L".
    new_names = {
        'Date (CST)': 'Date',
        r'Batch requests/sec for PRODSQL\LOCAL': 'Batch requests/sec',
        r'User connections for PRODSQL\LOCAL': 'User connections',
        r'SQL Server: processor time for PRODSQL\LOCAL': 'Processor Time',
        r'SQL Server: free memory for PRODSQL\LOCAL': 'Free memory',
        r'Page reads/sec for PRODSQL\LOCAL': 'Page reads/sec',
        'Disk avg. read time for prodsql1-vm.wwwoodproducts.com > M: (SQL Data 1)': 'Disk avg. read time > M: Data 1',
        'Disk avg. read time for prodsql1-vm.wwwoodproducts.com > N: (SQL Data 2)': 'Disk avg. read time > N: Data 2',
        'Disk avg. read time for prodsql1-vm.wwwoodproducts.com > I: (SQL Index)': 'Disk avg. read time > I: Index',
        'Disk avg. write time for prodsql1-vm.wwwoodproducts.com > M: (SQL Data 1)': 'Disk avg. write time > M: Data 1',
        'Disk avg. write time for prodsql1-vm.wwwoodproducts.com > N: (SQL Data 2)': 'Disk avg. write time > N: Data 2',
        'Disk avg. write time for prodsql1-vm.wwwoodproducts.com > I: (SQL Index)': 'Disk avg. write time > I: Index',
        r'Compilations/batch for PRODSQL\LOCAL': 'Compilations/batch',
        r'Latch wait time for PRODSQL\LOCAL': 'Latch wait time',
    }
    return data.rename(columns=new_names)

  def to_GB(self, data):
    """Return a copy of `data` with 'Free memory' divided by 1024**3.

    Presumably converts bytes to gibibytes - the source unit is not
    visible here, confirm against the export.
    """
    return data.assign(**{'Free memory': data['Free memory'] / (1024**3)})

  def multiply_100(self, data):
    """Return a copy of `data` with 'Compilations/batch' multiplied by 100.

    Presumably turns a ratio into a percentage - confirm with the data owner.
    """
    return data.assign(**{'Compilations/batch': data['Compilations/batch'] * 100})

  def combine_read_time(self, data):
    """Return a copy of `data` with 'Disk avg. read time data' appended.

    The new column is the row-wise sum of the M: and N: data-disk read times.
    """
    combined = (data['Disk avg. read time > M: Data 1']
                + data['Disk avg. read time > N: Data 2'])
    return data.assign(**{'Disk avg. read time data': combined})

  def filter_data(self, data, start='05:00:00', end='17:00:00'):
    """Keep rows whose 'Date' time-of-day lies in [start, end], inclusive.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime 'Date' column.
    start, end : str
        Clock times parseable by `pd.to_datetime`; the defaults reproduce
        the original 05:00-17:00 window.
    """
    start_time = pd.to_datetime(start).time()
    end_time = pd.to_datetime(end).time()
    time_of_day = data['Date'].dt.time  # computed once, used for both bounds
    return data[(time_of_day >= start_time) & (time_of_day <= end_time)]

  def run(self):
    """Execute the full pipeline on `self.file_name` and return the result."""
    data = self.merged_func(self.file_name)
    data = self.to_CST(data)
    data = self.change_name(data)
    data = self.to_GB(data)
    data = self.multiply_100(data)
    data = self.combine_read_time(data)
    data = self.filter_data(data)
    return data


In [144]:
def data_processed(A_or_B):
  """Run the `Processing` pipeline on every workbook of one dataset group.

  Reads each file in ``datasets/{A_or_B}`` and writes the processed frame to
  ``processed_datasets/{A_or_B}/{original file name}.csv`` (the DataFrame
  index is written as the first CSV column, as before).

  Directory entries that cannot be workbooks are skipped instead of
  crashing the run: sub-directories (e.g. ``.ipynb_checkpoints``), hidden
  files and the ``~$...`` lock files Excel creates while a workbook is open.

  Parameters
  ----------
  A_or_B : str
      Name of the dataset group, used as the sub-folder name.
  """
  source_dir = f'datasets/{A_or_B}'
  target_dir = f'processed_datasets/{A_or_B}'

  # Sorted so the processing order does not depend on the file system.
  entries = sorted(os.listdir(source_dir))
  # Created once, up front, rather than on every loop iteration.
  os.makedirs(target_dir, exist_ok=True)

  for xlsx_file in entries:
    source_path = f'{source_dir}/{xlsx_file}'
    if xlsx_file.startswith(('.', '~$')) or not os.path.isfile(source_path):
      continue
    processed = Processing(source_path).run()
    processed.to_csv(f'{target_dir}/{xlsx_file}.csv')
  


In [145]:
# Produce the per-workbook CSVs for both dataset groups, A first, then B.
for group_name in ('A', 'B'):
  data_processed(group_name)

In [152]:
def concat(A_or_B):
  """Stack every processed CSV of one dataset group into a single file.

  Reads the files in ``processed_datasets/{A_or_B}`` and writes their
  row-wise concatenation to ``processed_datasets/{A_or_B}/full{A_or_B}.csv``.

  The output lives in the same folder as the inputs, so a previous output
  file is explicitly excluded from the inputs; otherwise every re-run of
  the cell would append the old result to itself and duplicate all rows.
  Sub-directories and hidden entries are skipped as well.

  Parameters
  ----------
  A_or_B : str
      Name of the dataset group, used as the sub-folder name.

  Raises
  ------
  ValueError
      If the folder contains no input CSV files.
  """
  folder = f'processed_datasets/{A_or_B}'
  output_name = f'full{A_or_B}.csv'

  dfs = []
  # Sorted so the row order of the result is reproducible.
  for csv_file in sorted(os.listdir(folder)):
    csv_path = f'{folder}/{csv_file}'
    if csv_file == output_name or csv_file.startswith('.'):
      continue
    if not os.path.isfile(csv_path):
      continue
    dfs.append(pd.read_csv(csv_path))

  if not dfs:
    raise ValueError(f'no processed CSV files found in {folder}')

  data = pd.concat(dfs)
  data.to_csv(f'{folder}/{output_name}')
# Build the combined CSV for both dataset groups, A first, then B.
for group_name in ('A', 'B'):
  concat(group_name)
