In [1]:
## Import needed modules ##

import pandas as pd
import os 

In [2]:
## Column names of one Storage Analytics log entry (log format version 2.0) ##
## Field reference: https://docs.microsoft.com/en-us/rest/api/storageservices/storage-analytics-log-format ##

# One name per field, in the order the fields appear on a log line.
# The list is passed to pd.read_csv(names=...) when the log files are loaded.
header = [
    "<version-number>",
    "<request-start-time>",
    "<operation-type>",
    "<request-status>",
    "<http-status-code>",
    "<end-to-end-latency-in-ms>",
    "<server-latency-in-ms>",
    "<authentication-type>",
    "<requester-account-name>",
    "<owner-account-name>",
    "<service-type>",
    "<request-url>",
    "<requested-object-key>",
    "<request-id-header>",
    "<operation-count>",
    "<requester-ip-address>",
    "<request-version-header>",
    "<request-header-size>",
    "<request-packet-size>",
    "<response-header-size>",
    "<response-packet-size>",
    "<request-content-length>",
    "<request-md5>",
    "<server-md5>",
    "<etag-identifier>",
    "<last-modified-time>",
    "<conditions-used>",
    "<user-agent-header>",
    "<referrer-header>",
    "<client-request-id>",
    "<user-object-id>",
    "<tenant-id>",
    "<application-id>",
    "<audience>",
    "<issuer>",
    "<user-principal-name>",
    "<reserved-field>",
    "<authorization-detail>",
]

In [3]:
## Read all the subfolders ##

directory = "C:\\Users\\zoeylan\\AAA_Zoey_Working\\Logs\\" # Please change this to your own path

# os.walk is top-down by default, so the pass whose root is `directory` itself
# lists the first-level folders. That pass is excluded by comparing the root
# instead of slicing off a fixed number of entries, so the result no longer
# depends on how many first-level folders happen to exist. Every folder found
# below the first level is kept, in walk order.
folder_list = [
    os.path.join(root, subdirectory)
    for root, subdirectories, files in os.walk(directory)
    if root != directory
    for subdirectory in subdirectories
]

print("Number of subfolders in total: ", len(folder_list))

print(folder_list[:2]) # Print out the first two items in the folder list

Number of subfolders in total:  24
['C:\\Users\\zoeylan\\AAA_Zoey_Working\\Logs\\0000\\0000', 'C:\\Users\\zoeylan\\AAA_Zoey_Working\\Logs\\0000\\0100']


In [4]:
## Collect the .log files found under every subfolder ##

# log_list holds one inner list per entry of folder_list, in the same order.
# Each inner list contains the full paths of the files ending in ".log" that
# os.walk finds anywhere below that subfolder.
log_list = []

for subfolder in folder_list:
    paths_in_subfolder = [
        os.path.join(parent, name)
        for parent, _child_dirs, names in os.walk(subfolder)
        for name in names
        if name.endswith(".log")
    ]
    log_list.append(paths_in_subfolder)

print(log_list[:2]) # Show the file lists of the first two subfolders

print(len(log_list[0])) # Number of .log files found in the first subfolder

[['C:\\Users\\zoeylan\\AAA_Zoey_Working\\Logs\\0000\\0000\\000000.log', 'C:\\Users\\zoeylan\\AAA_Zoey_Working\\Logs\\0000\\0000\\000001.log', 'C:\\Users\\zoeylan\\AAA_Zoey_Working\\Logs\\0000\\0000\\000002.log', 'C:\\Users\\zoeylan\\AAA_Zoey_Working\\Logs\\0000\\0000\\000003.log', 'C:\\Users\\zoeylan\\AAA_Zoey_Working\\Logs\\0000\\0000\\000004.log', 'C:\\Users\\zoeylan\\AAA_Zoey_Working\\Logs\\0000\\0000\\000005.log', 'C:\\Users\\zoeylan\\AAA_Zoey_Working\\Logs\\0000\\0000\\000006.log', 'C:\\Users\\zoeylan\\AAA_Zoey_Working\\Logs\\0000\\0000\\000007.log', 'C:\\Users\\zoeylan\\AAA_Zoey_Working\\Logs\\0000\\0000\\000008.log', 'C:\\Users\\zoeylan\\AAA_Zoey_Working\\Logs\\0000\\0000\\000009.log'], ['C:\\Users\\zoeylan\\AAA_Zoey_Working\\Logs\\0000\\0100\\000000.log', 'C:\\Users\\zoeylan\\AAA_Zoey_Working\\Logs\\0000\\0100\\000001.log', 'C:\\Users\\zoeylan\\AAA_Zoey_Working\\Logs\\0000\\0100\\000002.log', 'C:\\Users\\zoeylan\\AAA_Zoey_Working\\Logs\\0000\\0100\\000003.log', 'C:\\Users\\zoey

In [5]:
# pd.DataFrame(log_list)

In [6]:
## Prepare the two empty result tables ##
## One is keyed by requester-ip-address, the other by user-object-id ##

# Both tables share the layout [key column, '<operation-type>'].
iptable, objectidtable = (
    pd.DataFrame(columns=[key_column, '<operation-type>'])
    for key_column in ('<requester-ip-address>', '<user-object-id>')
)

# Show both (still empty) tables, one after the other.
print(iptable, objectidtable, sep="\n")

Empty DataFrame
Columns: [<requester-ip-address>, <operation-type>]
Index: []
Empty DataFrame
Columns: [<user-object-id>, <operation-type>]
Index: []


In [7]:
## Read each hour's log files into one table, then count the "AppendFile" ##
## requests of that hour per user-object-id and per requester-ip-address ##


def _extend_counts(table, pieces):
    """Return `table` with the rows of every non-empty frame in `pieces` added.

    Empty frames are left out of the concatenation so they cannot influence
    the resulting column dtypes. When nothing non-empty remains, `table` is
    returned unchanged.
    """
    frames = [frame for frame in (table, *pieces) if not frame.empty]
    if not frames:
        return table
    return pd.concat(frames, ignore_index=True)


hourly_objectid_counts = []
hourly_ip_counts = []

# i is only a progress indicator; it reaches len(log_list) on the last pass.
for i, hourlylog in enumerate(log_list, start=1):
    print("Hour No.", i)

    frames = [
        pd.read_csv(log, sep=';', low_memory=False, names=header)
        for log in hourlylog
    ]
    if not frames:
        continue  # no log file for this hour; pd.concat raises on an empty list

    # Combine all files of the hour in a single concat
    # (DataFrame.append was removed in pandas 2.0).
    log_df = pd.concat(frames, ignore_index=True)

    # Filter out "AppendFile" operation only as an example
    write = log_df[log_df['<operation-type>'] == 'AppendFile']

    # Count the requests per user-object-id & requester-ip-address.
    # This runs once per hour, after every file of the hour has been combined:
    # counting the growing table after each file would count the rows of
    # earlier files again on every later file.
    objectid = write.groupby(['<user-object-id>'])['<operation-type>'].count().reset_index()
    ip = write.groupby(['<requester-ip-address>'])['<operation-type>'].count().reset_index()

    hourly_objectid_counts.append(objectid)
    hourly_ip_counts.append(ip)

# Add the hourly counts to the result tables in one step each.
objectidtable = _extend_counts(objectidtable, hourly_objectid_counts)
iptable = _extend_counts(iptable, hourly_ip_counts)

Hour No. 1
Hour No. 2
Hour No. 3
Hour No. 4
Hour No. 5
Hour No. 6
Hour No. 7
Hour No. 8
Hour No. 9
Hour No. 10
Hour No. 11
Hour No. 12
Hour No. 13
Hour No. 14
Hour No. 15
Hour No. 16
Hour No. 17
Hour No. 18
Hour No. 19
Hour No. 20
Hour No. 21
Hour No. 22
Hour No. 23
Hour No. 24


In [8]:
## Collapse repeated keys: one row per key with its counts added up ##

# Group by the key column, sum the '<operation-type>' counts, and turn the
# grouped result back into a two-column table.
iptable = (
    iptable
    .groupby(['<requester-ip-address>'])['<operation-type>']
    .sum()
    .to_frame()
    .reset_index()
)
objectidtable = (
    objectidtable
    .groupby(['<user-object-id>'])['<operation-type>']
    .sum()
    .to_frame()
    .reset_index()
)

In [9]:
## Write both result tables to csv files in the current working directory ##

for csv_name, result_table in (
    ('iptable.csv', iptable),
    ('objectidtable.csv', objectidtable),
):
    # Default to_csv settings are used, so the row index is written as well.
    result_table.to_csv(csv_name)