In [1]:
import filecmp
from os import listdir, makedirs, remove, rename, replace
from os.path import isfile, join, isdir

import numpy as np
import pandas as pd

In [2]:
# Column names applied, in order, to the header-less ';'-delimited input files
# (passed as ``names=COLUMNS`` to ``pd.read_csv`` in ``open_files``).
COLUMNS = ['date', 'id', 'time', 'lat', 'lon', 'h', 'WGS', 'Ml', 'station', 'P/S', 'zeros', 'datetime', 'duration']

In [28]:
def fetch_files(path: str, ending: str) -> [str]:
    '''
    Lists the regular files located directly in ``path`` whose names end with ``ending``.

    Parameters:
        path:   directory to scan (sub-directories are not searched); a trailing
                '/' is appended when it is missing.
        ending: required file-name suffix, e.g. '.bul'. An empty string matches
                every file.

    Returns:
        The matching file names, sorted alphabetically and each prefixed with
        ``path``. When the directory cannot be listed a message is printed and
        an empty list is returned.
    '''
    # Guard against an empty path: indexing path[-1] would raise IndexError.
    if path and not path.endswith('/'):
        path += '/'
    try:
        # Sorted so the result does not depend on the arbitrary order of listdir().
        names = sorted(f for f in listdir(path) if isfile(join(path, f)))
    except OSError:
        print("Trouble opening a directory")
        return []
    # str.endswith also behaves correctly for an empty suffix, unlike f[-0:] slicing.
    return [path + f for f in names if f.endswith(ending)]

In [4]:
def open_files(files: [str]):
    '''
    Reads every given file into a pandas DataFrame.

    Each file is parsed as a header-less, ';'-delimited table whose columns are
    named after ``COLUMNS``. A ``file`` column holding the 0-based position of
    the frame in the returned list is added to every frame.

    Files that cannot be opened or parsed are reported on stdout and skipped.

    Parameters:
        files: paths of the files to read.

    Returns:
        A list with one DataFrame per successfully read file.
    '''
    dataframes = []
    for f in files:
        try:
            df = pd.read_csv(f, delimiter=';', header=None, names=COLUMNS, index_col=False)
        except (OSError, ValueError) as err:
            # pandas' ParserError/EmptyDataError and UnicodeDecodeError are ValueError subclasses.
            # The message is Czech for "Error while reading file".
            print('Chyba při čtení souboru', f, err)
            continue
        # Only successfully read files consume a number.
        df['file'] = len(dataframes)
        dataframes.append(df)

    return dataframes

In [5]:
def drop_unnecessary_columns(df, cols: [str]):
    '''
    Returns a copy of the dataframe without the given columns.

    Parameters:
        df:   source DataFrame (left unmodified).
        cols: labels of the columns to remove.

    Returns:
        A new DataFrame without ``cols``. If any label is missing, a message is
        printed and the original ``df`` object is returned unchanged.
    '''
    try:
        return df.drop(columns=cols)
    except KeyError:
        # DataFrame.drop raises KeyError when a requested label does not exist.
        print('Could not delete some these columns:', cols, 'Returning unchanged dataframe')
        return df

In [6]:
def merge_coordinates(df):
    '''
    Replaces the 'lat' and 'lon' columns with a single 'coordinates' column.

    The new column holds the whitespace-stripped 'lat' and 'lon' strings joined
    by one space and is placed after the remaining columns. Both source columns
    must contain strings (the ``.str`` accessor is used).

    The input frame is not modified; a new DataFrame is returned.
    '''
    coordinates = df['lat'].str.strip() + " " + df['lon'].str.strip()
    # Build the result on a copy so the caller's frame does not silently gain a column.
    return df.drop(columns=['lat', 'lon']).assign(coordinates=coordinates)

In [7]:
class Earthquake:
    '''
    Collects the phase records belonging to one (earthquake id, coordinates)
    pair and turns them into a per-station table of P and S times.

    Expected call order: ``add_record`` for every input row, then
    ``convert_to_pandas_df``, ``process_records`` and ``create_output_files``.
    '''

    # Column layout of the generated table.
    _OUTPUT_COLUMNS = ['STA', 'P', 'S', 'S-P']
    # Placeholder stored in 'S' and 'S-P' until an S record arrives for the station.
    _MISSING = 99999

    def __init__(self, coordinates, earthquake_id, order_number):
        self.__coordinates = coordinates
        self.__earthquake_id = earthquake_id
        self.__order_number = order_number
        self.__records = []
        # Output rows are gathered as plain dicts; the DataFrame is built once on export.
        self.__rows = []
        # Maps a station name to the position of its first P row in self.__rows.
        self.__row_of_station = {}
        self.__output = pd.DataFrame(columns=self._OUTPUT_COLUMNS)
        # S records seen before any P record of the same station.
        self.__buffer = []

    @property
    def coordinates(self) -> str:
        return self.__coordinates

    @property
    def earthquake_id(self) -> int:
        return self.__earthquake_id

    @property
    def order_number(self) -> int:
        return self.__order_number

    def add_record(self, record: pd.Series):
        ''' Stores one input row (without its 'coordinates' entry) in the record list '''
        self.__records.append(record.drop(['coordinates']))

    def convert_to_pandas_df(self):
        ''' Converts the collected list of records to a pandas dataframe '''
        self.__records = pd.DataFrame(self.__records)

    def process_records(self):
        '''
        Converts the stored records into output rows.

        The 'datetime' column is replaced by the offset from the earliest record,
        'zero_time' is that offset in whole milliseconds, and the records are
        processed in ascending 'zero_time' order. S records that had no matching
        P row at the time they were met are retried once at the end.
        '''
        if isinstance(self.__records, list):
            # Tolerate callers that skipped convert_to_pandas_df().
            self.convert_to_pandas_df()
        if self.__records.empty:
            return

        elapsed = self.__records['datetime'] - self.__records['datetime'].min()
        self.__records['datetime'] = elapsed
        # Floor-dividing by 1 ms gives integer milliseconds regardless of the
        # timedelta resolution pandas chose for the column.
        self.__records['zero_time'] = elapsed // pd.Timedelta(milliseconds=1)
        # Stable sort keeps the input order of records sharing the same time.
        self.__records = self.__records.sort_values('zero_time', kind='mergesort')

        for _, row in self.__records.iterrows():
            self.process_line(row)

        # Swap the buffer out first so lines that are still unmatched do not get
        # appended to the list being iterated.
        pending, self.__buffer = self.__buffer, []
        for line in pending:
            print('S value before a P value at', self.__earthquake_id)
            self.process_line(line)

    def process_line(self, line: pd.Series):
        '''
        Transforms a single record into the output format.

        A 'P' record appends a new row for its station with placeholder S values.
        An 'S' record fills 'S' and 'S-P' of the first row of its station, or is
        buffered when that station has no row yet. Other phase codes are ignored.
        '''
        phase = line['P/S'].strip()
        if phase == 'P':
            station = line['station']
            # Remember only the first P row of a station; S values are attached to it.
            self.__row_of_station.setdefault(station, len(self.__rows))
            self.__rows.append({
                'STA': station,
                'P': line['zero_time'],
                'S': self._MISSING,
                'S-P': self._MISSING,
            })

        elif phase == 'S':
            index = self.__row_of_station.get(line['station'])
            if index is None:
                self.__buffer.append(line)
                return
            target = self.__rows[index]
            target['S'] = line['zero_time']
            target['S-P'] = line['zero_time'] - target['P']

    def create_output_files(self, path):
        '''
        Writes the output table to '<path>/<earthquake_id>_<order_number>.csv'
        as a tab-separated file without the index column.
        '''
        # An empty path means the current directory; do not turn it into '/'.
        if path and not path.endswith('/'):
            path += '/'
        complete_path = path + str(self.earthquake_id) + '_' + str(self.order_number) + '.csv'
        print('output to', complete_path)
        self.__output = pd.DataFrame(self.__rows, columns=self._OUTPUT_COLUMNS)
        self.__output.to_csv(complete_path, index=False, sep='\t')

In [25]:
def make_groups(df):
    '''
    Divides the data into groups based on the 'id' column and then into
    subgroups based on the 'coordinates' column.

    Returns:
        A dict mapping every id (in order of first appearance) to a dict that
        maps each coordinates value (sorted) to an ``Earthquake`` holding the
        rows of that id/coordinates pair in their original order. Each
        earthquake's ``order_number`` is the rank in which its coordinates first
        appear among the rows of that id; its ``earthquake_id`` is the id's
        string form with the first four characters removed.

    Rows whose 'id' or 'coordinates' is missing are left out, because pandas'
    groupby drops missing keys by default.
    '''
    groups = {}
    for quake_key, quake_rows in df.groupby('id', sort=False):
        # Numbering follows first appearance, not the sorted order groupby iterates in.
        first_seen = {
            coords: number
            for number, coords in enumerate(quake_rows['coordinates'].dropna().unique())
        }
        subgroups = {}
        # Grouping by a scalar label (not a one-element list) keeps the keys plain
        # values; a list makes pandas >= 2 yield 1-tuples.
        for coords, rows in quake_rows.groupby('coordinates'):
            earthquake = Earthquake(coords, str(quake_key)[4:], first_seen[coords])
            for _, row in rows.iterrows():
                earthquake.add_record(row)
            subgroups[coords] = earthquake
        groups[quake_key] = subgroups
    return groups

In [9]:
def divide_data(df, output_dir='../output'):
    '''
    Splits the data into earthquakes and writes one output file per earthquake.

    Parameters:
        df:         prepared input frame; it is handed to ``make_groups``.
        output_dir: directory receiving the generated files; created when it
                    does not exist. Defaults to '../output'.
    '''
    # Without the directory every to_csv call would fail after all the processing.
    makedirs(output_dir, exist_ok=True)
    for group in make_groups(df).values():
        for earthquake in group.values():
            earthquake.convert_to_pandas_df()
            earthquake.process_records()
            earthquake.create_output_files(output_dir)


In [41]:
def compare_files(path):
    '''
    Collapses duplicate output files in ``path``.

    For every '<name>_0.csv' file:
      * if no '<name>_1.csv' exists, the file is renamed to '<name>.csv';
      * if '<name>_1.csv' exists with identical content, it is removed and
        '<name>_0.csv' is renamed to '<name>.csv';
      * if the contents differ, both files are kept untouched.

    Only the '_0'/'_1' pair is examined; files with higher numbers are left alone.
    '''
    first_suffix, second_suffix = '_0.csv', '_1.csv'
    for first in fetch_files(path, first_suffix):
        stem = first[:-len(first_suffix)]
        second = stem + second_suffix
        merged = stem + '.csv'

        if not isfile(second):
            replace(first, merged)
            continue

        print(first)
        # shallow=False forces a byte-wise comparison; the default only compares
        # size and modification time when those match, which could delete a file
        # whose content actually differs.
        if filecmp.cmp(first, second, shallow=False):
            print("Files are the same => removing ")
            remove(second)
            replace(first, merged)
        else:
            print('Files are different => keeping both versions')


In [36]:
def initialize(data_dir='../data'):
    '''
    Runs the conversion pipeline on all '.bul' files found in ``data_dir``.

    Steps: read the files, merge 'lat'/'lon' into 'coordinates', drop the columns
    that are not needed, strip the station names, parse 'datetime' and hand the
    frame to ``divide_data``, which writes the output files.

    Parameters:
        data_dir: directory containing the input files. Defaults to '../data'.

    Raises:
        ValueError: when no input file could be read.
    '''
    frames = open_files(fetch_files(data_dir, '.bul'))
    if not frames:
        # pd.concat would raise a far less helpful "No objects to concatenate".
        raise ValueError("No readable '.bul' files found in " + str(data_dir))

    df = pd.concat(frames, ignore_index=True)
    df = merge_coordinates(df)
    cols = ['h', 'WGS', 'Ml', 'date', 'time', 'zeros', 'duration', 'file']
    df = drop_unnecessary_columns(df, cols)
    df['station'] = df['station'].str.strip()
    # The format starts with a space because the raw field carries a leading blank.
    df['datetime'] = pd.to_datetime(df['datetime'], format=" %Y-%m-%d %H:%M:%S.%f")
    divide_data(df)

In [43]:
# Convert the '.bul' files from ../data into per-earthquake files in ../output ...
initialize()
# ... then merge identical '_0'/'_1' output pairs and drop the '_0' suffix where possible.
compare_files('../output')

output to ../output/4746_0.csv
output to ../output/4763_0.csv
output to ../output/4799_0.csv
output to ../output/4801_0.csv
output to ../output/4812_0.csv
output to ../output/4812_1.csv
output to ../output/4829_0.csv
output to ../output/4877_0.csv
output to ../output/4902_0.csv
output to ../output/4905_0.csv
output to ../output/4923_1.csv
output to ../output/4923_0.csv
output to ../output/5008_1.csv
output to ../output/5008_0.csv
output to ../output/5015_0.csv
output to ../output/5017_0.csv
output to ../output/5048_0.csv
output to ../output/5048_1.csv
output to ../output/5051_0.csv
output to ../output/5051_1.csv
output to ../output/5052_0.csv
output to ../output/5052_1.csv
output to ../output/5054_0.csv
output to ../output/5055_0.csv
output to ../output/5055_1.csv
output to ../output/5058_1.csv
output to ../output/5058_0.csv
S value before a P value at 5060
output to ../output/5060_1.csv
S value before a P value at 5060
output to ../output/5060_0.csv
output to ../output/5065_0.csv
outp