In [1]:
#modules
import pandas as pd
import numpy as np
import sys
import os
from tqdm.notebook import tqdm 
import seaborn as sns
import matplotlib.pyplot as plt
from src import data
import matplotlib as mpl

In [2]:
from time import time

# Time a full one-day load through the packaged loader (src.data), then
# display the dimensions of the frame it returned.
load_began = time()
dayDf = data.read_day('hawaii', 2012, 300)
load_elapsed = time() - load_began
print("%s seconds" % load_elapsed)

dayDf.shape

  1%|          | 9/1595 [00:00<00:18, 87.23it/s]

Reading dataframes...


100%|██████████| 1595/1595 [00:18<00:00, 86.97it/s]


Concatenating dataframes...
23.021814107894897 seconds


(5761, 9570)

In [3]:
# Preview the first rows of the frame returned by data.read_day.
dayDf.head()

Unnamed: 0_level_0,pg2r__G03,pg2r__G03_lon,pg2r__G03_lat,pg2r__G03_h_ipp,pg2r__G03_ele,pg2r__G03_azi,pg2r__G17,pg2r__G17_lon,pg2r__G17_lat,pg2r__G17_h_ipp,...,pg2r__G26_lat,pg2r__G26_h_ipp,pg2r__G26_ele,pg2r__G26_azi,ktpm__G13,ktpm__G13_lon,ktpm__G13_lat,ktpm__G13_h_ipp,ktpm__G13_ele,ktpm__G13_azi
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-10-26 00:00:15,,,,,,,,,,,...,,,,,,,,,,
2012-10-26 00:00:30,-0.005581,-158.785435,23.827697,349966.870265,25.826925,323.940675,,,,,...,,,,,,,,,,
2012-10-26 00:00:45,,,,,,,,,,,...,,,,,,,,,,
2012-10-26 00:01:00,-0.005064,-158.752285,23.802335,350026.199881,26.017205,324.03158,,,,,...,,,,,,,,,,
2012-10-26 00:01:15,,,,,,,,,,,...,,,,,,,,,,


In [4]:
import dask.dataframe as dd
import datetime
import os
import pandas as pd
from pathlib import Path

In [8]:

def read_day(
    location: str = "hawaii",
    year: int = 2000,
    day_of_year: int = 300,
    data_root: "str | os.PathLike | None" = None,
) -> pd.DataFrame:
    """
    Reads the data for a particular location and day of year.

    Every file found under ``<data_root>/<location>/<year>/<day_of_year>`` (except
    ``.DS_Store``) is parsed into its own dataframe indexed by second of day ("sod"); the
    per-file frames are then joined column-wise and the index is converted to timestamps.

    :param location: Specifies the location in which we want to load data (default: hawaii).
    :param year: Specifies the year in which to load data, specified as an integer (default: 2000).
    :param day_of_year: Specifies the day of year in which to load data, specified as an
    integer (default: 300).
    :param data_root: Optional directory that contains the per-location folders. When omitted,
    the original hard-coded project data directory is used, so existing calls are unaffected.
    :return: A Pandas dataframe that includes the data for the specified location and day, with
    a Pandas datetime index named "timestamp". Each file contributes one column named
    ``<pass_id>`` (its dsTEC/dt values) plus one ``<pass_id>_<name>`` column for every other
    header field (e.g. lon, lat, h_ipp, ele, azi).
    :raises ValueError: If the day directory contains no data files, or if a file's header does
    not contain a "sod" column.
    """

    # specify the root path to the data
    if data_root is None:
        # NOTE(review): machine-specific fallback kept only so existing calls keep working;
        # pass data_root explicitly when running anywhere else.
        data_root = Path('/Users/hamlinliu/Documents/ACADEMIC/UCLA/sTEC_Project/sTEC-d-dt-Anomaly-Detection/src/data.py').parents[1] / "data"
    location_year_doy_path = Path(data_root) / location / str(year) / str(day_of_year)

    # collect the paths for each file of this day (directory order is preserved as-is)
    satellite_paths = [
        location_year_doy_path / p
        for p in os.listdir(location_year_doy_path)
        if p != ".DS_Store"
    ]
    if not satellite_paths:
        # fail early with a readable message instead of pd.concat's "No objects to concatenate"
        raise ValueError("No data files found in %s" % location_year_doy_path)

    # gather the data for each file from this day and location
    stec_dfs = list()

    print("Reading dataframes...")

    for sat in tqdm(satellite_paths):

        # Build the column prefix from the file name only (never from directory names):
        # first four characters of the name + "__" + the last "_"-delimited token.
        # NOTE(review): the example output (e.g. "pg2r__G03") suggests the first part is a
        # station code and the second a satellite id - confirm against the data spec.
        file_name = sat.name
        name_head = file_name.split(".")[0][:4]
        name_tail = file_name.split("_")[-1].split(".")[0]
        pass_id = name_head + "__" + name_tail

        # Only the header line is needed here; the context manager closes the handle
        # straight away instead of leaking one descriptor per file.
        with open(sat, 'r') as f:
            header = f.readline()
        header = header.replace('#', '').replace("dsTEC/dt [TECU/s]", "dsTEC/dt").replace("elev", "ele")

        value_cols = header.split()
        value_cols.remove("sod")

        # the dsTEC/dt column is named after the pass itself, the rest get a suffix
        new_cols = [
            pass_id if col == "dsTEC/dt" else pass_id + "_" + col
            for col in value_cols
        ]

        # fields are separated by a double tab or a single space, hence the regex
        # separator and the python engine
        df = pd.read_table(
            sat,
            index_col='sod',
            sep="\t\t| ",
            names=["sod"] + new_cols,
            engine="python",
            skiprows=1
        )

        stec_dfs.append(df[new_cols])

    print("Concatenating dataframes...")
    # merge all of the file specific dataframes together, aligned on second of day
    stec_values = pd.concat(stec_dfs, axis=1)

    # convert second of day (sod) to timestamps: midnight of the requested day plus the
    # whole number of seconds (fractions are dropped, as the per-row conversion did)
    midnight = pd.Timestamp(datetime.datetime(year, 1, 1) + datetime.timedelta(days=day_of_year - 1))
    second_offsets = pd.to_timedelta(stec_values.index, unit="s").floor("s")

    # set the timestamps as a Pandas DateTimeIndex
    stec_values.index = pd.DatetimeIndex(midnight + second_offsets, name="timestamp")

    return stec_values

In [9]:
# Time the notebook-local read_day on the same day, for comparison with the
# packaged loader timed earlier.
load_began = time()
dayDf = read_day('hawaii', 2012, 300)
load_elapsed = time() - load_began
print("%s seconds" % load_elapsed)

Reading dataframes...


HBox(children=(FloatProgress(value=0.0, max=1595.0), HTML(value='')))


Concatenating dataframes...
23.191694021224976 seconds


In [7]:
# Preview the first rows of dayDf.
# NOTE(review): this cell's execution count (7) is lower than the cells above it (8, 9),
# so the saved output may predate the local read_day call - re-run top to bottom to confirm.
dayDf.head()

Unnamed: 0_level_0,pg2r__G03,pg2r__G03_lon,pg2r__G03_lat,pg2r__G03_h_ipp,pg2r__G03_ele,pg2r__G03_azi,pg2r__G17,pg2r__G17_lon,pg2r__G17_lat,pg2r__G17_h_ipp,...,pg2r__G26_lat,pg2r__G26_h_ipp,pg2r__G26_ele,pg2r__G26_azi,ktpm__G13,ktpm__G13_lon,ktpm__G13_lat,ktpm__G13_h_ipp,ktpm__G13_ele,ktpm__G13_azi
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-10-26 00:00:15,,,,,,,,,,,...,,,,,,,,,,
2012-10-26 00:00:30,-0.005581,-158.785435,23.827697,349966.870265,25.826925,323.940675,,,,,...,,,,,,,,,,
2012-10-26 00:00:45,,,,,,,,,,,...,,,,,,,,,,
2012-10-26 00:01:00,-0.005064,-158.752285,23.802335,350026.199881,26.017205,324.03158,,,,,...,,,,,,,,,,
2012-10-26 00:01:15,,,,,,,,,,,...,,,,,,,,,,
