# Principal Component Analysis on QPESUMS dataset

This notebook demonstrates how to perform [Principal Component Analysis](https://en.wikipedia.org/wiki/Principal_component_analysis) on the QPESUMS dataset.

In [29]:
# Loading libraries
import os, csv, logging, argparse
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA, IncrementalPCA

# Data dimension of one QPESUMS array: nLayer * nY * nX (6*275*162)
nLayer, nY, nX = 6, 275, 162

# Scan QPESUMS data in *.npy: 6*275*162 
def search_dbz(srcdir):
    """Collect every ``*.npy`` file found under ``srcdir`` into a table.

    The directory tree is walked recursively (links to directories are
    followed). The part of each file name in front of its first ``.`` is
    stored as that file's timestamp string.

    Args:
        srcdir: Root directory to scan.

    Returns:
        pandas.DataFrame with the columns ``furi`` (path of the file) and
        ``timestamp``, sorted by ``timestamp`` and re-indexed from 0.
    """
    records = [
        [os.path.join(root, fname), fname.split('.')[0]]
        for root, _subdirs, fnames in os.walk(srcdir, followlinks=True)
        for fname in fnames
        if fname.endswith('.npy')
    ]
    table = pd.DataFrame(records, columns=['furi', 'timestamp'])
    return table.sort_values(by=['timestamp']).reset_index(drop=True)

# Read uris containing QPESUMS data in the format of 6*275*162 
def loadDBZ(flist):
    """Load a list of dbz files (npy format) into one float32 numpy array.

    Args:
        flist: Iterable of file paths, each handed to ``np.load``.

    Returns:
        ``np.ndarray`` of dtype ``float32`` holding the loaded arrays stacked
        along a new leading axis, in the order given by ``flist``.
    """
    arrays = [np.load(path) for path in flist]
    return np.array(arrays, dtype=np.float32)

def fit_ipca_partial(finfo, nc=20, bs=100):
    """Fit an IncrementalPCA model by feeding it the data batch by batch.

    Records are read ``bs`` at a time with ``read_dbz`` and each batch is
    passed to ``IncrementalPCA.partial_fit``, so only one batch of data is
    held in memory at once.

    Args:
        finfo: Sequence supporting ``len()`` and integer indexing; element 0
            of every record is the path handed to ``read_dbz``.
        nc: Number of principal components to keep.
        bs: Number of records read for each ``partial_fit`` call.

    Returns:
        The ``IncrementalPCA`` instance after all usable batches were fitted.
    """
    ipca = IncrementalPCA(n_components=nc, batch_size=bs)
    n_records = len(finfo)
    # Loop through finfo one batch at a time
    for start in range(0, n_records, bs):
        stop = min(start + bs, n_records)   # the last batch may be shorter
        batch = []
        for j in range(start, stop):
            f = finfo[j]
            logging.debug('Reading data from: ' + f[0])
            tmp = read_dbz(f[0])
            # Records for which read_dbz returned None are left out
            if tmp is not None:
                batch.append(tmp)
        if not batch:
            # np.array([]) is an empty 1-D array, which partial_fit cannot take
            # as a (n_samples, n_features) matrix -- skip instead of crashing.
            logging.warning('No usable data in records %d-%d, batch skipped.',
                            start, stop - 1)
            continue
        # Partial fit with batch data
        dbz = np.array(batch)
        print(dbz.shape)
        ipca.partial_fit(dbz)
    return(ipca)


def transform_dbz(ipca, finfo):
    """Project every record listed in ``finfo`` with the fitted ``ipca``.

    Args:
        ipca: Object providing ``transform`` and ``n_components``.
        finfo: Sequence supporting ``len()`` and integer indexing; element 0
            of every record is the path handed to ``read_dbz``.

    Returns:
        List with one 1-D array per record, in input order. A record for
        which ``read_dbz`` returns None is reported on stdout and contributes
        a zero vector of length ``ipca.n_components``.
    """
    projected = []
    for idx in range(len(finfo)):
        uri = finfo[idx][0]
        logging.debug('Reading data from: ' + uri)
        raw = read_dbz(uri)
        if raw is not None:
            # transform() takes a 2-D (1, n_features) row; keep the flat scores
            row = raw.reshape(1, len(raw))
            projected.append(ipca.transform(row).flatten())
        else:
            # No data for this record: use an all-zero placeholder
            print('File empty: ' + uri)
            projected.append(np.zeros(ipca.n_components))
    # Report the overall shape of the projected data
    print(np.array(projected).shape)
    return projected


def writeToCsv(output, fname, header=None):
    """Write rows to a CSV file, replacing any existing file of that name.

    The file is encoded as UTF-8 with a byte-order mark and every field is
    quoted.

    Args:
        output: Iterable of rows, each row an iterable of field values.
        fname: Path of the file to write.
        header: Optional row written before the data rows.

    Returns:
        0 once the file has been written.
    """
    with open(fname, 'w', newline='', encoding='utf-8-sig') as fh:
        out = csv.writer(fh, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        if header is not None:
            out.writerow(header)
        out.writerows(output)
    return 0


## Customized Parameters

In the following section we need to specify some parameters such as the path to the QPESUMS data.

In [30]:
# Directory holding the QPESUMS *.npy files. NOTE: this is a machine-specific
# absolute path -- change it to the location of your own copy of the data.
DATADIR = '/Users/tsyo/work.db/workspace/qpetw/dbz_2014061316/'
# Testing: scan DATADIR and display the resulting file table
# (columns 'furi' and 'timestamp', sorted by timestamp).
finfo = search_dbz(DATADIR)
finfo

Unnamed: 0,furi,timestamp
0,/Users/tsyo/work.db/workspace/qpetw/dbz_201406...,2014061301
1,/Users/tsyo/work.db/workspace/qpetw/dbz_201406...,2014061302
2,/Users/tsyo/work.db/workspace/qpetw/dbz_201406...,2014061303
3,/Users/tsyo/work.db/workspace/qpetw/dbz_201406...,2014061304
4,/Users/tsyo/work.db/workspace/qpetw/dbz_201406...,2014061305
...,...,...
91,/Users/tsyo/work.db/workspace/qpetw/dbz_201406...,2014061620
92,/Users/tsyo/work.db/workspace/qpetw/dbz_201406...,2014061621
93,/Users/tsyo/work.db/workspace/qpetw/dbz_201406...,2014061622
94,/Users/tsyo/work.db/workspace/qpetw/dbz_201406...,2014061623


In [39]:
def correct_timestamp(ts):
    """Normalise a time-stamp string of the form YYYYmmddHH.

    If the hour part (everything after the first 8 characters) is '24', the
    date is moved forward by one day and the hour becomes '00'. Any other
    string is returned unchanged.

    Args:
        ts: Time-stamp string with the date in its first 8 characters.

    Returns:
        The corrected time-stamp string.
    """
    import datetime
    if ts[8:] != '24':
        return ts
    day = datetime.datetime.strptime(ts[:8], '%Y%m%d')
    next_day = day + datetime.timedelta(days=1)
    return next_day.strftime('%Y%m%d') + '00'

In [41]:
# Create a range of dates in Python
# https://stackoverflow.com/questions/993358/creating-a-range-of-dates-in-python
import datetime

# Time-stamp strings stored under index labels 0 and len-1 of the file table
# (search_dbz returns the table sorted by timestamp with a 0..n-1 index).
ts0 = finfo['timestamp'][0]
ts1 = finfo['timestamp'][finfo.shape[0]-1]

# Raw values as found in the file names
print(ts0)
print(ts1)

# Values after rewriting an hour of '24' as '00' of the following day
print(correct_timestamp(ts0))
print(correct_timestamp(ts1))

# Parse the corrected strings; an hour of '24' would not be accepted by %H,
# hence the correction before strptime.
starttime = datetime.datetime.strptime(correct_timestamp(ts0), '%Y%m%d%H')
endtime = datetime.datetime.strptime(correct_timestamp(ts1), '%Y%m%d%H')
timestep = datetime.timedelta(hours=1)
print(starttime)
print(endtime)

# Build the list of hourly time-stamp strings from starttime to endtime
# (inclusive). NOTE: starttime is advanced in place, so after the loop it no
# longer holds the first time stamp.
tslist = []
while starttime <= endtime:
    tslist.append(starttime.strftime('%Y%m%d%H'))
    starttime += timestep
    
print(tslist)

2014061301
2014061624
2014061301
2014061700
2014-06-13 01:00:00
2014-06-17 00:00:00
['2014061301', '2014061302', '2014061303', '2014061304', '2014061305', '2014061306', '2014061307', '2014061308', '2014061309', '2014061310', '2014061311', '2014061312', '2014061313', '2014061314', '2014061315', '2014061316', '2014061317', '2014061318', '2014061319', '2014061320', '2014061321', '2014061322', '2014061323', '2014061400', '2014061401', '2014061402', '2014061403', '2014061404', '2014061405', '2014061406', '2014061407', '2014061408', '2014061409', '2014061410', '2014061411', '2014061412', '2014061413', '2014061414', '2014061415', '2014061416', '2014061417', '2014061418', '2014061419', '2014061420', '2014061421', '2014061422', '2014061423', '2014061500', '2014061501', '2014061502', '2014061503', '2014061504', '2014061505', '2014061506', '2014061507', '2014061508', '2014061509', '2014061510', '2014061511', '2014061512', '2014061513', '2014061514', '2014061515', '2014061516', '2014061517', '2014