In [10]:
#This notebook takes in the monthly 6 minute co-ops nwlon station meteorological data
#and separates it into files based on measurement type, one for each measurement type per each monthly file.
#The resulting files will have an abbreviation of their measurement name attached to their file name before .csv

#To use this notebook, you must download the data from the NWLON site in 6-minute format, and place the name of one
#of the files in the testname variable (this only needs to be done once for each year), you may also want to check 
#that the name variable when it prints is the correct id of the station, with a * for glob. 
#After this a list of the files this notebook will split will be displayed along with a count of the files.
#Now you can run the notebook from top to bottom (run all) and it will generate all required files,
# where vis and relhum will most likely be empty except for a select few stations with these parameters.

#If the diff test fails, you will see a dataframe printout that includes the problem rows, so you can
#deduce what happened/what is different and make a correction.

#The file remove cell at the bottom is provided so that deleting unwanted files does not take a long time.
#To use it, set remove_flg=True; this will remove all generated files.
#Set both remove_flg and remove_all_flg to True to remove all files (generated and originals).
#****Remember if you use the remove files cell, to set both boolean variables false before you perform any other
#actions****

#This notebook is designed to operate with only the current station's files (generated and originals) in the 
#directory containing this notebook, to avoid problems with glob. This is why the file remove cell is included.

In [2]:
import numpy as np
import pandas as pd
import glob
import os

In [3]:
#Configuration: one sample file name for the station/year being processed.
testname='CO-OPS_8736897_from_20140101_to_20140131_met.csv'
#Glob pattern = first 14 characters of the sample name ('CO-OPS_' + station id) followed by '*'.
name='%s%s'%(testname[:14],'*')
print('name='+name)
#Maps each source column label (note the leading space) to the suffix that replaces
#'met.csv' in the generated file name.
col_to_extdict={
    ' WINDSPEED':'ws.csv',
    ' DIR':'wd.csv',
    ' GUSTS':'gs.csv',
    ' AT':'at.csv',
    ' BARO':'bp.csv',
    ' RELHUM':'rh.csv',
    ' VIS':'vs.csv',
}
#List every file matching the pattern, then report how many there are.
matched_files=glob.glob(name)
for fname in matched_files:
    print(fname)
count=len(matched_files)
print('count=%s'%(count,))

name=CO-OPS_8736897*
CO-OPS_8736897_from_20140801_to_20140831_met.csv
CO-OPS_8736897_from_20140401_to_20140430_met.csv
CO-OPS_8736897_from_20140101_to_20140131_met.csv
CO-OPS_8736897_from_20141001_to_20141031_met.csv
CO-OPS_8736897_from_20140201_to_20140228_met.csv
CO-OPS_8736897_from_20140701_to_20140730_met.csv
CO-OPS_8736897_from_20140601_to_20140630_met.csv
CO-OPS_8736897_from_20140301_to_20140331_met.csv
CO-OPS_8736897_from_20141201_to_20141231_met.csv
CO-OPS_8736897_from_20140501_to_20140531_met.csv
CO-OPS_8736897_from_20141101_to_20141130_met.csv
CO-OPS_8736897_from_20140901_to_20140930_met.csv
count=12


In [4]:
def split_fs(fname):
    """Write one CSV per measurement column of the file `fname`.

    `fname` is read with its first column as the index. For every column label
    listed in the module-level `col_to_extdict`, that column is written to a new
    file whose name is `fname` with its last 7 characters replaced by the mapped
    suffix (e.g. 'ws.csv').

    Returns None. Indexing a column that is absent from the file raises KeyError.
    """
    met_frame=pd.read_csv(fname, index_col=0, low_memory=False)
    prefix=fname[:-7]  #drop the trailing 7 characters (the length of 'met.csv')
    for colname, suffix in col_to_extdict.items():
        out_path='%s%s'%(prefix,suffix)
        #NOTE(review): header is passed a plain string, not a list of aliases; pandas
        #appears to treat it as "write a header" and keep the original column label
        #(leading space included) - confirm before relying on the stripped name.
        met_frame[colname].to_csv(out_path,header=colname[1:])

In [5]:
#Split every original monthly file into its per-measurement files.
#The glob pattern also matches files this notebook has already generated, and those
#hold a single measurement column, so they must not be passed to split_fs (it looks up
#every measurement column). Only names containing 'met.csv' are originals - the same
#check the diff test uses - which keeps this cell safe to re-run.
for lname in glob.glob(name):
    if 'met.csv' not in lname:
        continue
    split_fs(lname)

In [6]:
#diff test: re-read each original file and check that every generated
#per-measurement file holds exactly the same column.
flg=False  #set to True as soon as any generated file differs from its source column
#only originals keep the 'met.csv' ending; generated files replaced it, so skip them
original_paths=[path for path in glob.glob(name) if 'met.csv' in path]
for lname in original_paths:
    dftest=pd.read_csv(lname, index_col=0, low_memory=False)
    for col, ext in col_to_extdict.items():
        expected=dftest[[col]]
        myname='%s%s'%(lname[:-7],ext)
        actual=pd.read_csv(myname, index_col=0, low_memory=False)
        if expected.equals(actual):
            continue
        flg=True
        print('files are not identical\n%s and %s'%(lname,myname))
        print('problem rows')
        #rows that appear in only one of the two frames
        mismatched=pd.concat([expected,actual]).drop_duplicates(keep=False)
        print(mismatched)

if not flg:
    print('all files are the same')

all files are the same


In [8]:
#file remove cell for convenience
#remove_flg=True alone deletes only the generated files; setting remove_all_flg=True as
#well also deletes the original '*met.csv' files. Both default to False so that running
#the notebook top to bottom never deletes anything.
remove_flg=False
remove_all_flg=False

if remove_flg:
    #Use the notebook's glob pattern (`name`, derived from testname) rather than a
    #hard-coded station id, so this cell always targets the station being processed.
    for fname in glob.glob(name):
        #originals are the files whose name contains 'met.csv'; keep them unless
        #remove_all_flg is set
        if remove_all_flg or 'met.csv' not in fname:
            os.remove(fname)