In [1]:
#this notebook is designed to access the unavco gps/gnss ftp system, and from the
#file list generated by their DAIv2 interface it will download all of those files,
#since their download manager would not function properly on any of my browsers.

#In addition to this, this notebook separates the measurements in the file by measurement type
#and maintains the RINEX text format
#there is also a diff test that compares the original files, at the specified column index, against my created files
#set the flag specifications (e.g. run_diff_test) in the configuration cell

#eventually, I plan to thread this as it takes some time for each yearly list
#at the moment, must be placed in directory above 'Desktop'

In [2]:
import os
import pandas as pd
import numpy as np
from ftplib import FTP
import shutil
import time

In [3]:
# ---- user selections ----
# Directory the downloaded/processed files are placed in; preferably one level
# away from where the finished zip archives should end up.
dirpath = "Downloads/testdir"
# Station name; also the prefix of every per-measurement output directory/zip.
savepath = 'p001'
# Extension given to the generated files (can be changed to match the year
# extension of the source files).
ext = '.csv'

# ---- required input ----
# File list exported from DAIv2; the first two lines are skipped.
paths = pd.read_fwf('2fileList.jsp.txt', header=None, skiprows=2)

# ---- constants ----
my_col_lab = [
    'year', 'month', 'day', 'hour', 'minute', 'second',
    'pressure', 'temp dry', 'relative humidity',
    'wind speed', 'wind direction', 'rain inc', 'hail inc',
]
run_diff_test = True
headerlist = {'col1': ['PR', 'TD', 'HR', 'WS', 'WD', 'RI', 'HI']}
headerdf = pd.DataFrame(data=headerlist)
ftp = FTP('data-out.unavco.org')
fnames = pd.DataFrame()
# The three lookups below are all keyed by the two-letter observation code and
# are derived from the single code list so they cannot drift apart:
#   code -> column label (the measurement columns start at position 6 of my_col_lab)
headercoldict = dict(zip(headerlist['col1'], my_col_lab[6:]))
#   code -> suffix appended to savepath for that measurement's directory
headerextdict = {code: '_' + code for code in headerlist['col1']}
#   code -> column position of that measurement in the original file
idxdict = {code: position for position, code in enumerate(headerlist['col1'], start=6)}
# Observation-type list as it appears in the source header line.
header_rep = '    PR    TD    HR    WS    WD    RI    HI'

In [4]:
#set up directories
#exist_ok=True makes this cell safe to re-run and removes the check-then-create race
os.makedirs(dirpath,exist_ok=True)

for state in headerextdict:
    #one output directory per observation type: <dirpath>/<savepath><suffix>
    os.makedirs(dirpath+"/"+savepath+headerextdict[state],exist_ok=True)

In [5]:
#log in without credentials: unavco's ftp is public, so no username/password is supplied
ftp.login()

'230 Any password will work'

In [6]:
#file name = everything from character 49 onward of each listed entry
#(must run while `paths` is still the DataFrame read from the file list)
fnames=paths[0].str[49:]

In [7]:
#directory part = characters 26..48 of each listed entry
#NOTE: this rebinds `paths` from the file-list DataFrame to a Series, so the cell is not re-runnable
#without re-reading the file list first
paths=paths[0].str[26:49]

In [8]:
def file_dance(thefname):
    """Clean up a generated file in place.

    Reads ``thefname + ext`` (``ext`` is the notebook-level extension setting),
    strips every double-quote character, and removes the first whitespace-only
    line that directly follows ``END OF HEADER``. Two padding widths are
    recognised; if neither is present an error message is printed and the file
    is still rewritten with the quotes removed.
    """
    target = thefname + ext
    with open(target) as source:
        cleaned = source.read().replace('"', '')  # remove '"'

    # padded variants of the extra header line, tried in this order (configure here)
    padded_endings = (
        "END OF HEADER\n                       \n",
        "END OF HEADER\n                        \n",
    )
    with open(target, "w") as sink:
        for padded in padded_endings:
            if padded in cleaned:
                cleaned = cleaned.replace(padded, "END OF HEADER\n", 1)
                break
        else:
            print('ERROR header lf replacement failed')
        sink.write(cleaned)
    return

In [9]:
def _split_header(lines,headerrem):
    """Build the header text for one observation type and count the header lines.

    lines: every line of the uncompressed source file.
    headerrem: two-letter observation code (a key of headerextdict) to keep.
    Returns (header_text, count); count is the number of lines consumed up to and
    including the ' END OF HEADER' line (or all lines if that marker never appears).
    """
    s=''
    count=0
    flg=False#becomes True once the '# / TYPES OF OBSERV' line has been seen
    for line in lines:
        if '# / TYPES OF OBSERV' in line:
            #rewrite the observation list so that it names only this one type
            line=line.replace('7','1')
            line=line.replace(header_rep,'    '+headerrem+'                                    ')
            flg=True
        #before the types line keep everything; from it onward keep only lines mentioning this type
        if not flg or headerrem in line:
            s+=line
        count=count+1
        if ' END OF HEADER' in line:
            s+=line
            break
    return s,count


def filemanip(filename,num,totsize):
    """Download one compressed source file and split it into one file per observation type.

    filename: name of the compressed file in the FTP server's current directory; it is
              downloaded into the current working directory and unpacked with gunzip.
    num:      position of this file in the download list (used for the progress countdown).
    totsize:  total number of files in the download list.

    For every code in headerextdict a file <savepath><suffix>/<filename[:-6]><ext> is written
    containing the reduced header followed by the timestamp columns and that one measurement
    column, then cleaned up with file_dance. Prints totsize-num as a countdown. Returns None.
    """
    import subprocess#function-scope import: only this function needs to launch gunzip

    with open(filename, "wb") as gFile:#open file, write binary to my copy
        ftp.retrbinary('RETR '+filename, gFile.write)
    #argument list instead of a shell string, so a listed file name is never interpreted by a shell
    unzip=subprocess.run(["gunzip","--",filename])
    if unzip.returncode!=0:#0 means success; report instead of silently ignoring
        print('WARNING gunzip returned '+str(unzip.returncode)+' for '+filename)
    tmpname=filename[:-2]#remove unix compress extension

    #read the source once; the header is rebuilt per observation type from these lines
    with open(tmpname, "r") as p:
        lines=p.readlines()

    df=None#parsed data table, identical for every observation type, so it is read only once
    for headerrem in headerextdict:
        s,count=_split_header(lines,headerrem)
        if df is None:
            df=pd.read_fwf(tmpname,header=None,sep=" ",skiprows=count,widths=[3,3,3,3,3,3,7,7,7,7,7,7,7])
            #spacing is fixed so this should work for all files discounting the string header
            df.columns=my_col_lab
            #rename columns for convenience

        #header text as the first row, followed by the timestamp columns and the selected measurement
        #(pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x)
        dff=pd.concat([pd.DataFrame({'string_values':[s]}),
                       df[['year','month','day','hour','minute','second',headercoldict[headerrem]]]],sort=False)
        dff.reset_index(inplace=True,drop=True)

        #spacing and removal of nan's, to get correct string
        dff=dff.fillna(7777)#this value is dummy and should hopefully be unique, to replace
        dff[['year','month','day','hour','minute','second']]=dff[['year','month','day','hour','minute','second']].astype(int)
        dff.replace(to_replace=7777,value='',inplace=True)#replace

        #deal with spacing, adds " in string conversion, lambda applied to each cell
        dff[['month','day','hour','minute','second']]=dff[['month','day','hour','minute','second']].applymap(lambda x:  ' '*(2-len(str(x)))+str(x) )
        dff[headercoldict[headerrem]]=dff[headercoldict[headerrem]].apply(lambda x: ' '*(6-len(str(x)))+str(x) )#adjust select column spacing here

        #save new file
        fname=filename[:-6]
        dff.to_csv(savepath+headerextdict[headerrem]+'/'+fname+ext,sep=" ",header=None,index=False)

        #remove '"' and rename for proper format
        file_dance(savepath+headerextdict[headerrem]+'/'+fname)
    #countdown uses the num argument (previously read a notebook-global loop variable by accident)
    print(str(totsize-num)+'    ',end='\r')
    return

In [10]:
#work inside the download directory from here on, so the relative savepath directories resolve there
#NOTE: dirpath is relative, so running this cell a second time in the same kernel looks for
#dirpath inside itself; restart the kernel (or chdir back) before re-running
os.chdir(dirpath)

In [11]:
start_time = time.time()
total_files = paths.size
# The index is deliberately called `x` and left at notebook scope, since it
# remains visible to code that runs during and after this loop.
for x in range(total_files):  # manipulate every file from every path
    ftp.cwd(paths[x])  # move into the directory that holds this file
    filemanip(fnames[x], x, total_files)
    ftp.cwd("/")  # back to the server root before the next entry
end_time = time.time()

elapsed = end_time - start_time
whole_seconds = int(elapsed)  # per-file figures are based on whole seconds
print('------------finished------------')
print("tot time: " + str(elapsed) + " seconds")
print('approx. time per source file ' + str(whole_seconds / total_files) + ' seconds')
# each source file yields 7 generated files (one per observation type)
print('approx. time per my file ' + str(whole_seconds / (total_files * 7)) + ' seconds')

------------finished------------
tot time: 269.24455189704895 seconds
approx. time per source file 0.736986301369863 seconds
approx. time per my file 0.10528375733855186 seconds


In [12]:
#when the countdown printed above hits 1 you are done with that station's measurements for that year

In [13]:
#sanity check: the reported directory should contain our created per-measurement directories
os.getcwd()

'/Users/nickvancise/Downloads/testdir'

In [14]:
#os.chdir("..")#if dir to zip is 1 lvl away
#os.getcwd()

In [15]:
# zip each per-measurement directory for download/upload; the archive is named
# after the directory it is built from
for headerrem in headerextdict:
    archive_dir = savepath + headerextdict[headerrem]
    shutil.make_archive(archive_dir, 'zip', archive_dir)
print('---zips created---')

---zips created---


In [16]:
def diff_test():
    """Compare every generated per-measurement file with its original source file.

    For each name in ``fnames`` the uncompressed original (name without its last
    two characters) is read as fixed-width data, then for every observation code
    in ``idxdict`` the matching generated file is read and compared against the
    six timestamp columns plus that code's column of the original. A running
    count of checked files is printed; for every mismatch the file pair and the
    differing rows are printed.

    Returns True when all comparisons matched, False otherwise.
    """
    checked = 0
    all_match = True
    expected_total = fnames.size * 7
    for name in fnames:
        source_name = name[:-2]
        # the following lines may need to be adjusted for different header lengths
        original = pd.read_fwf(source_name, skiprows=15, sep='+s/', header=None,
                               widths=[3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7])  # orig file
        split_name = source_name[:-4] + ext
        for code, column_pos in idxdict.items():
            checked += 1
            # the generated PR file carries one more header line than the others
            header_rows = 9 if code == 'PR' else 8
            mine = pd.read_fwf(savepath + headerextdict[code] + '/' + split_name,
                               skiprows=header_rows, sep='+s/', header=None,
                               widths=[3, 3, 3, 3, 3, 3, 8])  # my file
            line_end = '\n' if checked >= expected_total else '\r'
            print(str(checked) + '           ', end=line_end)
            # timestamp columns plus the one measurement column, relabelled 0..6
            expected = original[[0, 1, 2, 3, 4, 5, column_pos]]
            expected.columns = list(range(7))
            if expected.equals(mine):
                continue
            print("The lists are not identical " + headerextdict[code] + ": " + source_name + " " + split_name)  # col is zero indexed
            all_match = False
            # rows that appear in only one of the two frames
            problem_rows = pd.concat([expected, mine]).drop_duplicates(keep=False)
            print('problem rows')
            print(problem_rows)
    return all_match

In [17]:
#will print the number of files it has checked, and all the files with differences
#to their corresponding pair are listed, and check flag is set

In [18]:
#import sys
# optional verification pass, enabled by the run_diff_test flag
if run_diff_test:
    diff_start_time = time.time()
    b = diff_test()  # True only when every generated file matched its source
    diff_end_time = time.time()
    if b:
        print('all files are identical')
    print("diff time: " + str(diff_end_time - diff_start_time) + " seconds")

2555           
all files are identical
diff time: 13.378273010253906 seconds
