In [179]:
from scipy.io import netcdf
import numpy as np
import matplotlib.pyplot as plt
from netCDF4 import Dataset
import pandas as pd
import matplotlib.pyplot as plt
import statistics

import matplotlib.style

import matplotlib

matplotlib.style.use('classic')

# Importing and working with data

In [180]:
import glob,os,sys
# Make the data directory the working directory so that the relative
# './<folder>/*<ext>' glob patterns built in read_files resolve against it.
# NOTE(review): absolute, machine-specific path -- this cell fails on any
# machine where the directory does not exist.
os.chdir('C:/Users/yashg/Documents/Cloud_Data_Files')
def read_files(extensions,location):
    """List the files in ``./<location>/`` whose names end with one of ``extensions``.

    Parameters
    ----------
    extensions : iterable of str
        File-name suffixes (e.g. ``'.cdf'``); each one is appended to a ``*``
        wildcard to form a glob pattern.
    location : str
        Folder name, relative to the current working directory.

    Returns
    -------
    list of str
        Matching paths, grouped by suffix in the order the suffixes were given.
    """
    return [
        path
        for suffix in extensions
        for path in glob.glob(f'./{location}/*{suffix}')
    ]

In [181]:
# One list of .cdf/.nc paths per instrument folder, in the fixed order
# KAZRARSCL, Microbase, Raman Lidar, Surface, Ext.
k,m,r,s,e=[
    read_files(['.cdf','.nc'],folder)
    for folder in ('KAZRARSCL','Microbase','Raman Lidar','Surface','Ext')
]
# Flat catalogue of every file, keeping the folder order above.
master=[*k,*m,*r,*s,*e]

In [182]:
def date_files(date,master=None):
    """Return the paths in ``master`` that contain ``date`` as a substring.

    Parameters
    ----------
    date : str
        Text searched for in each path (the match is a plain substring test,
        so it also matches inside folder names).
    master : list of str, optional
        Paths to search. When omitted, the notebook-level ``master`` list is
        looked up at call time, so re-running the file-listing cell is picked
        up without having to re-run this definition.

    Returns
    -------
    list of str
        Matching paths, in the order they appear in ``master``.

    Raises
    ------
    NameError
        If ``master`` is omitted and no notebook-level ``master`` exists.
    """
    if master is None:
        # Late lookup: a default of `master=master` would freeze whatever list
        # existed when this cell was executed.
        try:
            master=globals()['master']
        except KeyError:
            raise NameError("name 'master' is not defined; run the file-listing cell first") from None
    return [path for path in master if date in path]

def generate_cdfs(date):
    """Open every catalogued file whose path contains ``date`` as a netCDF ``Dataset``.

    Parameters
    ----------
    date : str
        Date text forwarded to ``date_files`` to select the files.

    Returns
    -------
    list
        One open ``Dataset`` per matching file, in the order ``date_files``
        returned the paths. The caller owns the datasets and is responsible
        for closing them.

    Notes
    -----
    The printed "File order" line describes the order obtained when the file
    catalogue is built as KAZRARSCL + Microbase + Raman Lidar + Surface + Ext
    and exactly one file per folder matches ``date``.
    """
    l=[Dataset(file) for file in date_files(date)]
    # Report the number of files actually opened rather than a fixed "5", so a
    # missing or duplicated file is visible before callers unpack five values.
    print(f'Output has {len(l)} files')
    print('File order is\t1.KAZRARSCL\t2.Microbase\t3.Raman Lidar\t4. Surface\t5. Ext')
    return l

In [183]:
# Day strings; they are matched as substrings of the data file paths.
date=[
    '20110505',
    '20110513',
    '20110514',
    '20110515',
    '20110519',
    '20110527',
    '20110528',
    '20110529',
    '20110601',
]

In [184]:
# Open the files for date[1] and build two summary tables:
#   dim -- per-file number of time and height steps
#   mat -- shapes of the arrays extracted from those files
l=generate_cdfs(date[1])
# Unpacking requires exactly five datasets, in the order printed by generate_cdfs.
k1,m1,r1,s1,e1=l
# NOTE: `r` previously held the Raman Lidar file list; from here on it is the
# list of row labels for the `dim` table.
r=['KAZRARSCL','Microbase','Raman Lidar','Surface','Ext']
# `v` is first the per-file description column for `dim`; it is rebound to the
# velocity array further down, after `d` has been built.
v=['Velocity,Reflectivity,Spectral Width',
  'Liquid Water Content',
  'Height,Time,Temperature',
   'CCN',
  'Extinction']
lt=[]
lh=[]
for i,f in enumerate(l):
    if i<4:
        # The first four files are read through their 'time' variable.
        lt.append(len(f['time'][:]))
        if i==3:
            # No height variable is read for the fourth file (Surface); record 0.
            lh.append(0)
        else:
            lh.append(len(f['height'][:]))
    else:
        # The fifth file (Ext) is read through 'time_offset' / 'height_high'.
        lt.append(len(f['time_offset'][:]))
        lh.append(len(f['height_high'][:]))
d={'Radar':r,'Time steps':lt,'Height steps':lh,'Extracted Variables':v}
dim=pd.DataFrame(d)
# Coordinate axes as plain ndarrays (np.ma.filled replaces masked entries with
# the array's fill value).
tk=np.ma.filled(k1['time'][:])
hk=np.ma.filled(k1['height'][:])
# NOTE: module-level `tm` and `hm` are read as globals by
# spatial_temporal_clustering.
tm=np.ma.filled(m1['time'][:])
hm=np.ma.filled(m1['height'][:])
tr=np.ma.filled(r1['time'][:])
# presumably converts km to m so the axis matches the radar heights -- TODO confirm
hr=np.ma.filled(r1['height'][:])*1000
ts=np.ma.filled(s1['time'][:])
te=np.ma.filled(e1['time_offset'][:])
# same *1000 scaling as `hr` -- presumably km to m, TODO confirm
he=np.ma.filled(e1['height_high'][:])*1000
# Data arrays; `v` now refers to the velocity array, not the description list.
v=np.ma.filled(k1['mean_doppler_velocity'][:])
ref=np.ma.filled(k1['reflectivity'][:])
sw=np.ma.filled(k1['spectral_width'][:])
lwc=np.ma.filled(m1['liquid_water_content'][:])
# presumably degrees Celsius to Kelvin -- TODO confirm the file's units
temp=np.ma.filled(r1['temperature'][:])+273.15
ccn=np.ma.filled(s1['N_CCN'][:])
ext=np.ma.filled(e1['extinction_merged_bscat'][:])
var=['Velocity','Reflectivity','Spectral_Width','Liquid_Water_Content','Temperature','CCN','Extinction']
shap=[v.shape,ref.shape,sw.shape,lwc.shape,temp.shape,ccn.shape,ext.shape]
mat={'Variable':var,'Shape':shap}
mat=pd.DataFrame(mat)

Output has 5 files
File order is	1.KAZRARSCL	2.Microbase	3.Raman Lidar	4. Surface	5. Ext


In [185]:
dim

Unnamed: 0,Radar,Time steps,Height steps,Extracted Variables
0,KAZRARSCL,21600,596,"Velocity,Reflectivity,Spectral Width"
1,Microbase,21600,596,Liquid Water Content
2,Raman Lidar,144,198,"Height,Time,Temperature"
3,Surface,1440,0,CCN
4,Ext,1440,1000,Extinction


In [186]:
mat

Unnamed: 0,Variable,Shape
0,Velocity,"(21600, 596)"
1,Reflectivity,"(21600, 596)"
2,Spectral_Width,"(21600, 596)"
3,Liquid_Water_Content,"(21600, 596)"
4,Temperature,"(144, 198)"
5,CCN,"(1440,)"
6,Extinction,"(1440, 1000)"


# Clustering Functions

In [187]:
#Extracting indices of extinction
def extinction(file,tr,hr,threshold=15,upper=100):
    """Find, for each profile in ``file``, the first sample strictly between ``threshold`` and ``upper``.

    Parameters
    ----------
    file : 2-D sequence
        One profile per row; each row is scanned from its first element.
    tr : sequence
        Only its length is used: it sizes the three output arrays.
    hr : sequence
        Value looked up at the position of the first qualifying sample.
    threshold, upper : number
        Exclusive lower and upper bounds a sample must satisfy.

    Returns
    -------
    (base_h, extin_h, ht) : tuple of numpy.ndarray
        For every row: the position of the first qualifying sample, the sample
        value, and ``hr`` at that position. Rows with no qualifying sample keep
        0 in all three arrays.
    """
    n_steps=len(tr)
    base_h=np.zeros(n_steps)
    extin_h=np.zeros(n_steps)
    ht=np.zeros(n_steps)
    for row in range(len(file)):
        # First (position, value) pair inside the open interval, if any.
        hit=next(
            ((level,value) for level,value in enumerate(file[row])
             if value>threshold and value<upper),
            None,
        )
        if hit is None:
            continue
        level,value=hit
        base_h[row]=level
        extin_h[row]=value
        ht[row]=hr[level]
    return base_h,extin_h,ht

In [188]:
def temporal_clustering(t1,t2):
    """Group the indices of ``t2`` by the consecutive intervals of ``t1``.

    t1 is the time axis of the lower resolved data whereas
    t2 is the time axis of the higher resolved data
    eg: t1-Raman Lidar
        t2-Microbase,KAZRARSCL

    Parameters
    ----------
    t1 : sequence of numbers
        Interval edges; interval ``i`` runs from ``t1[i]`` to ``t1[i+1]``.
    t2 : sequence of numbers
        Values to assign to the intervals. Expected to be a plain (unmasked)
        array or list; it does not need to be sorted.

    Returns
    -------
    list of list of int
        ``len(t1)-1`` lists (empty when ``t1`` has fewer than two entries).
        Entry ``i`` holds, in ascending order, every index ``j`` with
        ``t1[i] <= t2[j] <= t1[i+1]``. Both ends are inclusive, so a ``t2``
        value equal to an interior edge appears in both neighbouring lists.
    """
    fine=np.asarray(t2)
    t_index=[]
    for i in range(len(t1)-1):
        # One vectorised comparison per interval replaces the Python-level
        # scan over every element of t2.
        inside=(fine>=t1[i])&(fine<=t1[i+1])
        t_index.append(np.flatnonzero(inside).tolist())
    return t_index

In [189]:
def height_clustering(h1,h2):
    """Map every value of ``h1`` to the index of the closest entry in ``h2``.

    Parameters
    ----------
    h1 : iterable of numbers
        Values to look up.
    h2 : numpy.ndarray
        Grid searched for the nearest value.

    Returns
    -------
    list
        One index into ``h2`` per element of ``h1``; on an exact tie the
        lowest index wins (``numpy.argmin`` behaviour).
    """
    return [np.argmin(np.abs(level-h2)) for level in h1]

In [190]:
def time_clustering(t1,t2):
    """Return, for each entry of ``t1``, the index of the nearest entry of ``t2``.

    Parameters
    ----------
    t1 : iterable of numbers
        Values to look up.
    t2 : numpy.ndarray
        Axis searched for the nearest value.

    Returns
    -------
    list
        One index into ``t2`` per element of ``t1``; ties resolve to the
        lowest index (``numpy.argmin`` behaviour).
    """
    nearest=[]
    for stamp in t1:
        distance=np.abs(stamp-t2)
        nearest.append(np.argmin(distance))
    return nearest

In [191]:
def spatial_temporal_clustering(h1,h2,t1,t2,pars):
    """Collect the samples of ``pars`` that fall in each ``t1`` interval at the heights nearest to ``h1``.

    Parameters
    ----------
    h1 : iterable of numbers
        Target heights; each is matched to its nearest entry of ``h2``.
    h2 : numpy.ndarray
        Height axis of ``pars`` (second index).
    t1 : sequence of numbers
        Interval edges on the time axis.
    t2 : sequence of numbers
        Time axis of ``pars`` (first index).
    pars : 2-D array
        Data indexed as ``pars[time_index, height_index]``.

    Returns
    -------
    list
        ``result[w][n]`` is the list of ``pars[ii, j]`` for every time index
        ``ii`` in interval ``w`` and ``j`` the ``h2`` index nearest to the
        ``n``-th entry of ``h1``.
    """
    # Cluster against the axes passed in by the caller. Reading notebook-level
    # `tm`/`hm` here would apply one instrument's (and one date's) grid to
    # every call regardless of the arguments.
    targs=temporal_clustering(t1,t2)
    hargs=height_clustering(h1,h2)
    return [
        [[pars[ii,j] for ii in window] for j in hargs]
        for window in targs
    ]

In [192]:
def filtering(z1,z2,t1,t2,param):
    """Mean and standard deviation of ``param`` per ``t1`` interval, ignoring missing and zero samples.

    Parameters
    ----------
    z1, z2, t1, t2
        Height and time axes forwarded unchanged to
        ``spatial_temporal_clustering``.
    param
        Variable object exposing a ``missing_value`` attribute and supporting
        ``param[:]`` to read its data.

    Returns
    -------
    (avg, stdev) : tuple of list
        One entry per interval. Intervals with no usable sample, or whose mean
        is NaN, get 0 in both lists.
    """
    miss=param.missing_value
    pars=np.ma.filled(param[:])
    windows=spatial_temporal_clustering(z1,z2,t1,t2,pars)
    avg=[]
    stdev=[]
    for window in windows:
        values=np.array(window)
        # Boolean indexing flattens the window and drops the flagged samples.
        values=values[values!=miss]
        # NOTE(review): exact zeros are discarded as well -- confirm that 0 is
        # never a legitimate measurement for the variables passed in.
        values=values[values!=0]
        if values.size==0:
            # Nothing left: skip mean()/std(), which would emit an
            # empty-slice RuntimeWarning and return NaN.
            avg.append(0)
            stdev.append(0)
            continue
        mean=values.mean()
        if mean!=mean:
            # NaN is the only value unequal to itself (NaN samples in the data).
            avg.append(0)
            stdev.append(0)
        else:
            avg.append(mean)
            stdev.append(values.std())
    return avg,stdev

# Generalizing Data from Dates

In [193]:
def data_output(dat,low,up):
    """Build the per-time-step table for one date, restricted to a time window.

    Parameters
    ----------
    dat : str
        Date text used to select the five input files (see ``generate_cdfs``).
    low, up : number
        Inclusive bounds on the Ext ``time_offset`` axis; only rows whose time
        lies in ``[low, up]`` are returned.

    Returns
    -------
    pandas.DataFrame
        Columns: Date, Height, LWC, LWC_SD, Velocity, Velocity_SD,
        Spectral_Width, Spectral_Width_SD, Reflectivity, Reflectivity_SD,
        Temperature, Extinction, CCN. The index is reset to 0..n-1.

    Notes
    -----
    Row ``r`` pairs the statistics of the interval ``te[r] .. te[r+1]`` with
    the height, extinction, temperature and CCN taken at ``te[r+1]`` (hence
    the ``[1:]`` slices). The window test therefore uses ``te[r+1]``.
    NOTE(review): confirm this is the intended alignment for the window.
    """
    datasets=generate_cdfs(dat)
    try:
        k1,m1,r1,s1,e1=datasets
        # Coordinate axes of the radar products, the Raman lidar and the Ext file.
        tk=np.ma.filled(k1['time'][:])
        hk=np.ma.filled(k1['height'][:])
        tm=np.ma.filled(m1['time'][:])
        hm=np.ma.filled(m1['height'][:])
        tr=np.ma.filled(r1['time'][:])
        # presumably km to m, to match the radar height axes -- TODO confirm
        hr=np.ma.filled(r1['height'][:])*1000
        te=np.ma.filled(e1['time_offset'][:])
        he=np.ma.filled(e1['height_high'][:])*1000
        # presumably degrees Celsius to Kelvin -- TODO confirm
        temp=np.ma.filled(r1['temperature'][:])+273.15
        ccn=np.ma.filled(s1['N_CCN'][:])
        ext=np.ma.filled(e1['extinction_merged_bscat'][:])
        exr=r1['ext'][:]

        # First in-range extinction sample per Ext profile: value and height.
        _,ex,h=extinction(ext,te,he)
        # Same search on the Raman profiles; only the height index is needed.
        raman_base,_,_=extinction(exr,tr,hr)
        # Temperature at that height index, taken from the Raman profile
        # nearest in time to each Ext time step.
        tem=[temp[tt,int(raman_base[tt])] for tt in time_clustering(te,tr)]

        lwc1,dlwc1=filtering(he,hm,te,tm,m1['liquid_water_content'])
        v1,dv1=filtering(he,hk,te,tk,k1['mean_doppler_velocity'])
        sw1,dsw1=filtering(he,hk,te,tk,k1['spectral_width'])
        ref1,dref1=filtering(he,hk,te,tk,k1['reflectivity'])
    finally:
        # Everything needed is in memory by now; release the file handles.
        for ds in datasets:
            ds.close()

    table=pd.DataFrame({
        'Date':[dat]*len(lwc1),
        'Height':h[1:],
        'LWC':lwc1,
        'LWC_SD':dlwc1,
        'Velocity':v1,
        'Velocity_SD':dv1,
        'Spectral_Width':sw1,
        'Spectral_Width_SD':dsw1,
        'Reflectivity':ref1,
        'Reflectivity_SD':dref1,
        'Temperature':tem[1:],
        'Extinction':ex[1:],
        'CCN':ccn[1:],
    })
    # Apply the requested window to whole rows. Filtering per-column lists by
    # rebinding a loop variable has no effect on the lists themselves.
    in_window=(te[1:]>=low)&(te[1:]<=up)
    return table.loc[in_window].reset_index(drop=True)

In [194]:
# One table per analysed day; the two numbers are passed to data_output as
# its `low` and `up` arguments.
d13=data_output('20110513',30000,85000)
d14=data_output('20110514',10000,50000)
# The two 20110519 windows below are currently disabled.
#d191=data_output('20110519',0,15000)
#d192=data_output('20110519',35000,40000)
d29=data_output('20110529',500,1000)

Output has 5 files
File order is	1.KAZRARSCL	2.Microbase	3.Raman Lidar	4. Surface	5. Ext


  # This is added back by InteractiveShellApp.init_path()
  ret = ret.dtype.type(ret / rcount)


Output has 5 files
File order is	1.KAZRARSCL	2.Microbase	3.Raman Lidar	4. Surface	5. Ext
Output has 5 files
File order is	1.KAZRARSCL	2.Microbase	3.Raman Lidar	4. Surface	5. Ext


In [195]:
# Stack the per-day tables into one frame with a fresh 0..n-1 index and save
# it as CSV.
# NOTE(review): the output path is absolute and machine-specific; to_csv also
# writes the index as the first column unless index=False is passed.
frames=[d13,d14,d29]
data_combined=pd.concat(frames,ignore_index=True)
data_combined.to_csv('C:/Users/yashg/Documents/Cloud_Data_Files/data_from_ext.csv')