This notebook contains the methods of reading several common type of data format in atmospheric research 

In [28]:
%matplotlib inline

# .nc

## use Dataset:

In [5]:
from netCDF4 import Dataset
fname = ('/sn3/wyu/grid_out/mls_t_theta_2005_2016_monthly.nc')
f = Dataset(fname)
lon = f.variables['Lon'][:]
temp = f.variables['Temp'][:]


## use xarray

In [6]:
import xarray as xr, numpy as np, pandas as pd

In [11]:
fname = ('/sn3/wyu/grid_out/mls_t_theta_2005_2016_monthly.nc')
f = xr.open_dataset(fname)
temp = f.Temp
lon = f.Lon

In [18]:
# average or choose a slice
temp_mean = temp.mean(dim='Lon')
temp_slice = temp.sel(Lon=slice(0,180))

In [14]:
# change the time dimension to a smarter mode:
f['Time'] = pd.date_range('2005-05-01',periods=len(f['Time']),freq='M')

In [26]:
# so you can choose a slice of time regarding to year, month, or even season
temp = f.Temp
temp_season = temp.groupby('Time.season').mean(dim='Time')
temp_year = temp.groupby('Time.year').mean(dim='Time')
temp_month = temp.groupby('Time.month').mean(dim='Time')

  return np.nanmean(a, axis=axis, dtype=dtype)


In [12]:
# define a new array in xarray
da = xr.DataArray([9,1,5,3,2],dims=['lat'],coords={'lat':[10,20,30,40,50]})

## save .nc files

In [None]:
fout = '/sn3/wyu/grid_out/testfile.nc'
f2 = Dataset(fout,'w')
Lat = f2.createDimension('Lat',181)

Lat = f2.createVariable('Lat','f',('Lat',))
Lat.units = "degree"
Lat.long_name = "Latitude"
Lat[:] = np.arange(181)-90

Test_var = f2.createVariable('Test_var','f',('Lat',))
Test_var.long_name = "test variable"
Test_var[:] = np.arange(181)


# .sav 

In [56]:
from scipy.io import readsav
file_name = '/sn3/wyu/ttl_traj_out/190519_s100_i100_ERAi_MLS_6hrly_back_201108_NA/traj_s100_i100_2011_3120_I106901.sav'
f = readsav(file_name)

the output f is a dict containing all the information in the sav file 

# .dat

In [57]:
import pickle
file_name = '/ice3/hao/TTL/pfister_conv/freq_z_th_avg_2x2.dat'
f=open(file_name, 'rb')
data = pickle.load(f,encoding='latin1')           # pay attention to the encoding when loading 

the output data is a dict containing all the information in the sav file 

# .he5

In [69]:
import h5py
filename = '/co2/hao/download_mls/raw_data/h2o/MLS-Aura_L2GP-H2O_v04_2011d001.he5'
var = 'H2O'
f = h5py.File(filename)
loc = f.require_group('/HDFEOS/SWATHS/'+var+'/Geolocation Fields')
lon = loc['Longitude'][:]
lat = loc['Latitude'][:]
pre = loc['Pressure'][:]
t   = loc['Time'][:]

var1  = f.require_group('/HDFEOS/SWATHS/'+var+'/Data Fields')
value = var1['L2gpValue'][:]
miss  = var1['L2gpValue'].attrs['MissingValue']
prec  = var1['L2gpPrecision'][:]
qual  = var1['Quality'][:]
conv  = var1['Convergence'][:]
stat  = var1['Status'][:]

# .txt or other similar format

## use pandas

In [163]:
import pandas as pd
data_name = '/sn2/wyu/ACE/ACE-FTS_L2_v3.5-6_ASC/ss78725v3.6tangrid.asc'
data = pd.read_csv(data_name,sep=' +',header=9,index_col=None)

  This is separate from the ipykernel package so we can avoid doing imports until


you must be very careful and examine if it is read in the correct way, usually examine first five lines

In [171]:
data.head(5)

Unnamed: 0,z,T,T_fit,P,(atm),dens,H2O,H2O_err,O3,O3_err,...,CFC113,CFC113_err,HCFC141b,HCFC141b_err,HCFC142b,HCFC142b_err,HFC134a,HFC134a_err,CO2,CO2_err
0,5.1,231.9,0,0.49,1.55e+19,9.1e-05,5.49e-05,1.07e-07,2.49e-08,3.43e-07,...,-999.0,-999.0,-999.0,2.98e-11,1.19e-11,-999.0,-999.0,0.000396,2.7e-05,
1,6.2,230.09,0,0.416,1.33e+19,3.5e-05,2.18e-05,1.12e-07,1.06e-08,3.04e-07,...,-999.0,-999.0,-999.0,2.26e-11,4.82e-12,-999.0,-999.0,0.000392,1.1e-05,
2,7.5,227.53,0,0.342,1.1e+19,1.7e-05,2.43e-05,3.01e-07,1.41e-08,3.15e-07,...,8.6e-12,5.92e-11,4.19e-11,2.11e-11,6.37e-12,-999.0,-999.0,0.000395,1.4e-05,
3,8.8,226.07,0,0.281,9.11e+18,8e-06,5.29e-07,4.5e-07,1e-08,3.01e-07,...,5.75e-12,4.54e-12,2.66e-11,1.75e-11,4.26e-12,-999.0,-999.0,0.000392,1e-05,
4,10.1,227.14,0,0.229,7.41e+18,5e-06,5.13e-07,7.38e-07,9.18e-09,2.85e-07,...,6.16e-12,3.55e-11,2.16e-11,2.3e-11,4.25e-12,-999.0,-999.0,0.000394,1e-05,


## use build-in function 

In [1]:
data_name = '/sn2/wyu/ACE/ACE-FTS_L2_v3.5-6_ASC/ss78725v3.6tangrid.asc'
header = 12
data = open(data_name)
read_data = data.read().split('\n')
data_len = len(read_data)-header
data_column = len(read_data[header].split())
f = np.zeros([data_column,data_len])
for i in range(data_len):
    arr = [float(i) for i in read_data[i+header].split()]
    if len(arr):
        f[:,i] = arr
column_name = read_data[header-1].split()

NameError: name 'np' is not defined

In [120]:
# wrap in a function 
'''
function of reading data

input:
data_name: name of the file
header: number of header rows (column name is included) 

output:
f: 2-d data content
column_name: the variable name of each column
Wandi Yu
04.27.2016
'''
def read_data(data_name,header):
    data = open(data_name)
    read_data = data.read().split('\n')
    data_len = len(read_data)-header
    data_column = len(read_data[header].split())
    f = np.zeros([data_column,data_len])
    for i in range(data_len):
        arr = [float(i) for i in read_data[i+header].split()]
        if len(arr):
            f[:,i] = arr
    column_name = read_data[header-1].split()
    return f,column_name

In [121]:
# example: 
data_name = '/sn2/wyu/ACE/ACE-FTS_L2_v3.5-6_ASC/ss78725v3.6tangrid.asc'
data,name = read_data(fname,12)