In [4]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
import collections
from pathlib import Path

In [5]:
# Seed NumPy's legacy global RNG; the np.random.shuffle calls that build the
# train/test folds later in the notebook draw from this stream.
np.random.seed(0)

# NOTE(review): absolute, machine-specific directory holding one CSV per file.
path = r'/data/fang/data/Beijing/4x6x12x6/PRSA_Data_20130301-20170228' # use your path

# glob.glob returns matches in arbitrary (filesystem-dependent) order; sorting
# makes the row order of the concatenated frame identical on every machine.
all_files = sorted(glob.glob(path + "/*.csv"))
if not all_files:
    # Without this guard pd.concat([]) fails with "No objects to concatenate",
    # which hides the real cause (wrong or empty data directory).
    raise FileNotFoundError('no CSV files found in ' + path)

# One DataFrame per CSV file, first row used as the header.
li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

frame = pd.concat(li, axis=0, ignore_index=True)

# Keep only the columns used downstream and drop every row with a missing value.
df = frame[['year', 'month', 'day', 'hour', 'TEMP', 'PRES', 'DEWP', 'RAIN', 'wd', 'WSPM', 'station', 'PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3']]
df = df.dropna()
# print(df.head())

# Collapse the rows of each (year, month, day, station) group into one row:
# numeric columns are averaged; 'wd' is reduced by concatenating every string
# in the group and keeping the single most frequent *character*.
df = df.groupby(['year', 'month', 'day', 'station']).agg({
    'TEMP': 'mean',
    'PRES': 'mean',
    'DEWP': 'mean',
    'RAIN': 'mean',
    'wd': lambda x: collections.Counter(''.join(x)).most_common(2)[0][0],
    'WSPM': 'mean',
    'PM2.5': 'mean',
    'PM10': 'mean',
    'SO2': 'mean',
    'NO2': 'mean',
    'CO': 'mean',
    'O3': 'mean',
}).reset_index()
print(df.head())

   year  month  day       station      TEMP         PRES       DEWP  RAIN wd  \
0  2013      3    1  Aotizhongxin  1.391667  1026.875000 -18.745833   0.0  N   
1  2013      3    1     Changping  0.812500  1023.858333 -19.583333   0.0  N   
2  2013      3    1      Dingling -0.950000  1024.025000 -19.625000   0.0  N   
3  2013      3    1        Dongsi  1.728571  1029.152381 -20.980952   0.0  N   
4  2013      3    1      Guanyuan  1.391667  1026.875000 -18.745833   0.0  N   

       WSPM     PM2.5       PM10        SO2        NO2          CO         O3  
0  3.254167  7.125000  10.750000  11.708333  22.583333  429.166667  63.875000  
1  2.133333  5.083333  18.958333  16.041667  15.333333  387.500000  77.791667  
2  2.337500  6.250000   7.250000   3.000000   2.625000  212.500000  78.875000  
3  3.142857  6.714286  10.666667   8.714286  29.571429  409.523810  72.857143  
4  3.254167  7.541667  11.666667   8.500000  28.500000  400.000000  63.166667  


In [6]:
# Print "<max> <min>" for each daily-averaged weather column, one line per column.
# The earlier version of this cell tagged TEMP with "# 4" and the other columns
# with "# 3"; the meaning of those numbers is not evident from the code -- TODO confirm.
for weather_col in ('TEMP', 'RAIN', 'PRES', 'DEWP', 'WSPM'):
    daily_values = df[weather_col]
    print(daily_values.max(), daily_values.min())
# print(df['PM2.5'].max(), df['PM2.5'].min())
# print(df.shape)

# Build a timestamp from the (year, month, day) columns, cast it to int64 and
# floor-divide by 1e9 (epoch seconds, assuming nanosecond resolution -- TODO
# confirm for the pandas version in use), then z-score it.
day_stamp = pd.to_datetime(df[['year', 'month', 'day']]).astype(np.int64) // 10 ** 9
df['time'] = (day_stamp - day_stamp.mean()) / (day_stamp.std())

# Number of distinct days present after standardisation.
print(len(df['time'].unique()))

# Z-score every pollutant column in place (overwrites the raw values in df).
for pollutant_col in ('PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3'):
    raw_values = df[pollutant_col]
    df[pollutant_col] = (raw_values - raw_values.mean()) / raw_values.std()

36.98571428571428 -15.691304347826087
11.108333333333334 0.0
1041.0249999999999 985.6
27.616666666666664 -33.020833333333336
7.2375 0.0
1461


In [7]:
def make_continues_mode(df, mode, decimals=1, normalize=10):
    """Add a scaled continuous column and an integer-coded column for ``mode``.

    The column ``df[mode]`` is rounded to ``decimals`` places and written back
    to ``df`` (in place) as two new columns:

    * ``mode + '_CONTI'`` -- the rounded values min-max scaled to
      ``[0, normalize]``; when ``normalize`` is falsy (``0``/``None``) the
      rounded values are stored unscaled.
    * ``mode + '_DISCT'`` -- the rank of each ``_CONTI`` value among the sorted
      unique ``_CONTI`` values, as produced by ``unique_recoding``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing column ``mode``; it is modified in place.
    mode : str
        Name of the source column.
    decimals : int, default 1
        Rounding precision applied before scaling.
    normalize : number, default 10
        Upper bound of the scaled range; a falsy value disables scaling.

    Returns
    -------
    (dict, dict)
        ``CONTI_2_DISCT_dict`` (value -> code) and ``DISCT_2_CONTI_dict``
        (code -> value) as returned by ``unique_recoding``.
    """
    target_df = df[mode].round(decimals)

    if normalize:
        lowest = target_df.min()
        span = target_df.max() - lowest
        if span == 0:
            # Constant column: the min-max formula would be 0/0 = NaN, and NaN
            # keys cannot be looked up again in unique_recoding. Map to 0.
            conti_values = target_df - lowest
        else:
            # Same evaluation order as before: (normalize * shifted) / span.
            conti_values = normalize * (target_df - lowest) / span
    else:
        # Previously this branch left mode+'_CONTI' undefined and the lookup
        # below raised KeyError; fall back to the rounded, unscaled values.
        conti_values = target_df
    df[mode+'_CONTI'] = conti_values

    df[mode+'_DISCT'],CONTI_2_DISCT_dict,DISCT_2_CONTI_dict = unique_recoding(df[mode+'_CONTI'])
    print('decimal: %d '%(decimals),mode,len(df[mode+'_DISCT'].unique()))

    return CONTI_2_DISCT_dict,DISCT_2_CONTI_dict

def unique_recoding(target_df):
    """Replace every value of a Series by its rank among the sorted unique values.

    Parameters
    ----------
    target_df : pandas.Series
        Values to recode.

    Returns
    -------
    new_column : pandas.Series
        Same index as ``target_df``; each entry is the integer code of the
        original value.
    CONTI_2_DISCT_dict : dict
        Maps each unique value to its code (0 for the smallest value).
    DISCT_2_CONTI_dict : dict
        Inverse mapping, code -> unique value.
    """
    sorted_values = np.sort(target_df.unique())

    # code -> value first, then invert it to obtain value -> code.
    DISCT_2_CONTI_dict = dict(enumerate(sorted_values))
    CONTI_2_DISCT_dict = {value: code for code, value in DISCT_2_CONTI_dict.items()}

    def to_code(value):
        # Raises KeyError for a value that is not among the unique keys.
        return CONTI_2_DISCT_dict[value]

    new_column = target_df.apply(to_code)
    return new_column, CONTI_2_DISCT_dict, DISCT_2_CONTI_dict

# Per-mode lookup tables, keyed by column name:
#   CONTI_2_DISCT_dict_list[mode]: scaled value -> integer code
#   DISCT_2_CONTI_dict_list[mode]: integer code -> scaled value
CONTI_2_DISCT_dict_list = {}
DISCT_2_CONTI_dict_list = {}

# The weather columns are rounded to 1 decimal, the standardised 'time' column
# to 3 decimals; every column is rescaled to [0, 10]. Each call adds the
# <mode>_CONTI and <mode>_DISCT columns to df and prints the number of codes.
for mode, n_decimals in [('TEMP', 1), ('PRES', 1), ('RAIN', 1), ('DEWP', 1), ('time', 3)]:
    CONTI_2_DISCT_dict, DISCT_2_CONTI_dict = make_continues_mode(df, mode, decimals=n_decimals, normalize=10)
    CONTI_2_DISCT_dict_list[mode] = CONTI_2_DISCT_dict
    DISCT_2_CONTI_dict_list[mode] = DISCT_2_CONTI_dict

print(len(df))

decimal: 1  TEMP 428
decimal: 1  PRES 501
decimal: 1  RAIN 46
decimal: 1  DEWP 558
decimal: 3  time 1461
16965


In [10]:
def process_disct_data(modes_BASE,dims_BASE,target_pollute):
    """Bin the continuous modes, draw 5 random 80/20 splits and save them.

    Works on a copy of the notebook-global ``df``. For every mode in
    ``modes_BASE`` the column ``<mode>_CONTI`` is cut into the matching number
    of equal-width bins from ``dims_BASE`` (``pd.cut``) and the bin codes are
    stored as ``<mode>_BASE``. Five train/test splits are then drawn by
    repeatedly shuffling one index array with NumPy's global RNG, and the
    result is written with ``np.save`` to
    ``../beijing/beijing_<target_pollute>/DISCT_<d1>x<d2>x....npy``.

    Parameters
    ----------
    modes_BASE : list of str
        Column prefixes; ``<mode>_CONTI`` must already exist in ``df``.
    dims_BASE : list of int
        Number of bins per mode, same length as ``modes_BASE``.
    target_pollute : str
        Name of the column in ``df`` used as the regression target.

    Raises
    ------
    ValueError
        If ``modes_BASE`` and ``dims_BASE`` differ in length.

    Notes
    -----
    The saved object is a dict ``{'ndims': ..., 'data': [fold, ...]}``; since it
    is a Python object, loading it back needs ``np.load(..., allow_pickle=True)``.
    """
    if len(modes_BASE) != len(dims_BASE):
        # Previously surfaced as an unrelated IndexError / KeyError further down.
        raise ValueError('modes_BASE and dims_BASE must have the same length')

    df_base = df.copy()

    # df_DISCT = df.groupby(['year', 'month', 'day', 'station']).agg({'TEMP':'mean', 'PRES':'mean', 'DEWP':'mean', 'RAIN':'mean', 'wd':lambda x: collections.Counter(''.join(x)).most_common(2)[0][0] + collections.Counter(''.join(x)).most_common(2)[1][0], 'WSPM':'mean', 'PM2.5':'mean'}).reset_index()

    for target_mode, DISCT_dim in zip(modes_BASE, dims_BASE):
        # Equal-width binning; .cat.codes turns each interval into its bin index.
        df_base[target_mode + '_BASE'] = pd.cut( df_base[target_mode +'_CONTI'], DISCT_dim).astype('category').cat.codes

    target_modes_BASE = [ item + '_BASE' for item in modes_BASE]

    # Aggregated ("agg") variant, currently disabled: one row per occupied cell.
    # df_base = df_base.groupby(target_modes_BASE).agg({target_pollute:'mean'}).reset_index()

    ndims = [df_base[mode].max()+1 for mode in target_modes_BASE]

    # Extract plain arrays once. Indexing these is purely positional, whereas
    # Series[int_array] is label-based and only matched positions while the
    # frame carried a default RangeIndex.
    all_y = df_base[target_pollute].values
    all_ind = df_base[target_modes_BASE].values

    N = len(df_base)
    print(ndims, ' N=',N)
    Ntr = int(N * 0.8)
    idx = np.arange(N)
    folds = []

    for _ in range(5):
        # In-place shuffle of the same array on every fold (kept as-is so the
        # random stream, and hence the generated splits, stay unchanged).
        np.random.shuffle(idx)
        tr_idx = idx[:Ntr]
        te_idx = idx[Ntr:]

        # Fancy indexing copies, so later shuffles cannot alter stored folds.
        folds.append({
            'tr_ind': all_ind[tr_idx,:],
            'tr_y': all_y[tr_idx],
            'te_ind': all_ind[te_idx,:],
            'te_y': all_y[te_idx],
            'ndims':ndims,
        })

    data = {'ndims': ndims, 'data': folds}
    ndim_str = "x".join([str(i) for i in ndims])

    print()

    # dict_name = '../beijing/'+'_'.join(modes_BASE)+'_'+target_pollute+'/'
    dict_name = '../beijing/'+'beijing_'+target_pollute+'/'

    Path(dict_name).mkdir(parents=True, exist_ok=True)

    # file_name = 'DISCT_' + ndim_str+'_no_agg'+'.npy'
    file_name = 'DISCT_' + ndim_str+'.npy'

    # print(file_name)
    np.save(dict_name + file_name, data)

In [11]:
# discretized data with different granularity, use for baseline 
# Baseline datasets: the same three modes binned at several granularities.
modes_BASE = ['TEMP','PRES','time']

# target_pollute_list = ['PM2.5','PM10','SO2']
target_pollute_list = ['PM10']

# Bins per mode (TEMP, PRES, time); the last entry is the full resolution.
dims_BASE_list = [[50,50,150],[100,100,300],[300,300,1000],[428,501,1461] ]

# Every (pollutant, granularity) pair, pollutant-major; one file is written per pair.
run_grid = [(pollutant, dims) for pollutant in target_pollute_list for dims in dims_BASE_list]
for target_pollute, dims_BASE in run_grid:
    process_disct_data(modes_BASE,dims_BASE,target_pollute)

[50, 50, 150]  N= 16965

[100, 100, 300]  N= 16965

[300, 300, 1000]  N= 16965

[428, 501, 1461]  N= 16965



In [9]:
'''agg-version'''

# NOTE(review): every statement below is commented out, so running this cell
# does not call process_disct_data. The N values in the output saved under this
# cell (7455 ... 11954) are smaller than the N= 16965 printed by the active
# run; presumably they were produced while a groupby-on-bin-codes aggregation
# inside process_disct_data was enabled -- TODO confirm, then delete this cell
# or turn the aggregation into a function parameter.

# discretized data with different granularity, use for baseline 
# target_pollute = 'PM2.5'
# modes_BASE = ['TEMP','PRES','time']
# dims_BASE = [50,50,50]
# # dims_BASE = [428,501,1461] # full

# # target_pollute_list = ['PM2.5','PM10','SO2']
# target_pollute_list = ['PM10']

# dims_BASE_list = [[50,50,150],[100,100,300],[300,300,1000],[428,501,1461] ]

# for target_pollute in target_pollute_list:
#     for dims_BASE in dims_BASE_list:
        
#         process_disct_data(modes_BASE,dims_BASE,target_pollute)



[50, 50, 150]  N= 7455

[100, 100, 300]  N= 10320

[300, 300, 1000]  N= 11746

[428, 501, 1461]  N= 11954



In [7]:
def process_conti_data(target_pollute,target_modes):
    """Draw 5 random 80/20 splits of the full-resolution data and save them.

    Reads the notebook-globals ``df``, ``CONTI_2_DISCT_dict_list`` and
    ``DISCT_2_CONTI_dict_list``. For each fold the integer-coded
    (``<mode>_DISCT``) and scaled continuous (``<mode>_CONTI``) coordinates are
    stored together with the target values, plus, per mode, the codes that do
    not occur in that fold's training part. The result is written with
    ``np.save`` to
    ``../beijing/beijing_<target_pollute>/CONTI_<d1>x<d2>x....npy``.

    Parameters
    ----------
    target_pollute : str
        Name of the column in ``df`` used as the regression target.
    target_modes : list of str
        Column prefixes; ``<mode>_DISCT`` and ``<mode>_CONTI`` must exist in
        ``df`` and ``mode`` must be a key of both global dict lists.

    Notes
    -----
    The saved object is a dict, so loading it back needs
    ``np.load(..., allow_pickle=True)``.
    """
    modes_DISCT = [mode+'_DISCT' for mode in target_modes]
    modes_CONTI = [mode+'_CONTI' for mode in target_modes]

    CONTI_2_DISCT_dicts = [CONTI_2_DISCT_dict_list[mode] for mode in target_modes]
    DISCT_2_CONTI_dicts = [DISCT_2_CONTI_dict_list[mode] for mode in target_modes]

    # Codes start at 0, so the size of each mode is its largest code + 1.
    ndims = [df[mode].max()+1 for mode in modes_DISCT]
    print('ndims:',ndims)

    # Extract plain arrays once. Indexing these is purely positional, whereas
    # Series[int_array] is label-based and only matched positions while the
    # frame carried a default RangeIndex.
    all_y = df[target_pollute].values
    all_ind_DISCT = df[modes_DISCT].values
    all_ind_CONTI = df[modes_CONTI].values

    N = len(df)
    Ntr = int(N * 0.8)
    idx = np.arange(N)
    folds = []
    for _ in range(5):
        # In-place shuffle of the same array on every fold (kept as-is so the
        # random stream, and hence the generated splits, stay unchanged).
        np.random.shuffle(idx)
        tr_idx = idx[:Ntr]
        te_idx = idx[Ntr:]

        # Fancy indexing copies, so later shuffles cannot alter stored folds.
        tr_ind_DISCT = all_ind_DISCT[tr_idx,:]
        tr_ind_CONTI = all_ind_CONTI[tr_idx,:]
        tr_y = all_y[tr_idx]

        te_ind_DISCT = all_ind_DISCT[te_idx,:]
        te_ind_CONTI = all_ind_CONTI[te_idx,:]
        te_y = all_y[te_idx]

        # Per mode: every known code that is absent from this fold's training
        # coordinates (it can therefore only show up in the test part).
        never_seen_test_idx = []
        for mode_pos, code_to_value in enumerate(DISCT_2_CONTI_dicts):
            train_set = set(np.unique(tr_ind_DISCT[:,mode_pos]))
            full_set = set(code_to_value.keys())
            never_seen_test_idx.append(list(full_set.difference(train_set)))

        folds.append({
            'tr_ind_DISCT': tr_ind_DISCT,
            'tr_ind_CONTI': tr_ind_CONTI,
            'tr_y': tr_y,
            'te_ind_DISCT': te_ind_DISCT,
            'te_ind_CONTI': te_ind_CONTI,
            'te_y': te_y,
            'ndims':ndims,
            'never_seen_test_idx':never_seen_test_idx
        })

    data = {'ndims': ndims, 'data': folds,'CONTI_2_DISCT_dicts':CONTI_2_DISCT_dicts, 'DISCT_2_CONTI_dicts':DISCT_2_CONTI_dicts}
    ndim_str = "x".join([str(i) for i in ndims])

    # dict_name = '../beijing/'+'_'.join(target_modes)+'_'+target_pollute+'/'
    dict_name = '../beijing/'+'beijing_'+target_pollute+'/'

    Path(dict_name).mkdir(parents=True, exist_ok=True)

    file_name = 'CONTI_' + ndim_str +'.npy'

    # file_name = 'conti_beijing_15k_'+'_'.join(target_modes)+'_'+target_pollute+'_'+ndim_str+'.npy'
    print(file_name)
    np.save(dict_name + file_name, data)


In [8]:
# Modes that form the coordinates of the full-resolution (continuous) dataset.
target_modes = ['TEMP','PRES','time']

# Pollutants to export; one CONTI_*.npy file is written per entry.
# target_pollute_list = ['PM2.5','PM10','NO2']
target_pollute_list = ['PM10']

for target_pollute in target_pollute_list:
    process_conti_data(target_pollute, target_modes)

ndims: [428, 501, 1461]
CONTI_428x501x1461.npy


In [25]:
# NOTE(review): leftover scratch cell -- str() with no arguments evaluates to ''
# and the result is not assigned; it looks safe to delete.
str()

''