In [1]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
import collections
from pathlib import Path

In [2]:
# Seed NumPy's global RNG so the np.random.shuffle-based folds built further
# down come out the same on every fresh run.
np.random.seed(0)

# Raw CSV exports. These absolute paths are machine-specific -- point them at
# your own copy of the files.
path_city = r'/data/fang/data/GloabalTemp/raw/GlobalLandTemperaturesByCity.csv'
path_major_city = r'/data/fang/data/GloabalTemp/raw/GlobalLandTemperaturesByMajorCity.csv'

# Column list kept for reference; none of the cells shown here read it.
# (earlier variant: ['dt', 'Latitude', 'Longitude', 'Country', 'AverageTemperature'])
mynames = [
    'dt',
    'Latitude',
    'Longitude',
    'AverageTemperature',
    'City',
    'Country',
    'MajorCity',
]

# First row is the header; keep pandas' default integer index.
df1 = pd.read_csv(filepath_or_buffer=path_city, header=0, index_col=None)
df2 = pd.read_csv(filepath_or_buffer=path_major_city, header=0, index_col=None)



In [10]:
# How many distinct city names each country contributes to the full table.
print('there are %d unique cities in United States' % len(df1.loc[df1['Country'] == 'United States', 'City'].unique()))
print('there are %d unique cities in China' % len(df1.loc[df1['Country'] == 'China', 'City'].unique()))

# Keep only the US rows that have no missing field. The explicit .copy()
# detaches the result from df1 so the column assignments below write to an
# independent frame instead of a possible view (SettingWithCopyWarning).
df_us = df1[df1['Country'] == 'United States'].dropna().copy()
print(df_us.head())

# Parse the date strings and keep the calendar year as the time coordinate.
df_us['dt'] = pd.to_datetime(df_us['dt'])
df_us['year'] = df_us['dt'].dt.year

# Yearly mean temperature per (year, Latitude, Longitude, City).
# The temperature column is selected explicitly: a bare .mean() over the whole
# frame also hits the object column 'Country', which pandas >= 2.0 rejects
# with a TypeError instead of silently dropping it. Selecting the column also
# makes dropping 'AverageTemperatureUncertainty' beforehand unnecessary.
df_us = (
    df_us
    .groupby(['year', 'Latitude', 'Longitude', 'City'])['AverageTemperature']
    .mean()
    .reset_index()
)
print(df_us.head())

# Coordinates arrive as strings such as '32.95N' / '100.53W'; strip the
# trailing hemisphere letter and parse the magnitude.
# NOTE(review): the hemisphere sign is discarded, so western longitudes stay
# positive. Later cells filter on positive 'lon' values, so this is left as-is;
# apply the sign here (and update those filters) if other hemispheres are added.
df_us['lat'] = df_us['Latitude'].str[:-1].astype(float)
df_us['lon'] = df_us['Longitude'].str[:-1].astype(float)
df_us = df_us.drop(columns=['Latitude', 'Longitude'])

# Optional z-score normalisation of the target (disabled):
# df_us['AverageTemperature'] = (df_us['AverageTemperature'] - df_us['AverageTemperature'].mean()) / df_us['AverageTemperature'].std()

# Celsius -> Fahrenheit.
df_us['AverageTemperature'] = df_us['AverageTemperature'] * 9/5 + 32

# Summary statistics of the cleaned frame.
print(df_us.describe())

there are 248 unique cities in United States
there are 371 unique cities in China
               dt  AverageTemperature  AverageTemperatureUncertainty     City  \
47555  1820-01-01               2.101                          3.217  Abilene   
47556  1820-02-01               6.926                          2.853  Abilene   
47557  1820-03-01              10.767                          2.395  Abilene   
47558  1820-04-01              17.989                          2.202  Abilene   
47559  1820-05-01              21.809                          2.036  Abilene   

             Country Latitude Longitude  
47555  United States   32.95N   100.53W  
47556  United States   32.95N   100.53W  
47557  United States   32.95N   100.53W  
47558  United States   32.95N   100.53W  
47559  United States   32.95N   100.53W  
   year Latitude Longitude              City  AverageTemperature
0  1743   28.13N    80.91W           Orlando              18.722
1  1743   28.13N    82.73W        Clearwater     

In [49]:
# Cities whose longitude magnitude lies strictly between 100 and 115 degrees.
# Both conditions are combined into one mask and applied once via .loc.
# Chaining two boolean selections (df[m1][m2]) applies a full-length mask to an
# already filtered frame, which makes pandas emit
# "Boolean Series key will be reindexed to match DataFrame index".
# Other windows that were tried: (76.85, 82) and (120, 130).
df_us.loc[(df_us['lon'] > 100) & (df_us['lon'] < 115), 'City'].unique()




  df_us[df_us['lon']<115][df_us['lon']>100]['City'].unique()


array(['Fort Collins', 'Aurora', 'Colorado Springs', 'Denver',
       'Highlands Ranch', 'Lakewood', 'Thornton', 'Westminster',
       'Abilene', 'Lubbock', 'Amarillo', 'Albuquerque', 'Pueblo',
       'Arvada', 'Salt Lake City', 'West Jordan', 'West Valley City',
       'El Paso', 'Provo', 'Nogales', 'Tucson', 'Chandler', 'Gilbert',
       'Glendale', 'Mesa', 'Peoria', 'Phoenix', 'Scottsdale', 'Tempe'],
      dtype=object)

In [38]:
# Distinct city names recorded at latitude 45.81 (exact float match).
df_us.loc[df_us['lat'] == 45.81, 'City'].unique()

array(['Minneapolis', 'Saint Paul', 'Portland', 'Vancouver'], dtype=object)

In [12]:
def make_continues_mode(df, mode, normalize=10):
    """Add rescaled and integer-coded versions of column ``mode`` to ``df``.

    ``df`` is modified in place: two columns are written,
    ``mode + '_CONTI'`` (min-max scaled values multiplied by ``normalize``) and
    ``mode + '_DISCT'`` (0-based rank of each distinct ``_CONTI`` value, as
    produced by ``unique_recoding``). The number of distinct codes is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column ``mode``; mutated in place.
    mode : str
        Name of the column to process.
    normalize : number, default 10
        Upper end of the rescaled range ``[0, normalize]``. A falsy value
        (``0``, ``None``, ``False``) skips rescaling and uses the raw values
        as the ``_CONTI`` column.

    Returns
    -------
    tuple of dict
        ``(CONTI_2_DISCT_dict, DISCT_2_CONTI_dict, NORMAL_2_RAW_dict,
        RAW_2_NORMAL_dict)``: rescaled value -> code, code -> rescaled value,
        rescaled value -> raw value, raw value -> rescaled value.
    """
    raw = df[mode]
    conti_col = mode + '_CONTI'
    disct_col = mode + '_DISCT'

    if normalize:
        lowest = raw.min()
        span = raw.max() - lowest
        offset = raw - lowest
        # A constant column has span == 0; map it to 0 instead of dividing by
        # zero and filling the column with NaN.
        unit = offset / span if span != 0 else offset.astype(float)
        df[conti_col] = normalize * unit
    else:
        # No rescaling requested: the raw values double as the "continuous"
        # coordinate, so the column and the mapping dicts always exist.
        df[conti_col] = raw

    # Build the raw <-> rescaled lookup from actual (raw, rescaled) row pairs.
    # Zipping two independently sorted unique arrays would misalign as soon as
    # two raw values collapse onto the same rescaled float.
    pairs = (
        pd.DataFrame({'raw': raw.to_numpy(), 'norm': df[conti_col].to_numpy()})
        .drop_duplicates(subset='raw')
        .sort_values('raw')
    )
    raw_values = pairs['raw'].to_numpy()
    norm_values = pairs['norm'].to_numpy()
    NORMAL_2_RAW_dict = dict(zip(norm_values, raw_values))
    RAW_2_NORMAL_dict = dict(zip(raw_values, norm_values))

    df[disct_col], CONTI_2_DISCT_dict, DISCT_2_CONTI_dict = unique_recoding(df[conti_col])
    print(mode, len(df[disct_col].unique()))

    return CONTI_2_DISCT_dict, DISCT_2_CONTI_dict, NORMAL_2_RAW_dict, RAW_2_NORMAL_dict

def unique_recoding(target_df):
    """Replace each value of a Series by the rank of that value among the
    Series' sorted distinct values.

    Returns a 3-tuple: the recoded Series (same index as the input), the
    ``value -> code`` dict and the inverse ``code -> value`` dict. Codes start
    at 0 and follow ascending value order.
    """
    ordered_values = np.sort(target_df.unique())

    # Position in the sorted array is the code; invert that for the forward map.
    DISCT_2_CONTI_dict = dict(enumerate(ordered_values))
    CONTI_2_DISCT_dict = {value: code for code, value in DISCT_2_CONTI_dict.items()}

    def to_code(value):
        return CONTI_2_DISCT_dict[value]

    new_column = target_df.apply(to_code)
    return new_column, CONTI_2_DISCT_dict, DISCT_2_CONTI_dict

# Per-mode lookup tables. Despite the "_list" suffix these are dicts keyed by
# the mode name ('lat', 'lon', 'year').
CONTI_2_DISCT_dict_list, DISCT_2_CONTI_dict_list = {}, {}
NORMAL_2_RAW_dict_list, RAW_2_NORMAL_dict_list = {}, {}

for mode in ('lat', 'lon', 'year'):
    # normalize=1 rescales each coordinate to [0, 1] before integer recoding;
    # the call also adds the <mode>_CONTI / <mode>_DISCT columns to df_us.
    (
        CONTI_2_DISCT_dict,
        DISCT_2_CONTI_dict,
        NORMAL_2_RAW_dict,
        RAW_2_NORMAL_dict,
    ) = make_continues_mode(df_us, mode, normalize=1)

    CONTI_2_DISCT_dict_list[mode] = CONTI_2_DISCT_dict
    DISCT_2_CONTI_dict_list[mode] = DISCT_2_CONTI_dict
    NORMAL_2_RAW_dict_list[mode] = NORMAL_2_RAW_dict
    RAW_2_NORMAL_dict_list[mode] = RAW_2_NORMAL_dict


lat 15
lon 95
year 267


In [13]:
def process_disct_data(modes_BASE,dims_BASE,target_pollute='AverageTemperature'):
    """Bin the rescaled coordinates of ``df_us`` and save five random 80/20 splits.

    For every mode in ``modes_BASE`` the column ``<mode>_CONTI`` of the
    module-level ``df_us`` is cut into the matching number of equal-width bins
    from ``dims_BASE`` and stored as integer bin codes in ``<mode>_BASE``
    (on a copy; ``df_us`` itself is not modified). Five train/test splits are
    then drawn with ``np.random.shuffle`` (so they depend on the global NumPy
    RNG state) and written to ``../US_temp/DISCT_<d1>x<d2>x....npy`` as a
    pickled dict ``{'ndims': ..., 'data': [fold, ...]}``.

    Parameters
    ----------
    modes_BASE : sequence of str
        Mode names; each needs a ``<mode>_CONTI`` column in ``df_us``.
    dims_BASE : sequence of int
        Number of bins per mode, aligned with ``modes_BASE``.
    target_pollute : str, default 'AverageTemperature'
        Name of the target column.

    Raises
    ------
    ValueError
        If ``modes_BASE`` and ``dims_BASE`` differ in length.
    """
    if len(modes_BASE) != len(dims_BASE):
        raise ValueError(
            'modes_BASE and dims_BASE must have the same length, got %d and %d'
            % (len(modes_BASE), len(dims_BASE))
        )

    df_base = df_us.copy()

    target_modes_BASE = []
    for target_mode, DISCT_dim in zip(modes_BASE, dims_BASE):
        base_col = target_mode + '_BASE'
        # pd.cut already yields a categorical; its codes are the bin indices.
        df_base[base_col] = pd.cut(df_base[target_mode + '_CONTI'], DISCT_dim).cat.codes
        target_modes_BASE.append(base_col)

    # Plain ints keep the printed / saved sizes independent of the small
    # integer dtype pandas picks for the category codes.
    ndims = [int(df_base[col].max()) + 1 for col in target_modes_BASE]

    N = len(df_base)
    print(ndims, ' N=',N)
    Ntr = int(N * 0.8)
    idx = np.arange(N)

    # Pull the arrays out once and index them by position. Indexing the Series
    # with an integer array is label-based and only matches row positions when
    # the frame happens to carry a fresh 0..N-1 index.
    y_all = df_base[target_pollute].to_numpy()
    ind_all = df_base[target_modes_BASE].to_numpy()

    folds = []
    for _ in range(5):
        # In-place shuffle of the same array each round, as before, so the
        # sequence of draws from the global RNG is unchanged.
        np.random.shuffle(idx)
        tr_idx, te_idx = idx[:Ntr], idx[Ntr:]

        folds.append({
            'tr_ind': ind_all[tr_idx, :],
            'tr_y': y_all[tr_idx],
            'te_ind': ind_all[te_idx, :],
            'te_y': y_all[te_idx],
            'ndims':ndims,
        })

    data = {'ndims': ndims, 'data': folds}
    ndim_str = "x".join([str(i) for i in ndims])

    print()

    out_dir = Path('../US_temp/')
    out_dir.mkdir(parents=True, exist_ok=True)

    file_name = 'DISCT_' + ndim_str+'.npy'

    # `data` is a dict, so np.save pickles it; load with allow_pickle=True.
    np.save(out_dir / file_name, data)



In [15]:
# Baseline dataset: coordinates discretised on a fixed grid.
# The grid sizes equal the distinct-value counts printed by the recoding cell
# (lat 15, lon 95, year 267).
modes_BASE = ['lat', 'lon', 'year']
dims_BASE = [15, 95, 267]

process_disct_data(modes_BASE=modes_BASE, dims_BASE=dims_BASE)

[15, 95, 267]  N= 55944



In [9]:
'''agg-version'''
# NOTE(review): everything below is commented out, so running this cell only
# evaluates the string literal above. The column names it mentions
# ('PM2.5', 'PM10', 'TEMP', 'PRES', 'time') are not columns of df_us --
# presumably left over from a different dataset; confirm before re-enabling.

# discretized data with different granularity, use for baseline 
# target_pollute = 'PM2.5'
# modes_BASE = ['TEMP','PRES','time']
# dims_BASE = [50,50,50]
# # dims_BASE = [428,501,1461] # full

# # target_pollute_list = ['PM2.5','PM10','SO2']
# target_pollute_list = ['PM10']

# dims_BASE_list = [[50,50,150],[100,100,300],[300,300,1000],[428,501,1461] ]

# for target_pollute in target_pollute_list:
#     for dims_BASE in dims_BASE_list:
        
#         process_disct_data(modes_BASE,dims_BASE,target_pollute)



[50, 50, 150]  N= 7455

[100, 100, 300]  N= 10320

[300, 300, 1000]  N= 11746

[428, 501, 1461]  N= 11954



In [16]:
# Snapshot of the prepared frame; process_conti_data reads this module-level
# `df`, so re-run this line after changing df_us.
df = df_us.copy()
def process_conti_data(target_pollute,target_modes):
    """Save five random 80/20 splits with both integer-coded and rescaled indices.

    Reads the module-level ``df`` and the per-mode lookup tables
    (``CONTI_2_DISCT_dict_list`` and friends). For each fold the
    ``<mode>_DISCT`` and ``<mode>_CONTI`` columns of the selected rows are
    stored next to the target values, together with the codes of every mode
    that do not occur in the training part. Splits are drawn with
    ``np.random.shuffle`` and therefore depend on the global NumPy RNG state.
    The result is written to ``../US_temp/CONTI_<d1>x<d2>x....npy`` as a
    pickled dict and the file name is printed.

    Parameters
    ----------
    target_pollute : str
        Name of the target column in ``df``.
    target_modes : sequence of str
        Mode names; each needs ``<mode>_DISCT`` / ``<mode>_CONTI`` columns in
        ``df`` and an entry in the four lookup-table dicts.
    """
    modes_DISCT = [mode+'_DISCT' for mode in target_modes]
    modes_CONTI = [mode+'_CONTI' for mode in target_modes]

    CONTI_2_DISCT_dicts = [CONTI_2_DISCT_dict_list[mode] for mode in target_modes]
    DISCT_2_CONTI_dicts = [DISCT_2_CONTI_dict_list[mode] for mode in target_modes]
    NORMAL_2_RAW_dicts = [NORMAL_2_RAW_dict_list[mode] for mode in target_modes]
    RAW_2_NORMAL_dicts = [RAW_2_NORMAL_dict_list[mode] for mode in target_modes]

    # Plain ints so the sizes print and pickle the same on every NumPy version.
    ndims = [int(df[mode].max()) + 1 for mode in modes_DISCT]
    print('ndims:',ndims)

    N = len(df)
    Ntr = int(N * 0.8)
    idx = np.arange(N)

    # Pull the arrays out once and index them by position. Indexing the Series
    # with an integer array is label-based and only matches row positions when
    # the frame happens to carry a fresh 0..N-1 index.
    y_all = df[target_pollute].to_numpy()
    ind_DISCT_all = df[modes_DISCT].to_numpy()
    ind_CONTI_all = df[modes_CONTI].to_numpy()

    # All known codes per mode; identical for every fold, so build them once.
    all_codes = [set(code_table.keys()) for code_table in DISCT_2_CONTI_dicts]

    folds = []
    for _ in range(5):
        # In-place shuffle of the same array each round, as before, so the
        # sequence of draws from the global RNG is unchanged.
        np.random.shuffle(idx)
        tr_idx, te_idx = idx[:Ntr], idx[Ntr:]

        tr_ind_DISCT = ind_DISCT_all[tr_idx, :]
        tr_ind_CONTI = ind_CONTI_all[tr_idx, :]
        te_ind_DISCT = ind_DISCT_all[te_idx, :]
        te_ind_CONTI = ind_CONTI_all[te_idx, :]

        # Codes of each mode that the training part never contains
        # (sorted so the stored lists are deterministic).
        never_seen_test_idx = [
            sorted(all_codes[mode] - set(np.unique(tr_ind_DISCT[:, mode])))
            for mode in range(len(target_modes))
        ]

        folds.append({
            'tr_ind_DISCT': tr_ind_DISCT,
            'tr_ind_CONTI': tr_ind_CONTI,
            'tr_y': y_all[tr_idx],
            'te_ind_DISCT': te_ind_DISCT,
            'te_ind_CONTI': te_ind_CONTI,
            'te_y': y_all[te_idx],
            'ndims':ndims,
            'never_seen_test_idx':never_seen_test_idx
        })

    data = {'ndims': ndims, 'data': folds,'CONTI_2_DISCT_dicts':CONTI_2_DISCT_dicts, 'DISCT_2_CONTI_dicts':DISCT_2_CONTI_dicts,
    'NORMAL_2_RAW_dicts':NORMAL_2_RAW_dicts,
    'RAW_2_NORMAL_dicts':RAW_2_NORMAL_dicts}
    ndim_str = "x".join([str(i) for i in ndims])

    out_dir = Path('../US_temp/')
    out_dir.mkdir(parents=True, exist_ok=True)

    file_name = 'CONTI_' + ndim_str +'.npy'

    print(file_name)
    # `data` is a dict, so np.save pickles it; load with allow_pickle=True.
    np.save(out_dir / file_name, data)


In [17]:
# Build and save the dataset that keeps both the integer-coded and the
# rescaled coordinates, using temperature as the target.
target_pollute = 'AverageTemperature'
target_modes = ['lat', 'lon', 'year']

process_conti_data(target_pollute=target_pollute, target_modes=target_modes)

ndims: [15, 95, 267]
CONTI_15x95x267.npy
