In [2]:
######################## This notebook collects everything related to building the grid dataset ########################

In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# from spatial import MultipleGrid  ### Use this when MultipleGrid class is not directly being used
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans

In [60]:
def _cluster_coords(coordinates: pd.DataFrame, method: str, n_clusters: int):
    if "x" not in coordinates.columns:
        raise Exception("'x' must be in the input coordinate columns.")
    if "y" not in coordinates.columns:
        raise Exception("'y' must be in the input coordinate columns.")
    if method not in ["GaussianMixture", "KMeans"]:
        raise Exception("Inappropriate method.")

    np.random.seed(1000)
    unique_coors = coordinates.drop_duplicates()
    if method == 'GaussianMixture':
        cluster_model = GaussianMixture(n_components=n_clusters).fit(unique_coors)
    elif method == "KMeans":
        cluster_model = KMeans(n_clusters=n_clusters).fit(unique_coors)
    coor_pred = cluster_model.predict(unique_coors)
    whole_pred = cluster_model.predict(coordinates)
    coor_clusters = unique_coors.copy()
    coor_clusters["cluster id"] = coor_pred
    whole_df = pd.DataFrame(whole_pred, index=coordinates.index, columns=["cluster id"])
    return whole_df, coor_clusters
    
def _split_in_cluster(coordinates: pd.DataFrame):
    if "x" not in coordinates.columns:
        raise Exception("'x' must be in the input coordinate columns.")
    if "y" not in coordinates.columns:
        raise Exception("'y' must be in the input coordinate columns.")

    _, coor_nums = np.unique(coordinates, axis=0, return_counts=True)
    large_num_coors = coordinates[coor_nums>=np.percentile(coor_nums,60)]
    train_idx = np.random.choice(large_num_coors.index, size=len(coor_nums)//10)
    train_coors = coordinates.loc[train_idx]
    test_coors = coordinates.drop(train_idx)
    return train_coors, test_coors

def _split_train_test(xy_cluster: pd.DataFrame):
    """Build per-cluster train/test splits from de-duplicated coordinates.

    For every cluster id, the distinct coordinates of that cluster are split
    by ``_split_in_cluster`` into an in-cluster train part and a test part;
    the distinct coordinates of all other clusters form the out-of-cluster
    train part.

    Args:
        xy_cluster: Frame with the columns ``'x'``, ``'y'`` and
            ``'cluster id'``.

    Returns:
        ``(all_split_grids, all_split_data_id)``. Both are dicts keyed by
        cluster id with the keys ``'train_in_cluster'``,
        ``'train_out_cluster'`` and ``'test_cluster'``. The first holds
        coordinate frames, the second holds the index labels of the rows of
        ``xy_cluster`` located at those coordinates.

    Raises:
        Exception: if ``xy_cluster`` is not exactly a ``pd.DataFrame``.
    """
    if type(xy_cluster) is not pd.DataFrame:
        raise Exception("xy_cluster data is not pd.DataFrame.")

    # Resets numpy's global generator so the in-cluster sampling is repeatable.
    np.random.seed(1000)
    unique_xy_cluster = xy_cluster.drop_duplicates()
    row_pairs = pd.MultiIndex.from_frame(xy_cluster[["x", "y"]])

    def _ids_at(coords):
        # Match on the (x, y) *pair*. An element-wise np.isin would accept a
        # row whose x equals any listed value and whose y equals any listed
        # value, even if that exact pair is not in `coords`.
        if len(coords) == 0:
            return xy_cluster.index[:0]
        wanted_pairs = pd.MultiIndex.from_frame(coords[["x", "y"]])
        return xy_cluster.index[row_pairs.isin(wanted_pairs)]

    all_split_grids, all_split_data_id = {}, {}
    for cluster in np.sort(pd.unique(unique_xy_cluster["cluster id"])):
        in_this_cluster = unique_xy_cluster["cluster id"] == cluster
        cluster_xy = unique_xy_cluster.loc[in_this_cluster, ["x", "y"]]
        out_cluster_xy = unique_xy_cluster.loc[~in_this_cluster, ["x", "y"]].drop_duplicates()
        cluster_train, cluster_test = _split_in_cluster(cluster_xy)
        all_split_grids[cluster] = {
            "train_in_cluster": cluster_train,
            "train_out_cluster": out_cluster_xy,
            "test_cluster": cluster_test,
        }
        all_split_data_id[cluster] = {
            "train_in_cluster": _ids_at(cluster_train),
            "train_out_cluster": _ids_at(out_cluster_xy),
            "test_cluster": _ids_at(cluster_test),
        }
    return all_split_grids, all_split_data_id

def _split_train_test_nouq(xy_cluster: pd.DataFrame):
    if type(xy_cluster) is not pd.DataFrame:
        raise Exception("xy_cluster data is not pd.DataFrame.")

    np.random.seed(1000)
#     unique_xy_cluster = xy_cluster.drop_duplicates()
    all_split_grids, all_split_data_id = {}, {}
    for cluster in np.sort(pd.unique(xy_cluster["cluster id"])):
        cluster_xy = xy_cluster.loc[xy_cluster["cluster id"] == cluster, ["x","y"]]
        out_cluster_xy = xy_cluster.loc[xy_cluster["cluster id"] != cluster, ["x","y"]]
        
#         cluster_train, cluster_test = _split_in_cluster(cluster_xy)
        all_split_grids[cluster] = {
            "train_in_cluster":cluster_xy,
            "train_out_cluster":out_cluster_xy,
        }
        all_split_data_id[cluster] = {
            "train_in_cluster":xy_cluster.index[np.isin(xy_cluster[["x","y"]], cluster_xy).min(axis=1)],
            "train_out_cluster":xy_cluster.index[np.isin(xy_cluster[["x","y"]], out_cluster_xy).min(axis=1)],
        }
    return all_split_grids, all_split_data_id

class SingleGrid:
    """Clusters and splits a data set that has one row per grid cell.

    Attributes are plain configuration: ``cluster_method`` is forwarded to
    ``_cluster_coords`` together with ``cluster_num``.
    """

    def __init__(self, cluster_method="GaussianMixture", cluster_num=10):
        self.cluster_method = cluster_method
        self.cluster_num = cluster_num

    @staticmethod
    def _require(condition, message):
        # Tiny guard helper: raise the given message when a precondition fails.
        if not condition:
            raise Exception(message)

    def cluster_grids(self, input_dt: pd.DataFrame, target_dt: pd.Series):
        """Validate the inputs and cluster the ``'x'``/``'y'`` columns.

        ``target_dt`` is only used for validation (type and index equality).
        Returns whatever ``_cluster_coords`` returns for the ``['x', 'y']``
        columns of ``input_dt``.
        """
        self._require(type(input_dt) is pd.DataFrame, "Input data type is not pd.DataFrame.")
        self._require(type(target_dt) is pd.Series, "Target data type is not pd.Series.")
        self._require(input_dt.index.equals(target_dt.index), "Input and Output indexes are not equal.")
        self._require("x" in input_dt.columns, "'x' must be in the input data columns.")
        self._require("y" in input_dt.columns, "'y' must be in the input data columns.")

        xy_only = input_dt[["x", "y"]]
        return _cluster_coords(xy_only, self.cluster_method, self.cluster_num)

    def split_train_test(self, input_dt: pd.DataFrame, whole_cluster: pd.DataFrame):
        """Join coordinates with their cluster ids and build per-cluster splits.

        The ``['x', 'y']`` columns of ``input_dt`` are joined (on the index)
        with ``whole_cluster`` and handed to ``_split_train_test_nouq``.
        """
        self._require(type(input_dt) is pd.DataFrame, "Input data type is not pd.DataFrame.")
        self._require(type(whole_cluster) is pd.DataFrame, "Whole Cluster data type is not pd.DataFrame.")

        labelled_xy = input_dt[["x", "y"]].join(whole_cluster)
#         return _split_train_test(labelled_xy)
        return _split_train_test_nouq(labelled_xy)
        
class MultipleGrid(SingleGrid):
    """Grid handler for samples that carry a ``grid_scale x grid_scale`` patch.

    Every sample is one row of a 2-D numpy array with
    ``len(tag_names) * grid_scale**2`` columns. The slicing in
    ``extract_center_data`` treats the columns as consecutive groups of
    ``len(tag_names)`` values, one group per cell of the patch.
    """

    def __init__(self, grid_scale, cluster_method="GaussianMixture", cluster_num=10):
        super().__init__(cluster_method, cluster_num)
        self.grid_scale = grid_scale

    def extract_center_data(self, tag_names: list, input_dt: np.ndarray):
        """Return the column group of the patch's middle cell.

        Args:
            tag_names: Names of the per-cell features.
            input_dt: 2-D array with ``len(tag_names) * grid_scale**2`` columns.

        Returns:
            Array with one row per sample and ``len(tag_names)`` columns.

        Raises:
            Exception: on a wrong input type or column count.
        """
        if type(input_dt) is not np.ndarray:
            raise Exception("Input data type is not np.array.")
        if input_dt.shape[1] != (len(tag_names)*(self.grid_scale**2)):
            raise Exception("Input data column number is inappropriate.")

        # Middle cell of the flattened patch (exact centre for odd grid_scale).
        center_cell_id = (self.grid_scale**2)//2
        center_dt = input_dt[:,center_cell_id*len(tag_names):(center_cell_id+1)*len(tag_names)]
        return center_dt

    def cluster_grids(self, tag_names: list, input_dt: np.ndarray, target_dt: pd.Series):
        """Cluster the samples by the coordinates of their centre cell.

        The centre-cell features are wrapped in a frame whose columns are
        ``tag_names`` (which therefore must include ``'x'`` and ``'y'``) and
        passed to ``SingleGrid.cluster_grids``.

        Raises:
            Exception: on wrong types, column count or length mismatch.
        """
        if type(input_dt) is not np.ndarray:
            raise Exception("Input data type is not np.array.")
        if input_dt.shape[1] != (len(tag_names)*(self.grid_scale**2)):
            raise Exception("Input data column number is inappropriate.")
        if type(target_dt) is not pd.Series:
            raise Exception("Target data type is not pd.Series.")
        if len(input_dt) != len(target_dt):
            raise Exception("The length of input data and target data are not equal.")

        center_dt = self.extract_center_data(tag_names, input_dt)
        center_frame = pd.DataFrame(center_dt, columns=tag_names, index=target_dt.index)
        return super().cluster_grids(center_frame, target_dt)

    def split_train_test(self, tag_names: list, input_dt: np.ndarray, whole_cluster: pd.DataFrame):
        """Build per-cluster splits from the centre-cell coordinates.

        The centre-cell features are indexed like ``whole_cluster`` and passed
        to ``SingleGrid.split_train_test``.

        Raises:
            Exception: on wrong types or column count.
        """
        if type(input_dt) is not np.ndarray:
            raise Exception("Input data type is not np.array.")
        if input_dt.shape[1] != (len(tag_names)*(self.grid_scale**2)):
            raise Exception("Input data column number is inappropriate.")
        if type(whole_cluster) is not pd.DataFrame:
            raise Exception("Whole Cluster data type is not pd.DataFrame.")

        center_dt = self.extract_center_data(tag_names, input_dt)
        center_frame = pd.DataFrame(center_dt, columns=tag_names, index=whole_cluster.index)
        return super().split_train_test(center_frame, whole_cluster)

    def generate_grid(self, index_list: list, whole_grid: np.ndarray, whole_label: np.ndarray,
                      tag_names: list = None):
        """Select the given rows from the grid array and the label array.

        Args:
            index_list: Row positions to take (along axis 0).
            whole_grid: 2-D array of all samples.
            whole_label: Labels aligned with ``whole_grid`` along axis 0.
            tag_names: Optional per-cell feature names. When given, the column
                count must equal ``len(tag_names) * grid_scale**2``. When
                omitted, the column count only has to be a multiple of
                ``grid_scale**2``, so the method no longer depends on a
                module-level ``tag_names`` variable.

        Returns:
            ``(new_grid, new_label)``: the selected rows of both arrays.

        Raises:
            Exception: on a wrong grid type or column count.
        """
        if type(whole_grid) is not np.ndarray:
            raise Exception("Grid data type is not np.array.")

        cells_per_patch = self.grid_scale**2
        if tag_names is not None:
            bad_width = whole_grid.shape[1] != (len(tag_names)*cells_per_patch)
        else:
            bad_width = whole_grid.shape[1] % cells_per_patch != 0
        if bad_width:
            raise Exception("Grid data column number is inappropriate.")

        axis_val = 0
        new_grid = np.take(whole_grid, index_list, axis_val)
        new_label = np.take(whole_label, index_list, axis_val)
        return new_grid, new_label

In [81]:
def _create_tags(tag_num):
    first_tags = ['x', 'y']
    rest_tags = [f'tag{x}' for x in range(tag_num-2)]
    return first_tags + rest_tags

def _get_train_test(cluster_train_test, cluster_id):
    set_dt = cluster_train_test[cluster_id]
    train_in = set_dt['train_in_cluster']
    train_out = set_dt['train_out_cluster']
    test_dt = set_dt['test_cluster']
    return train_out, train_in, test_dt

def _get_target_source(cluster_train_test, cluster_id):
    set_dt = cluster_train_test[cluster_id]
    train_in = set_dt['train_in_cluster']
    train_out = set_dt['train_out_cluster']
    return train_out, train_in


def _plot_train_test(cluster_train_test, cluster_id):
    """Scatter the three splits of one cluster and save the figure to disk.

    Requires the keys ``'train_in_cluster'``, ``'train_out_cluster'`` and
    ``'test_cluster'`` in ``cluster_train_test[cluster_id]``; each value must
    offer ``'x'`` and ``'y'``. The current pyplot figure is cleared afterwards.
    """
    split = cluster_train_test[cluster_id]
    in_cluster = split['train_in_cluster']
    out_cluster = split['train_out_cluster']
    held_out = split['test_cluster']

    # Drawing order matters: the in-cluster train points are drawn last.
    layers = (
        (out_cluster, 3, 'b', 'out-of-cluster train data'),
        (held_out, 3, 'r', 'test data'),
        (in_cluster, 5, 'g', 'in-cluster train data'),
    )
    for points, marker_size, colour, caption in layers:
        plt.scatter(points['x'], points['y'], marker_size, colour, label=caption)
    plt.legend()
    # plt.show()
    plt.savefig(f'US_data/us-2011/clusters/cluster_{cluster_id}/cluster{cluster_id}train-test set')
    plt.cla()
    plt.clf()

def _plot_grid_clusters(coor_cluster):
    """Scatter all coordinates coloured by ``'cluster id'`` and save the figure.

    The figure is written to ``'coordinate cluster'`` and the current pyplot
    figure is cleared afterwards.
    """
    xs, ys, cluster_ids = (coor_cluster[column] for column in ('x', 'y', 'cluster id'))
    plt.scatter(xs, ys, 3, cluster_ids)
    plt.savefig('coordinate cluster')
    plt.cla()
    plt.clf()

### splitting the target dataset such that only k monitoring stations are included for transfer learning.
def create_TL_targetsource_split(mn_df, train_test_grids, x_tr_blended, y_tr_blended, cluster_no,
                                 n_stations=10, split_idx=1, grid=None):
    """Write source / target / test grid arrays for one transfer-learning split.

    The in-cluster rows of ``cluster_no`` are divided by location:
    ``n_stations`` distinct ``(x, y)`` locations are sampled as the target
    set, and the remaining locations of the cluster become the test set. The
    out-of-cluster rows are the source set. For each set, the grid rows and
    labels are selected with ``grid.generate_grid`` and saved as ``.npy``
    files under
    ``US_data/us-2011/target_california_grid_split/split-{split_idx}/``.

    Args:
        mn_df: Unused; kept so existing calls keep working.
        train_test_grids: Dict as returned by the ``split_train_test``
            methods, keyed by cluster id.
        x_tr_blended: Array of all grid samples.
        y_tr_blended: Labels aligned with ``x_tr_blended``.
        cluster_no: Cluster id used as the target region.
        n_stations: Number of distinct locations sampled for the target set.
        split_idx: Number used in the ``split-{split_idx}`` output directory.
        grid: Object providing ``generate_grid(index_list, whole_grid,
            whole_label)``. Defaults to the module-level ``multi_grid``.

    Raises:
        ValueError: from ``DataFrame.sample`` when the cluster has fewer than
            ``n_stations`` distinct locations.
    """
    import os

    if grid is None:
        # Backward compatible: fall back to the notebook-level instance.
        grid = multi_grid

    source_cluster, target_cluster = _get_target_source(train_test_grids, cluster_no)

    # One row per distinct location; sampling uses numpy's global generator.
    stations = target_cluster.drop_duplicates(subset=["x", "y"])
    target_stations = stations.sample(n=n_stations)
    test_stations = stations.loc[~stations.index.isin(target_stations.index)]

    row_pairs = pd.MultiIndex.from_frame(target_cluster[["x", "y"]])

    def _rows_at(station_frame):
        # Boolean selection keeps the original index labels. DataFrame.merge
        # would renumber the rows 0..n-1, and those numbers would then be used
        # as row positions into the full arrays.
        if len(station_frame) == 0:
            return target_cluster.iloc[:0]
        wanted_pairs = pd.MultiIndex.from_frame(station_frame[["x", "y"]])
        return target_cluster[row_pairs.isin(wanted_pairs)]

    out_root = f'US_data/us-2011/target_california_grid_split/split-{split_idx}'
    parts = (
        ('target', _rows_at(target_stations)),
        ('test', _rows_at(test_stations)),
        ('source', source_cluster),
    )
    for part_name, rows in parts:
        row_ids = rows.index.values.tolist()
        part_grid, part_label = grid.generate_grid(row_ids, x_tr_blended, y_tr_blended)

        out_dir = f'{out_root}/{part_name}_grid_cal'
        os.makedirs(out_dir, exist_ok=True)
        # Saving by path lets numpy open *and close* the file.
        np.save(f'{out_dir}/{part_name}_cal.npy', part_grid)
        np.save(f'{out_dir}/{part_name}_cal_label.npy', part_label)
    

In [82]:
if __name__=='__main__':
    # Driver: load the blended 5x5 grid data, cluster the samples by the
    # coordinates of their centre cell, and write the source/target/test split
    # for cluster 2.
    data_path = 'US_data/BigUS/v10_170713_5x5_include_na_dataset.npz'
    label_path = "US_data/BigUS/v10_170713_5x5_include_na_label.npz"
    # Directory spelled 'US_data' like every other path in this notebook; the
    # former 'US_Data' spelling only resolves on case-insensitive filesystems.
    monitoring_df = pd.read_csv("US_data/BigUS/us_monitoring.csv")

    # Context managers close the .npz archives once the arrays are read.
    with np.load(data_path) as data_npz:
        x_tr_blended = data_npz['arr_0']
    with np.load(label_path) as label_npz:
        y_tr_blended = label_npz['arr_0']
    print(x_tr_blended[0].shape)

    # `tag_names` and `multi_grid` are also read as globals by other cells of
    # this notebook, so keep these names.
    tag_names = _create_tags(28)
    multi_grid = MultipleGrid(5, "KMeans")
    whole_cluster, coor_cluster = multi_grid.cluster_grids(tag_names, x_tr_blended, pd.Series(y_tr_blended))

#     _plot_grid_clusters(coor_cluster) ### plotting the clusters (target-source: train and test)
    train_test_grids, train_test_data_id = multi_grid.split_train_test(tag_names, x_tr_blended, whole_cluster)
    create_TL_targetsource_split(monitoring_df, train_test_grids, x_tr_blended, y_tr_blended, 2)
    
    ##### get source-target-test clusters #####
# #     source_cluster, target_cluster, test_cluster = _get_train_test(train_test_grids, 2)
#     source_cluster, target_cluster = _get_target_source(train_test_grids, 2)
#     print(target_cluster.shape)
    
#     target_cluster_list = target_cluster.index.values.tolist()
#     target_grid, target_label = multi_grid.generate_grid(target_cluster_list, x_tr_blended, y_tr_blended)
#     print(target_grid.shape)
    
# #     test_cluster_list = test_cluster.index.values.tolist()
# #     test_grid = multi_grid.generate_grid(test_cluster_list, x_tr_blended)
# #     print(test_grid.shape)
    
#     source_cluster_list = source_cluster.index.values.tolist()
#     source_grid, source_label = multi_grid.generate_grid(source_cluster_list, x_tr_blended, y_tr_blended)
#     print(source_grid.shape)

        
        ############################
        
#         test_cluster_list = test_cluster.index.values.tolist()
#         test_grid, test_label = multi_grid.generate_grid(test_cluster_list, x_tr_blended, y_tr_blended)
        
#         ftest_grid = open(f'US_data/us-2011/grids/grid_{i}/test_{i}.npy','wb')
#         np.save(ftest_grid, test_grid)
#         ftest_label = open(f'US_data/us-2011/grids/grid_{i}/test_{i}_label.npy','wb')
#         np.save(ftest_label, test_label)
        
#         ftest_cluster = open(f'US_data/us-2011/clusters/cluster_{i}/test_{i}.npy','wb')
#         np.save(ftest_cluster, test_grid)
#         ftest_label = open(f'US_data/us-2011/clusters/cluster_{i}/test_{i}_label.npy','wb')
#         np.save(ftest_label, test_label)
#         print(test_grid.shape)
#         _plot_train_test(train_test_grids, i)
        
        ############################

        

############################ Creating the split for source-target-test clusters ############################
#     for i in train_test_grids.keys():
#         source_cluster, target_cluster = _get_target_source(train_test_grids, i)
            
#         target_cluster_list = target_cluster.index.values.tolist()
#         target_grid, target_label = multi_grid.generate_grid(target_cluster_list, x_tr_blended, y_tr_blended)
        
#         ftarget_grid = open(f'US_data/us-2011/grids/grid_{i}/target_{i}.npy','wb')
#         np.save(ftarget_grid, target_grid)
#         ftarget_label = open(f'US_data/us-2011/grids/grid_{i}/target_{i}_label.npy','wb')
#         np.save(ftarget_label, target_label)
        
#         ftarget_cluster = open(f'US_data/us-2011/clusters/cluster_{i}/target_{i}.npy','wb')
#         np.save(ftarget_cluster, target_grid)
#         ftarget_label = open(f'US_data/us-2011/clusters/cluster_{i}/target_{i}_label.npy','wb')
#         np.save(ftarget_label, target_label)
        
#         print(target_grid.shape)

#         source_cluster_list = source_cluster.index.values.tolist()
#         source_grid, source_label = multi_grid.generate_grid(source_cluster_list, x_tr_blended, y_tr_blended)
        
#         fsource_grid = open(f'US_data/us-2011/grids/grid_{i}/source_{i}.npy','wb')
#         np.save(fsource_grid, source_grid)
#         fsource_label = open(f'US_data/us-2011/grids/grid_{i}/source_{i}_label.npy','wb')
#         np.save(fsource_label, source_label)
        
#         fsource_cluster = open(f'US_data/us-2011/clusters/cluster_{i}/source_{i}.npy','wb')
#         np.save(fsource_cluster, source_grid)
#         fsource_label = open(f'US_data/us-2011/clusters/cluster_{i}/source_{i}_label.npy','wb')
#         np.save(fsource_label, source_label)
        
#         print(source_grid.shape)
#         print("-----------------------------\n")



(700,)


In [164]:
# Load the monitoring table from CSV and display it as the cell output.
df_monitoring = pd.read_csv(filepath_or_buffer="US_data/BigUS/us_monitoring.csv")
df_monitoring

Unnamed: 0,year,day,cmaq_x,cmaq_y,elev,emissi11_pm25,forest_cover,high,limi,local,...,narr_vgrd1815mb,narr_tmp30m,narr_pres2m,narr_pres10m,narr_pres30m,aod_value,pm25_value,gc_aod,pm25_value_k,month
0,2011,1,-2.256897e+05,3.587932e+05,35.77930,0.0,0.000000,0.000000,0.000000,873.759842,...,0.498815,294.425,101386.0,101282.0,101040.0,0.112333,8.6,0.021297,8.582339,1.0
1,2011,1,1.571739e+06,3.949590e+05,1.01104,0.0,0.000000,1050.015418,0.000000,807.793896,...,2.506630,298.217,101552.0,101482.0,101273.0,,7.9,0.107923,7.383566,1.0
2,2011,1,-1.291484e+05,4.863733e+05,2.43082,0.0,0.000000,0.000000,0.000000,1009.915135,...,0.496211,293.508,101352.0,101282.0,101073.0,0.027000,5.9,0.022637,5.900033,1.0
3,2011,1,1.430635e+06,4.666680e+05,3.87558,0.0,0.000000,0.000000,2030.156665,0.000000,...,3.678500,295.279,97185.8,97082.1,96839.6,,5.4,0.111803,7.488950,1.0
4,2011,1,1.573342e+06,4.646160e+05,3.35600,0.0,0.020202,0.000000,0.000000,2037.573293,...,3.912880,296.529,101586.0,101515.0,101273.0,,10.2,0.100897,7.398404,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249124,2011,365,-1.900296e+06,3.057078e+06,595.92000,0.0,0.970588,0.000000,0.000000,0.000000,...,3.819380,290.432,100123.0,99985.6,99776.6,,4.7,0.016803,10.765756,12.0
249125,2011,365,-1.791970e+06,3.056591e+06,758.28300,0.0,0.778547,0.000000,0.000000,0.000000,...,-0.599889,288.057,96756.0,96652.2,96410.0,,9.9,0.007157,9.456686,12.0
249126,2011,365,-1.948693e+06,3.092715e+06,3.58333,0.0,0.004456,0.000000,0.000000,0.000000,...,1.504280,288.724,102023.0,101886.0,101677.0,,3.9,0.032277,11.428495,12.0
249127,2011,365,-2.117447e+06,3.128790e+06,3.56685,0.0,0.100092,0.000000,0.000000,0.000000,...,1.819380,290.349,102189.0,102019.0,101843.0,,2.7,0.018143,12.663332,12.0


In [167]:
# Look up the monitoring rows at one cmaq_x coordinate.
# The literal has only 9 significant digits, so an exact `==` against the
# stored floats finds nothing (the previous run printed an empty frame).
# Compare within an absolute tolerance instead.
# NOTE(review): atol=1.0 assumes distinct cmaq_x values differ by far more
# than one coordinate unit — confirm against the grid spacing.
query_x = -1.80717547e+06
df_mon_x = df_monitoring[np.isclose(df_monitoring['cmaq_x'], query_x, rtol=0.0, atol=1.0)]
# print(df_mon_x[df_mon_x['cmaq_y'] == 1.28267051e+06])
print(df_mon_x)

Empty DataFrame
Columns: [year, day, cmaq_x, cmaq_y, elev, emissi11_pm25, forest_cover, high, limi, local, is, pd, lon, lat, rid, elev_k, emissi11_pm25_k, emissi11_pm10_k, forest_cover_k, high_k, limi_k, local_k, is_k, pd_k, cmaq_id, nldas_pevapsfc, nldas_dlwrfsfc, nldas_dswrfsfc, nldas_cape, nldas_fpcsfc, nldas_pcpsfc, nldas_rh2m, nldas_tmp2m, nldas_vgrd10m, nldas_ugrd10m, nldas_pressfc, narr_dpt, narr_vis, narr_hpbl, narr_rh2m, narr_tmp2m, narr_ugrd10m, narr_vgrd10m, narr_rh30mb, narr_rh63mb, narr_rh96mb, narr_rh129mb, narr_rh1512mb, narr_rh1815mb, narr_tmp30mb, narr_tmp63mb, narr_tmp96mb, narr_tmp129mb, narr_tmp1512mb, narr_tmp1815mb, narr_ugrd30m, narr_ugrd30mb, narr_ugrd63mb, narr_ugrd96mb, narr_ugrd129mb, narr_ugrd1512mb, narr_ugrd1815mb, narr_vgrd30m, narr_vgrd30mb, narr_vgrd63mb, narr_vgrd96mb, narr_vgrd129mb, narr_vgrd1512mb, narr_vgrd1815mb, narr_tmp30m, narr_pres2m, narr_pres10m, narr_pres30m, aod_value, pm25_value, gc_aod, pm25_value_k, month]
Index: []

[0 rows x 78 column