In [1]:
# https://xgboost.readthedocs.io/en/stable/tutorials/categorical.html
%matplotlib inline
import os, itertools, pickle
import numpy as np
import pandas as pd

import statsmodels.formula.api as smf

from sklearn import impute, preprocessing


import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

#import xgboost as xgb

# pip install statsmodels

# Directory holding the raw analysis inputs (resolved against the working directory).
RAWDATA_PATH = "../rawData/"

#raw_data = pd.read_csv(os.path.join(RAWDATA_PATH, "all_analysis_data.tsv"), sep="\t")
# NOTE(security): unpickling can execute arbitrary code, so this must only be
# pointed at a pickle file from a trusted source.
pickle_path = os.path.join(RAWDATA_PATH, "analysis_data_no_scale.pickle")
with open(pickle_path, "rb") as pickle_file:
    raw_data = pickle.load(pickle_file)

In [2]:
# Maps each exposure name to the list of per-year column names for that
# exposure (the year is encoded inside every column name).
ap_d = {
    "PM25": [
        "PM25_year_2015",
        "PM25_year_2016",
        "PM25_year_2017",
        "PM25_year_2018",
    ],
    "NO2": [
        "NO2_year_2015",
        "NO2_year_2016",
        "NO2_year_2017",
        "NO2_year_2018",
    ],
    "NDVI_250": [
        "NDVI_2015_250",
        # Spelled without the "I"; kept verbatim (presumably the actual column
        # name in the data -- confirm before "fixing" it).
        "NDV_2016_250",
        "NDVI_2017_250",
        "NDVI_2018_250",
    ],
    "NDVI_500": [
        "NDVI_2015_500",
        "NDVI_2016_500",
        "NDVI_2017_500",
        "NDVI_2018_500",
    ],
    "NDVI_1000": [
        "NDVI_2015_1000",
        "NDVI_2016_1000",
        "NDVI_2017_1000",
        "NDVI_2018_1000",
    ],
    # The two "wj" series start one year earlier (2014) than the others.
    "PM10_wj": [
        "wj_PM10_2014",
        "wj_PM10_2015",
        "wj_PM10_2016",
        "wj_PM10_2017",
        "wj_PM10_2018",
    ],
    "O3_wj": [
        "wj_O3_2014",
        "wj_O3_2015",
        "wj_O3_2016",
        "wj_O3_2017",
        "wj_O3_2018",
    ],
}

# Decode the integer ethnicity codes into readable labels.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: that chained in-place form
# is deprecated in pandas and has no effect on raw_data under Copy-on-Write.
raw_data["Ethnic"] = raw_data["Ethnic"].replace(
    {0: "Han", 1: "Tibetan", 2: "Dong", 3: "Bai", 4: "Yi", 5: "Buyi", 6: "Miao"}
)

raw_data["Ethnic"] = raw_data["Ethnic"].astype(object)
# Snapshot every Han row before the regional relabelling, so a pooled "Han"
# group can be appended back afterwards.
tmp_data = raw_data[raw_data["Ethnic"]=="Han"].copy()

# Split the Han rows by province into two regional sub-groups.
raw_data.loc[(raw_data["province_english"].isin(['Chongqing', 'Sichuan']))&(raw_data["Ethnic"]=="Han"), "Ethnic"] = "Basin_Han"
raw_data.loc[raw_data["province_english"].isin(['Yunnan'])&(raw_data["Ethnic"]=="Han"), "Ethnic"] = "Plateau_Han"

# Append the pooled Han snapshot: those rows now appear once under their
# regional label and once under "Han".
# NOTE(review): any Han row from a province other than the three above keeps the
# "Han" label AND is in the snapshot, so it would be counted twice in the
# pooled group -- confirm no such rows exist.
# NOTE(review): this step is not idempotent; re-running the cell without
# reloading raw_data appends another copy of the Han rows.
raw_data = pd.concat([raw_data, tmp_data], axis=0, ignore_index=True)
raw_data["Ethnic"] = raw_data["Ethnic"].astype("category")
# Fix the category order; labels not listed here become missing values.
raw_data["Ethnic"] = raw_data["Ethnic"].cat.set_categories(["Han", 'Basin_Han', 'Plateau_Han','Tibetan','Dong','Bai','Yi','Buyi','Miao'], ordered=True)

def _extract_year(column):
    """Return the year embedded in an exposure column name, as a string.

    Recognised layouts (fields are separated by "_"):
      * names starting with "t2m_mean": the second field minus its first
        four characters;
      * names starting with "NDVI": the second field;
      * the irregular name "NDV_2016_250": the literal "2016";
      * everything else (e.g. "wj_PM10_2014", "PM25_year_2015"): the third field.

    Raises IndexError if the name has fewer fields than its layout requires.
    """
    parts = column.split("_")
    if column.startswith("t2m_mean"):
        return parts[1][4:]
    if column.startswith("NDVI"):
        return parts[1]
    if column.startswith("NDV_2016_250"):
        return "2016"
    return parts[2]


# Groups to summarise, in the order they should appear in the result.
ETHNIC_GROUPS = ["Han", 'Basin_Han', 'Plateau_Han','Tibetan','Dong','Bai','Yi','Buyi','Miao']

# The row selection for each group does not depend on the exposure column, so
# build the boolean masks once instead of re-filtering raw_data in every iteration.
ethnic_masks = {ethnic: raw_data["Ethnic"] == ethnic for ethnic in ETHNIC_GROUPS}

# Collect one [name, year, group, mean] record per (column, group) pair and
# build the frame in a single step rather than enlarging it row by row.
records = []
for key, values in ap_d.items():
    for value in values:
        ap_year = _extract_year(value)
        for ethnic, mask in ethnic_masks.items():
            records.append([key, ap_year, ethnic, raw_data.loc[mask, value].mean()])

df = pd.DataFrame(records, columns=["Name", "year", "Ethnic", "Mean"])

# Discard the 2014 rows (only the "wj_" series have a 2014 column).
df = df.drop(index=df[df["year"]=="2014"].index)

In [3]:
# Draw one point plot per exposure (yearly mean, one line per ethnic group)
# and save each as a PDF under ./plotFigure/.

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("./plotFigure", exist_ok=True)

# The theme is identical for every figure; apply it once before the loop.
sns.set(font_scale=1.2,  rc={'figure.figsize':(10,6)}, style="ticks")

for key in ap_d:
    # Explicit figure/axes so nothing depends on pyplot's implicit "current" figure.
    fig, ax = plt.subplots()
    sns.pointplot(data=df[df["Name"]==key], x="year", y="Mean", hue="Ethnic", ax=ax)

    ax.spines.right.set_visible(False)
    ax.spines.top.set_visible(False)
    ax.legend(title="", bbox_to_anchor=(0.9,0.9), frameon=False)
    ax.set(ylabel=key)
    fig.savefig("./plotFigure/{}-lineplot.pdf".format(key), format="pdf", bbox_inches="tight", transparent=True)
    # Close each figure after saving so figures do not accumulate in memory.
    plt.close(fig)