In [92]:
import pandas as pd
import geopandas as gpd
# Turn off pandas' chained-assignment (SettingWithCopy) check for this notebook.
pd.options.mode.chained_assignment = None
import warnings
# Ignore all warnings.
# NOTE(review): this also hides deprecation warnings raised by pandas/geopandas.
warnings.filterwarnings("ignore")

In [None]:
## Tree cover of each cluster, as exported by GEE (one table per year).
df2010 = pd.read_csv("../Data/Processed/treecover_fromgee/export2010.csv")
df2022 = pd.read_csv("../Data/Processed/treecover_fromgee/export2022.csv")

## GEE exports the cluster id as an integer, so an id that begins with a
## zero-padded FIPS code (e.g. '037') arrives without its leading zero ('37').
## Put the '0' back on every id except those that start with '111'.
ids2022 = df2022['cluster2022'].astype(str)
df2022['cluster2022'] = ids2022.where(ids2022.str.startswith('111'), '0' + ids2022)

ids2010 = df2010['cluster2010'].astype(str)
df2010['cluster2010'] = ids2010.where(ids2010.str.startswith('111'), '0' + ids2010)

In [95]:
# Peek at the first row of the 2010 table to check its columns.
df2010.head(1)

Unnamed: 0,COUNTYFP,areaall2010,areabare2010,areagrass2010,areatree2010,areaurban2010,cluster2010
0,37,7021.268,650.64,254.771,33.212,6082.645,372957000


In [106]:

# Building footprints with housing-type attributes.
housefhsz = gpd.read_parquet('../Data/Processed/buildings/housefhsz_addhousetypes.parquet')

# Number of buildings flagged as new. Printed explicitly: a bare expression in
# the middle of a cell is evaluated but never displayed.
print('newhouse==1 count:', len(housefhsz.query('newhouse==1')))

housefhsz.head(1)

Unnamed: 0,FID_Califo,WUICLASS_2,WUIFLAG202,GEOID,FHSZ_7Clas,COUNTYFP,geometry,newhouse,cluster15,cluster50,allhousecountbybuffer50,allhousecountbybuffer15,housingclass
0,3660,High_Dens_Interface,2.0,60730211022,SRA High,73,"POLYGON ((-116.18654 32.62078, -116.18654 32.6209, -116.18639 32.62089, -116.18639 32.62097, -116.18636 32.62097, -116.18637 32.62087, -116.1864 32.62087, -116.1864 32.62082, -116.18645 32.62083, -116.18645 32.62078, -116.18654 32.62078))",0.0,7338722000,736618000,290,170,very dense


In [None]:
def resolve_duplicates(group):
    '''
    Remove one spurious row from a group of rows that share the same building id.

    In the results exported from GEE, a few buildings belong to two clusters, which
    causes these buildings to have multiple tree cover values.
    The issue is caused by spatial topological errors in the Microsoft building footprints.
    This leads to some buildings having both a large 10m buffer (outside the building
    boundary) and a very small buffer (inside the building polygon).
    The values associated with the very small buffer should be deleted.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one building. Must contain the columns
        'areaall2010' and 'areaall2022'.

    Returns
    -------
    pandas.DataFrame
        ``group`` without the row that has the smallest buffer area. Exactly one
        row is removed per call, so a group with more than two rows needs several
        calls. A group with fewer than two rows is returned unchanged.
    '''
    if len(group) < 2:
        # Nothing to disambiguate; never delete the only record of a building.
        return group

    area2010 = group['areaall2010']
    if area2010.isna().any() or area2010.nunique() == 1:
        # areaall2010 contains NaN (house built after 2010) or is identical across
        # the rows, so it cannot tell the rows apart: compare areaall2022 instead.
        deciding_area = group['areaall2022']
    else:
        # areaall2010 differs between the rows: the smaller value is the bad one.
        deciding_area = area2010

    if deciding_area.notna().any():
        # Work with positions rather than labels so that a repeated index label
        # cannot remove more than one row.
        drop_position = int(deciding_area.reset_index(drop=True).idxmin())
    else:
        # Every value is NaN, so idxmin() cannot choose a row. Drop the first
        # row, which is also what idxmin() selects when values are tied.
        drop_position = 0

    keep_mask = [position != drop_position for position in range(len(group))]
    return group.iloc[keep_mask]

def droprepeativeid(df):
    '''
    Run one de-duplication pass over buildings that appear more than once.

    Every group of rows sharing a 'FID_Califo' value is passed to
    ``resolve_duplicates``, which removes one row from the group. Rows whose
    'FID_Califo' is unique are kept untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'FID_Califo' column plus the columns that
        ``resolve_duplicates`` needs.

    Returns
    -------
    tuple
        ``(frame, remaining)`` where ``frame`` is the table after this pass (with a
        fresh 0..n-1 index) and ``remaining`` is the number of rows that still
        share a 'FID_Califo' value with another row. When the input has no
        duplicates, the input itself is returned together with 0.
    '''
    is_duplicated = df.duplicated('FID_Califo', keep=False)
    if not is_duplicated.any():
        print('norepeative!')
        return df, 0

    duplicate_rows = df[is_duplicated]
    # Iterate over the groups explicitly instead of using groupby.apply: apply's
    # handling of the grouping column is deprecated in pandas, and when that
    # column is excluded from each group 'FID_Califo' would be lost from the
    # resolved rows once the index is reset.
    resolved_groups = [
        resolve_duplicates(group)
        for _, group in duplicate_rows.groupby('FID_Califo')
    ]
    # Boolean mask (not index labels) selects the untouched rows, so the split
    # is correct even if the incoming index is not unique.
    final_df = pd.concat([df[~is_duplicated], *resolved_groups], ignore_index=True)

    remaining = int(final_df.duplicated('FID_Califo', keep=False).sum())
    print('len of duplicate_rows', remaining)
    return final_df, remaining

  

def joinclustertreecover_to_singlebuilding(df2010,df2022,housef,max_iterations=100):
    '''
    Attach the 2010 and 2022 cluster tree cover to every single building.

    The tree cover of each 10 m cluster exported by GEE is joined to the cluster
    geometries through the unique id 'cluster2010' / 'cluster2022', and the buildings
    are then spatially joined to those clusters, so every building inside a cluster
    is given the tree cover of that cluster.

    Parameters
    ----------
    df2010, df2022 : pandas.DataFrame
        GEE exports for each year; each has a 'COUNTYFP' column and the cluster id
        column of its year.
    housef : geopandas.GeoDataFrame
        Building footprints with a 'FID_Califo' id column.
    max_iterations : int, default 100
        Maximum number of de-duplication passes.

    Returns
    -------
    geopandas.GeoDataFrame or None
        One row per building with the 2010 and 2022 cluster attributes. The result
        is also written to
        '../Data/Processed/buildings/buildingswithtreecover/building_addhousetypes_treecover.parquet'.
        ``None`` is returned when duplicates remain after ``max_iterations`` passes
        or when an exception is raised during de-duplication/saving (the message
        is printed).
    '''
    df2010f,df2022f=df2010.drop(columns=['COUNTYFP']),df2022.drop(columns=['COUNTYFP'])
    ## 10 m defensible-space clusters for 2010 and 2022
    cluster2010=gpd.read_parquet('../Data/Processed/buildings/cluster/2010buffer10mdiffhouse_cluster.parquet')\
    .drop(columns=['COUNTYFP'],errors='ignore')
    cluster2022=gpd.read_parquet('../Data/Processed/buildings/cluster/2022buffer10mdiffhouse_cluster.parquet')\
    .drop(columns=['COUNTYFP'],errors='ignore')
    print('2010',len(df2010f),len(cluster2010))
    print('2022',len(df2022f),len(cluster2022))
    ## Recover the boundary geometry of every cluster exported by GEE by merging on
    ## the cluster id. The '.geo' property exported by GEE differs slightly from the
    ## local WGS84 (EPSG:4326) geometry, so the local geometry is used instead.
    df2010f=df2010f.merge(cluster2010,on='cluster2010',how='left')
    gdf2010f = gpd.GeoDataFrame(df2010f, geometry='geometry', crs='epsg:4326')

    df2022f=df2022f.merge(cluster2022,on='cluster2022',how='left')
    gdf2022f = gpd.GeoDataFrame(df2022f, geometry='geometry', crs='epsg:4326')
    ## Assign each building its 2010 and 2022 cluster attributes (the building table
    ## also carries the new/old building flag).
    sat2010f=gpd.sjoin(housef,gdf2010f,how='left').drop(columns=['index_right'], errors='ignore')
    print(sat2010f.columns)
    sat2022f=gpd.sjoin(housef ,gdf2022f[['geometry','cluster2022','areaall2022', 'areatree2022']],how='left')
    print(sat2022f.columns)
    ## Combine the 2010 and 2022 attributes of every building
    sat1022f=sat2010f.merge(sat2022f[['FID_Califo','cluster2022','areaall2022', 'areatree2022']],on='FID_Califo',how='left').drop(columns=['index_right'], errors='ignore')
    # Buildings without a 2010 cluster get an explicit placeholder id.
    sat1022f['cluster2010']= sat1022f['cluster2010'].fillna('newhouse-noclusterin2010')
    print(sat1022f.columns)
    ## Remove repeated building ids caused by topological errors. Each pass removes
    ## one row per duplicated building, so repeat until no duplicates are left.
    try:
        for _ in range(max_iterations):
            sat1022f, repet = droprepeativeid(sat1022f)
            if repet == 0:
                sat1022f.to_parquet('../Data/Processed/buildings/buildingswithtreecover/building_addhousetypes_treecover.parquet')
                return sat1022f
        else:
            # Reached only when the loop finishes without returning. The previous
            # check `i == 100` could never be true after `range(100)`, so this
            # warning was unreachable and unresolved data was returned silently.
            print("Warning: Maximum iterations reached without resolving all duplicates")
            return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None


# Per-building table with 2010/2022 cluster tree cover (None if the join failed).
s1020high = joinclustertreecover_to_singlebuilding(
    df2010=df2010,
    df2022=df2022,
    housef=housefhsz,
)

2010 203354 203354
2022 209356 209356
Index(['FID_Califo', 'WUICLASS_2', 'WUIFLAG202', 'GEOID', 'FHSZ_7Clas',
       'COUNTYFP', 'geometry', 'newhouse', 'cluster15', 'cluster50',
       'allhousecountbybuffer50', 'allhousecountbybuffer15', 'housingclass',
       'areaall2010', 'areabare2010', 'areagrass2010', 'areatree2010',
       'areaurban2010', 'cluster2010'],
      dtype='object')
Index(['FID_Califo', 'WUICLASS_2', 'WUIFLAG202', 'GEOID', 'FHSZ_7Clas',
       'COUNTYFP', 'geometry', 'newhouse', 'cluster15', 'cluster50',
       'allhousecountbybuffer50', 'allhousecountbybuffer15', 'housingclass',
       'index_right', 'cluster2022', 'areaall2022', 'areatree2022'],
      dtype='object')
Index(['FID_Califo', 'WUICLASS_2', 'WUIFLAG202', 'GEOID', 'FHSZ_7Clas',
       'COUNTYFP', 'geometry', 'newhouse', 'cluster15', 'cluster50',
       'allhousecountbybuffer50', 'allhousecountbybuffer15', 'housingclass',
       'areaall2010', 'areabare2010', 'areagrass2010', 'areatree2010',
       'areau

In [110]:
# List the columns of the joined per-building table.
s1020high.columns

Index(['FID_Califo', 'WUICLASS_2', 'WUIFLAG202', 'GEOID', 'FHSZ_7Clas',
       'COUNTYFP', 'geometry', 'newhouse', 'cluster15', 'cluster50',
       'allhousecountbybuffer50', 'allhousecountbybuffer15', 'housingclass',
       'areaall2010', 'areabare2010', 'areagrass2010', 'areatree2010',
       'areaurban2010', 'cluster2010', 'cluster2022', 'areaall2022',
       'areatree2022'],
      dtype='object')

## Calculate basic properties

In [115]:
def addCOUNTYname(df):
    '''
    Add a 'COUNTY' column holding the county name for each row's 'COUNTYFP' code.

    'COUNTYFP' may be stored as a zero-padded string ('037'), an unpadded string
    ('37') or a number (37); all forms are normalised to the 3-digit code before
    the lookup. Codes that are missing or not in the table give NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'COUNTYFP' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now with the 'COUNTY' column.
    '''
    county_names = ['San Luis Obispo', 'Santa Barbara', 'Ventura', 'Orange',
                    'San Diego', 'Riverside', 'San Bernardino', 'Kern', 'Los Angeles']
    county_fips = ['079', '083', '111', '059', '073', '065', '071', '029', '037']

    # Lookup table from FIPS code to county name.
    fips_to_name_map = dict(zip(county_fips, county_names))

    # Integer or unpadded codes would never match the zero-padded keys, so bring
    # every value to a 3-digit string first (unparseable values become None).
    numeric_codes = pd.to_numeric(df['COUNTYFP'], errors='coerce')
    padded_codes = numeric_codes.map(
        lambda code: None if pd.isna(code) else f'{int(code):03d}'
    )
    df['COUNTY'] = padded_codes.map(fips_to_name_map)
    return df

In [116]:

def add_property(sat1022):
    '''
    Add cluster-level counts, tree-cover ratios and the county name to each building.

    New columns:
      - 'allhcount': number of distinct 'FID_Califo' values in the building's
        'cluster2022' group.
      - 'newhcount': number of rows with ``newhouse == 1`` in that group.
      - 'treeratio2010' / 'treeratio2022': 'areatree' divided by 'areaall' for
        the respective year.
      - 'treeratiochange': 'treeratio2022' minus 'treeratio2010'.
      - 'COUNTY': added by ``addCOUNTYname``.

    Parameters
    ----------
    sat1022 : pandas.DataFrame
        Per-building table with the columns 'cluster2022', 'FID_Califo',
        'newhouse', 'areatree2010', 'areaall2010', 'areatree2022',
        'areaall2022' and 'COUNTYFP'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new table with the columns listed above appended.
    '''
    # Built-in aggregations replace per-group Python lambdas: same values,
    # without calling a Python function once per cluster.
    is_new_house = sat1022['newhouse'].eq(1)
    sat10221 = sat1022.assign(
        allhcount=sat1022.groupby('cluster2022')['FID_Califo'].transform('nunique'),
        newhcount=is_new_house.groupby(sat1022['cluster2022']).transform('sum'),
    )
    sat10221['treeratio2010'] = sat10221['areatree2010'] / sat10221['areaall2010']
    sat10221['treeratio2022'] = sat10221['areatree2022'] / sat10221['areaall2022']
    sat10221['treeratiochange'] = sat10221['treeratio2022'] - sat10221['treeratio2010']
    sat10221 = addCOUNTYname(sat10221)
    print(sat10221.columns)
    return sat10221
# Derive the per-building properties, then save the enriched table
# (same parquet path that the join step writes to).
building_addhousetypes_treecover = add_property(sat1022=s1020high)
building_addhousetypes_treecover.to_parquet(
    '../Data/Processed/buildings/buildingswithtreecover/building_addhousetypes_treecover.parquet'
)

Index(['FID_Califo', 'WUICLASS_2', 'WUIFLAG202', 'GEOID', 'FHSZ_7Clas',
       'COUNTYFP', 'geometry', 'newhouse', 'cluster15', 'cluster50',
       'allhousecountbybuffer50', 'allhousecountbybuffer15', 'housingclass',
       'areaall2010', 'areabare2010', 'areagrass2010', 'areatree2010',
       'areaurban2010', 'cluster2010', 'cluster2022', 'areaall2022',
       'areatree2022', 'allhcount', 'newhcount', 'treeratio2010',
       'treeratio2022', 'treeratiochange', 'COUNTY'],
      dtype='object')
