# 读取原始数据

In [39]:
import pandas as pd
hu_data = pd.read_excel(r"C:\Users\12045\Desktop\2020-2022年中国江苏省土地经济调查统计数据集\原始数据\户数据（原始数据）.xlsx")
ground_data = pd.read_excel(r"C:\Users\12045\Desktop\2020-2022年中国江苏省土地经济调查统计数据集\原始数据\地块数据（原始数据）.xlsx")
village_data = pd.read_excel(r"C:\Users\12045\Desktop\2020-2022年中国江苏省土地经济调查统计数据集\原始数据\村数据（原始数据）.xlsx")

In [40]:
# 输出列数
num_columns1 = village_data.shape[1]
print("村数据有", num_columns1, "列")
num_columns2 = ground_data.shape[1]
print("地块数据有", num_columns2, "列")
num_columns3 = hu_data.shape[1]
print("户数据有", num_columns3, "列")

村数据有 56 列
地块数据有 206 列
户数据有 206 列


# 处理重复值

In [41]:
print("村数据重复值数:", village_data.duplicated().sum())
print("地块数据重复值数:", ground_data.duplicated().sum())
print("户数据重复值数:", hu_data.duplicated().sum())

村数据重复值数: 0
地块数据重复值数: 0
户数据重复值数: 0


# 删除明显无法使用的特征列:

## 数据集只有江苏省数据，故删除省份列；
市县村编码不公开无法对应，故删除市县村编码列；
可用行号作索引故删除农户编码和村内农户编码；
mc_chengbaogd，mx_chengbaogd，ms_chengbaogd三列和前三列重复，故删除。

In [42]:
village_data = village_data.drop(columns=['a', 'b', 'c', 'd', 'e'])
ground_data = ground_data.drop(columns=['a', 'b', 'c', 'd', 'e', 'f', 'hid', 'mc_chengbaogd', 'mx_chengbaogd', 'ms_chengbaogd'])
hu_data = hu_data.drop(columns=['a', 'b', 'c', 'd', 'e', 'f', 'hid', 'mc_chengbaogd', 'mx_chengbaogd', 'ms_chengbaogd'])

## 删除地块数据和户数据中缺失值占比超过90%的特征列,村数据特征较少且缺失值显然较少故不进行删除。

In [43]:
# 删除地块数据中缺失值占比超过90%的特征列
# 计算每一列的缺失值占比
missing_percentage = (ground_data.isnull().sum() / len(ground_data)) * 100

# 筛选出缺失值占比大于百分之90的列名
columns_with_high_missing_percentage = missing_percentage[missing_percentage > 90].index

ground_data = ground_data.drop(columns=columns_with_high_missing_percentage)

In [44]:
# 删除户数据中缺失值占比超过90%的特征列
# 计算每一列的缺失值占比
missing_percentage = (hu_data.isnull().sum() / len(hu_data)) * 100

# 筛选出缺失值占比大于百分之90的列名
columns_with_high_missing_percentage = missing_percentage[missing_percentage > 90].index

hu_data = hu_data.drop(columns=columns_with_high_missing_percentage)

# 处理异常值

## 处理地块数据和户数据中的分类数据中的非编码部分，将其转换为对应编码，用正则表达式匹配第一个数值，把原数据替换为匹配到的数值

In [45]:
# 输出地块数据分类数据中不止有编码的特征列
print("地块数据分类数据中不止有编码的特征列:")
# 输出非空且不全是数字的数据以及对应的列名
for col in ground_data.columns:
    non_numeric_values = ground_data[col].dropna().loc[ground_data[col].apply(lambda x: isinstance(x, str) and not x.isdigit())]
    if not non_numeric_values.empty:
        print("列名:", col)
        print(non_numeric_values.head(20))
        print("-------------------")


地块数据分类数据中不止有编码的特征列:
列名: cbdpodu
838    4丘陵
861    4丘陵
Name: cbdpodu, dtype: object
-------------------
列名: zrdpodu
97     4池塘
838    4丘陵
858    4水塘
Name: zrdpodu, dtype: object
-------------------
列名: cbdturang
514    4沙土黏土混合
608      4沙土黑土
792        4混合
Name: cbdturang, dtype: object
-------------------
列名: zrdturang
514    4沙土黏土混合
608      4沙土黑土
614        4白土
858        4水塘
Name: zrdturang, dtype: object
-------------------
列名: cbdyongtu201912
7         8a
516      1,2
838    12,10
Name: cbdyongtu201912, dtype: object
-------------------
列名: zrdyongtu201912
9           3,4
10          1,2
45          18竹
516         1,2
709         1,2
815    2,14,18鹅
Name: zrdyongtu201912, dtype: object
-------------------
列名: cbdyongtu202008
7         8a
516      1,2
585     1,14
657       8a
838    12,10
Name: cbdyongtu202008, dtype: object
-------------------
列名: zrdyongtu202008
45          18竹
516         1,2
585        1,14
657          8a
815    1,14,18鹅
Name: zrdyongtu202008, dtype: object


In [46]:
# 转换为只有编码
import re

# 定义一个函数来转换非空且不全是数字的数据
def convert_to_numeric(value):
    # 匹配数字和逗号
    pattern = re.compile(r'\d+')
    # 查找匹配的数字
    match = pattern.search(value)
    if match:
        return int(match.group())  # 返回第一个匹配到的数字
    else:
        return None  # 如果找不到数字，则返回 None

# 遍历每一列
for col in ground_data.columns:
    non_numeric_indices = ground_data[col].dropna().loc[ground_data[col].apply(lambda x: isinstance(x, str) and not x.isdigit())].index
    # 如果非数字值不为空
    if not non_numeric_indices.empty:
        # 对于每个非数字值，将其转换为数字
        for idx in non_numeric_indices:
            ground_data.at[idx, col] = convert_to_numeric(ground_data.at[idx, col])
print("转换完成")

转换完成


In [47]:
# 输出户数据分类数据中不止有编码的特征列
print("户数据分类数据中不止有编码的特征列:")
# 输出非空且不全是数字的数据
for col in hu_data.columns:
    non_numeric_values = hu_data[col].dropna().loc[hu_data[col].apply(lambda x: isinstance(x, str) and not x.isdigit())]
    if not non_numeric_values.empty:
        print("列名:", col)
        print(non_numeric_values.head(20))
        print("-------------------")


户数据分类数据中不止有编码的特征列:
列名: cbdpodu
434          .
4749       4丘陵
4819    4平地和坡地
4837       4丘陵
4950       4丘陵
5365       4水面
5518       4水塘
Name: cbdpodu, dtype: object
-------------------
列名: cbdturang
36          4水田
334         4水田
526         4水田
531         4水田
2798    4:1和2混合
2839      412都有
2860    4沙土黏土混合
3130        4混合
3153      4黑泥黄泥
3172      4沙土黑土
3233        4粪土
3879       4沙壤土
4593        4混合
4738        4泥土
5014       4混合土
5231       4不清楚
5334        4水田
5362        4红土
5427        4红土
5664        4红土
Name: cbdturang, dtype: object
-------------------
列名: cbdyongtu201912
19       8b
57       8a
74       8a
85       8a
93      2,3
99       8a
102      8b
132     1,6
153      8b
163    18草莓
165    18草莓
170      8b
183      8b
221    18转出
238      8a
288      8b
310    18茶叶
311    18转出
332    18转出
342     1,2
Name: cbdyongtu201912, dtype: object
-------------------
列名: cbdyongtu202008
19       8b
57       8a
74       8a
85       8a
102      8b
132     1,8
153      8b
163    18

In [48]:
# 转换为只有编码
import re

# 定义一个函数来转换非空且不全是数字的数据
def convert_to_numeric(value):
    # 匹配数字和逗号
    pattern = re.compile(r'\d+')
    # 查找匹配的数字
    match = pattern.search(value)
    if match:
        return int(match.group())  # 返回第一个匹配到的数字
    else:
        return None  # 如果找不到数字，则返回 None

# 遍历每一列
for col in hu_data.columns:
    non_numeric_indices = hu_data[col].dropna().loc[hu_data[col].apply(lambda x: isinstance(x, str) and not x.isdigit())].index
    # 如果非数字值不为空
    if not non_numeric_indices.empty:
        # 对于每个非数字值，将其转换为数字
        for idx in non_numeric_indices:
            hu_data.at[idx, col] = convert_to_numeric(hu_data.at[idx, col])
            
print("转换完成")

转换完成


## 处理村数据异常值：把连续性数据中z-score>3的数据当作异常值，并转换为空值

In [49]:
# 处理村数据异常值
import numpy as np
from scipy.stats import zscore

# 初始化一个空列表，用于存储异常值的索引
outlier_indices = []
# 初始化一个空字典，用于存储每一列中的异常值个数
outlier_counts = {}
# 遍历每一列
for col in village_data.columns:
    # 判断是否是常数列或者是quequanyear列
    if village_data[col].nunique() > 1 and col != 'quequanyear' and col != 'year':
        # 计算 z-score
        z_scores = zscore(village_data[col])
        # 找到 z-score 绝对值大于 3 的索引
        col_indices = np.where(np.abs(z_scores) > 3)[0]
        # 统计异常值个数
        count = len(col_indices)
        if count > 0:
            outlier_counts[col] = count
            # 将异常值的索引添加到列表中
            outlier_indices.extend(col_indices)

# 去除重复的索引
outlier_indices = list(set(outlier_indices))

# 输出异常值个数
for col, count in outlier_counts.items():
    print(f"列 {col} 中的异常值个数: {count}")

# 将异常值替换为 NaN
for idx in outlier_indices:
    for col in village_data.columns:
        if village_data[col].nunique() > 1 and col != 'quequanyear' and col != 'year':
            if np.abs(zscore(village_data[col])[idx]) > 3:
                village_data.at[idx, col] = np.nan


列 liuzhuandiarea 中的异常值个数: 2
列 zhuti500 中的异常值个数: 2
列 zhuti200 中的异常值个数: 4
列 zhuti100 中的异常值个数: 3
列 zhuti50 中的异常值个数: 1
列 zhutiwaicun 中的异常值个数: 3


## 处理地块数据和户数据异常值：由于地块数据和户数据样本真实情况偏离程度较大，所以把连续性数据中z-score>5.5的数据才当作异常值

In [50]:
# 处理地块数据
import numpy as np
from scipy.stats import zscore
# 初始化一个空列表，用于存储异常值的索引
outlier_indices = []
# 初始化一个空字典，用于存储每一列中的异常值个数
outlier_counts = {}
# 分类数据不遍历
no_selected_col = ["cbdfangwei", "zrdfangwei", "zjxingshi", "c214b", "cbdpodu", "zrdpodu", "cbdgaosulu",
                   "zrdgaosulu", "cbdturang", "zrdturang", "cbdguangai", "zrdguangai", "cbdfeili", "zrdfeili",
                   "cbdyongtu201912", "zrdyongtu201912", "cbdyongtu202008", "zrdyongtu202008", "cbdwrxiufu",
                   "zrdwrxiufu", "shifouzc", "cbdjinqin", "zrdjinqin", "cbdbutie", "zrdbutie", "cbdxietiao",
                   "zrdxietiao", "cbdhetong", "zrdhetong", "cbdjypt", "zrdjypt", "cbdjyptreason1",
                   "zrdjyptreason1", "cbdjyptreason2", "zrdjyptreason2", "cbdjiangqixian", "zrdjiangqixian",
                   "d301a", "d301b", "d311a", "d311b", "d312a", "d312b", "d313a", "d313b", "d314a", "d314b",
                   "d315a", "d315b", "d316a", "d316b", "d317a", "d317b", "d319a", "d319b", "d320a", "d320b",
                   "d321a", "d321b", "d322a", "d322b", "d323a", "d323b", "d324a", "d324b", "d325a", "d325b",
                   "d328a", "d328b", "d329a", "d329b", "d331a", "d331b", "d335a", "d335b", "d336a", "d336b",
                   "d337a", "d337b", "d338a", "d338b", "d339a", "d339b", "d340a", "d340b", "d341a", "d341b"
                   "cbdpodu","zrdpodu","cbdturang","zrdturang","cbdyongtu201912","zrdyongtu201912",
                   "cbdyongtu202008","zrdyongtu202008","cbdjiangqixian","cbdqixian","zrdqixian","d301a","d301b","d336a","d336b"]

# 遍历每一列
for col in ground_data.columns:
    if  col not in no_selected_col:
        # 计算 z-score
        z_scores = zscore(ground_data[col])
        # 找到 z-score 绝对值大于 5.5 的索引
        col_indices = np.where(np.abs(z_scores) > 5.5)[0]
        # 统计异常值个数
        count = len(col_indices)
        if count > 0:
            outlier_counts[col] = count
            # 将异常值的索引添加到列表中
            outlier_indices.extend(col_indices)

# 去除重复的索引
outlier_indices = list(set(outlier_indices))

# 输出异常值个数
for col, count in outlier_counts.items():
    print(f"列 {col} 中的异常值个数: {count}")

# 将异常值替换为 NaN
for idx in outlier_indices:
    for col in ground_data.columns:
        if ground_data[col].nunique() > 1 and col not in no_selected_col:
            if np.abs(zscore(ground_data[col])[idx]) > 5.5:
                ground_data.at[idx, col] = np.nan


列 chengbaogd 中的异常值个数: 5
列 zcgdarea 中的异常值个数: 2
列 zcqiyearea 中的异常值个数: 8
列 zchezuoshearea 中的异常值个数: 3
列 zccunjitiarea 中的异常值个数: 4
列 zcnonghuarea 中的异常值个数: 4
列 zcareahetong 中的异常值个数: 4
列 zcareaqixian 中的异常值个数: 3
列 jitizcarea 中的异常值个数: 3
列 jitizcareacgb 中的异常值个数: 3
列 zrdarea 中的异常值个数: 8
列 cychengbaogd 中的异常值个数: 3


In [51]:
# 处理户数据
import numpy as np
from scipy.stats import zscore
# 初始化一个空列表，用于存储异常值的索引
outlier_indices = []
# 初始化一个空字典，用于存储每一列中的异常值个数
outlier_counts = {}
# 分类数据不遍历
no_selected_col = ["cbdfangwei", "zrdfangwei", "zjxingshi", "c214b", "cbdpodu", "zrdpodu", "cbdgaosulu",
                   "zrdgaosulu", "cbdturang", "zrdturang", "cbdguangai", "zrdguangai", "cbdfeili", "zrdfeili",
                   "cbdyongtu201912", "zrdyongtu201912", "cbdyongtu202008", "zrdyongtu202008", "cbdwrxiufu",
                   "zrdwrxiufu", "shifouzc", "cbdjinqin", "zrdjinqin", "cbdbutie", "zrdbutie", "cbdxietiao",
                   "zrdxietiao", "cbdhetong", "zrdhetong", "cbdjypt", "zrdjypt", "cbdjyptreason1",
                   "zrdjyptreason1", "cbdjyptreason2", "zrdjyptreason2", "cbdjiangqixian", "zrdjiangqixian",
                   "d301a", "d301b", "d311a", "d311b", "d312a", "d312b", "d313a", "d313b", "d314a", "d314b",
                   "d315a", "d315b", "d316a", "d316b", "d317a", "d317b", "d319a", "d319b", "d320a", "d320b",
                   "d321a", "d321b", "d322a", "d322b", "d323a", "d323b", "d324a", "d324b", "d325a", "d325b",
                   "d328a", "d328b", "d329a", "d329b", "d331a", "d331b", "d335a", "d335b", "d336a", "d336b",
                   "d337a", "d337b", "d338a", "d338b", "d339a", "d339b", "d340a", "d340b", "d341a", "d341b"
                   "cbdpodu","zrdpodu","cbdturang","zrdturang","cbdyongtu201912","zrdyongtu201912",
                   "cbdyongtu202008","zrdyongtu202008","cbdjiangqixian","cbdqixian","zrdqixian","d301a","d301b","d336a","d336b",
                   "cbdpodu","zrdpodu","cbdturang","zrdturang","cbdyongtu201912","zrdyongtu201912","cbdyongtu202008",
                   "zrdyongtu202008","cbdjiangqixian","cbdqixian","zrdqixian","d301a","d301b","d331a","d336a","d339a"]

# 遍历每一列
for col in hu_data.columns:
    if  col not in no_selected_col:
        # 计算 z-score
        z_scores = zscore(hu_data[col])
        # 找到 z-score 绝对值大于 5.5 的索引
        col_indices = np.where(np.abs(z_scores) > 5.5)[0]
        # 统计异常值个数
        count = len(col_indices)
        if count > 0:
            outlier_counts[col] = count
            # 将异常值的索引添加到列表中
            outlier_indices.extend(col_indices)

# 去除重复的索引
outlier_indices = list(set(outlier_indices))

# 输出异常值个数
for col, count in outlier_counts.items():
    print(f"列 {col} 中的异常值个数: {count}")

# 将异常值替换为 NaN
for idx in outlier_indices:
    for col in hu_data.columns:
        if hu_data[col].nunique() > 1 and col not in no_selected_col:
            if np.abs(zscore(hu_data[col])[idx]) > 5.5:
                hu_data.at[idx, col] = np.nan


列 chengbaogd 中的异常值个数: 19
列 zcgdarea 中的异常值个数: 11
列 zcqiyearea 中的异常值个数: 48
列 zchezuoshearea 中的异常值个数: 34
列 zccunjitiarea 中的异常值个数: 24
列 zcnonghuarea 中的异常值个数: 11
列 zcareahetong 中的异常值个数: 12
列 zcareaqixian 中的异常值个数: 7
列 jitizcarea 中的异常值个数: 4
列 jitizcareacgb 中的异常值个数: 12
列 cychengbaogd 中的异常值个数: 24


# 处理缺失值

## 处理村数据缺失值

## 处理地块数据和户数据缺失值

# 输出预处理后文件

In [53]:
village_data

Unnamed: 0,zhandiarea,gengdiarea,quequanyear,quequanarea,jidongdiarea,chengbaodiarea,liuzhuandiarea,liangshiarea,liaohuangarea,nonghujyarea,...,totalliuzhuandiarea,totalzhuti500,totalzhuti200,totalzhuti100,totalzhuti50,liuzhuanlv,ratiozhuti500,ratiozhuti200,ratiozhuti100,ratiozhuti50
0,2092.00,2092.000,2018.0,2092.000,0.0,2092.000,1730.000,2092.000,0.0,362.00,...,235507.8,83034.23,59586.7,43680.92,39141,0.448463,0.158117,0.113467,0.083179,0.074534
1,5200.00,3367.000,1994.0,3367.000,120.0,3147.000,2661.000,1210.000,0.0,2225.00,...,235507.8,83034.23,59586.7,43680.92,39141,0.448463,0.158117,0.113467,0.083179,0.074534
2,7349.18,6400.000,2017.0,7112.830,,,4435.000,2100.000,,,...,235507.8,83034.23,59586.7,43680.92,39141,0.448463,0.158117,0.113467,0.083179,0.074534
3,832.00,832.000,2017.0,7.700,35.0,142.000,142.000,832.000,0.0,832.00,...,235507.8,83034.23,59586.7,43680.92,39141,0.448463,0.158117,0.113467,0.083179,0.074534
4,6979.50,4168.000,1998.0,2300.000,0.0,4168.000,4168.000,1516.000,300.0,1808.00,...,235507.8,83034.23,59586.7,43680.92,39141,0.448463,0.158117,0.113467,0.083179,0.074534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,21685.50,7100.000,,5500.000,4000.0,500.000,1000.000,4900.000,300.0,4900.00,...,235507.8,83034.23,59586.7,43680.92,39141,0.448463,0.158117,0.113467,0.083179,0.074534
110,990.00,5795.000,2016.0,1400.000,2195.0,3600.000,200.000,2200.000,0.0,5100.00,...,235507.8,83034.23,59586.7,43680.92,39141,0.448463,0.158117,0.113467,0.083179,0.074534
111,4821.00,4321.000,201610.0,3821.050,8.0,2052.000,2269.000,4321.000,0.0,2052.00,...,235507.8,83034.23,59586.7,43680.92,39141,0.448463,0.158117,0.113467,0.083179,0.074534
112,2782.00,2603.298,2016.0,1879.298,0.0,2603.298,1879.298,2603.298,0.0,1551.55,...,235507.8,83034.23,59586.7,43680.92,39141,0.448463,0.158117,0.113467,0.083179,0.074534


In [54]:
ground_data

Unnamed: 0,year,chengbaogd,chengbaogdks,jygdarea,jycbgdarea,jyczgdarea,jynhgdarea,jygdnum,jygdnum1,jygdnum5,...,totaljitizcareacgb,huliuzhuanlv,huliuzhuanqiye,huliuzhuanhzs,huliuzhuancjt,huliuzhuannh,hetonglv,qixianlv,jitilv,jiticgblv
0,2021,3.25,1.0,140.0,5.25,0.0,134.75,20.0,0.0,8.0,...,10732.82,0.473799,0.080269,0.044664,0.280577,0.582854,0.788508,0.595934,0.780118,0.693838
1,2021,6.00,1.0,450.0,6.00,0.0,444.00,100.0,0.0,50.0,...,10732.82,0.473799,0.080269,0.044664,0.280577,0.582854,0.788508,0.595934,0.780118,0.693838
2,2020,19.00,7.0,160.0,19.00,0.0,141.00,11.0,0.0,11.0,...,10732.82,0.473799,0.080269,0.044664,0.280577,0.582854,0.788508,0.595934,0.780118,0.693838
3,2020,5.70,3.0,3.4,2.40,0.0,1.00,4.0,1.0,0.0,...,10732.82,0.473799,0.080269,0.044664,0.280577,0.582854,0.788508,0.595934,0.780118,0.693838
4,2021,,2.0,600.0,600.00,0.0,0.00,2.0,0.0,0.0,...,10732.82,0.473799,0.080269,0.044664,0.280577,0.582854,0.788508,0.595934,0.780118,0.693838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,2020,0.80,1.0,2.0,0.80,0.0,1.20,3.0,2.0,0.0,...,10732.82,0.473799,0.080269,0.044664,0.280577,0.582854,0.788508,0.595934,0.780118,0.693838
939,2020,1.00,3.0,4.0,1.00,0.0,3.00,5.0,4.0,0.0,...,10732.82,0.473799,0.080269,0.044664,0.280577,0.582854,0.788508,0.595934,0.780118,0.693838
940,2020,3.40,4.0,0.0,,,,0.0,,,...,10732.82,0.473799,0.080269,0.044664,0.280577,0.582854,0.788508,0.595934,0.780118,0.693838
941,2020,2.00,2.0,22.0,2.00,0.0,20.00,5.0,1.0,2.0,...,10732.82,0.473799,0.080269,0.044664,0.280577,0.582854,0.788508,0.595934,0.780118,0.693838


In [55]:
hu_data

Unnamed: 0,year,chengbaogd,chengbaogdks,jygdarea,jycbgdarea,jyczgdarea,jynhgdarea,jygdnum,jygdnum1,jygdnum5,...,totaljitizcareacgb,huliuzhuanlv,huliuzhuanqiye,huliuzhuanhzs,huliuzhuancjt,huliuzhuannh,hetonglv,qixianlv,jitilv,jiticgblv
0,2021,3.25,1.0,140.00,5.25,0.0,134.75,20.0,0.0,8.0,...,10732.82,0.473799,0.080269,0.044664,0.280577,0.582854,0.788508,0.595934,0.780118,0.693838
1,2021,8.00,1.0,0.00,0.00,0.0,0.00,1.0,0.0,1.0,...,10732.82,0.473799,0.080269,0.044664,0.280577,0.582854,0.788508,0.595934,0.780118,0.693838
2,2021,6.00,1.0,450.00,6.00,0.0,444.00,100.0,0.0,50.0,...,10732.82,0.473799,0.080269,0.044664,0.280577,0.582854,0.788508,0.595934,0.780118,0.693838
3,2021,2.00,1.0,0.00,0.00,0.0,0.00,1.0,1.0,0.0,...,10732.82,0.473799,0.080269,0.044664,0.280577,0.582854,0.788508,0.595934,0.780118,0.693838
4,2021,6.35,6.0,6.85,6.85,0.0,0.00,6.0,0.0,0.0,...,10732.82,0.473799,0.080269,0.044664,0.280577,0.582854,0.788508,0.595934,0.780118,0.693838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5918,2020,3.00,2.0,3.00,3.00,0.0,0.00,2.0,0.0,0.0,...,10732.82,0.473799,0.080269,0.044664,0.280577,0.582854,0.788508,0.595934,0.780118,0.693838
5919,2020,9.00,1.0,0.00,,,,0.0,,,...,10732.82,0.473799,0.080269,0.044664,0.280577,0.582854,0.788508,0.595934,0.780118,0.693838
5920,2020,9.00,1.0,0.00,0.00,0.0,0.00,0.0,0.0,0.0,...,10732.82,0.473799,0.080269,0.044664,0.280577,0.582854,0.788508,0.595934,0.780118,0.693838
5921,2020,3.00,1.0,3.00,3.00,0.0,0.00,1.0,0.0,0.0,...,10732.82,0.473799,0.080269,0.044664,0.280577,0.582854,0.788508,0.595934,0.780118,0.693838


In [52]:
# 输出预处理后数据
village_data.to_csv(r"C:\Users\12045\Desktop\村数据(预处理后).csv", index=False)
ground_data.to_csv(r"C:\Users\12045\Desktop\地块数据(预处理后).csv", index=False)
hu_data.to_csv(r"C:\Users\12045\Desktop\户数据(预处理后).csv", index=False)