In [1]:
# !pip install ucimlrepo
# 参考1：https://archive.ics.uci.edu/dataset/20/census+income
# 参考2：https://github.com/uci-ml-repo/ucimlrepo

In [2]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
census_income = fetch_ucirepo(id=20) 
  
# data (as pandas dataframes) 
data = census_income.data.original
print(data.shape)

(48842, 15)


In [3]:
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48838,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [4]:
# # metadata 
# census_income.metadata

In [5]:
# variable information 
census_income.variables

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,age,Feature,Integer,Age,,,no
1,workclass,Feature,Categorical,Income,"Private, Self-emp-not-inc, Self-emp-inc, Feder...",,yes
2,fnlwgt,Feature,Integer,,,,no
3,education,Feature,Categorical,Education Level,"Bachelors, Some-college, 11th, HS-grad, Prof-...",,no
4,education-num,Feature,Integer,Education Level,,,no
5,marital-status,Feature,Categorical,Other,"Married-civ-spouse, Divorced, Never-married, S...",,no
6,occupation,Feature,Categorical,Other,"Tech-support, Craft-repair, Other-service, Sal...",,yes
7,relationship,Feature,Categorical,Other,"Wife, Own-child, Husband, Not-in-family, Other...",,no
8,race,Feature,Categorical,Race,"White, Asian-Pac-Islander, Amer-Indian-Eskimo,...",,no
9,sex,Feature,Binary,Sex,"Female, Male.",,no


fnlwgt (final weight) 是一种用于调整人口样本代表性的权重值。它的计算基于人口普查局的多个控制变量（年龄、性别、种族、州等）， 
通过多次迭代调整来确保数据的代表性和准确性。需要注意的是，fnlwgt 的相似性仅在同一州内有效，因为各州的采样概率不同，跨州之间的权重不可直接比较。

In [6]:
# Start a local Spark application named "mmoe" and expose `sc`, `sqlContext` and `spark`.
# NOTE(review): the original comment said "Uncomment the following lines if you are using
# Windows!", but the findspark lines below are already active, so findspark has to be
# installed on every platform — confirm whether they were meant to be optional.
import findspark
findspark.init()
findspark.find()  # return value is not used

import pyspark

from pyspark.sql import SparkSession
from pyspark import SparkContext, SQLContext
from pyspark.sql.functions import when, col

appName = "mmoe"
master = "local"

# Build the Spark configuration: driver host pinned to 127.0.0.1, plus app name and master.
conf = pyspark.SparkConf()\
    .set('spark.driver.host','127.0.0.1')\
    .setAppName(appName)\
    .setMaster(master)

# Create (or reuse) the Spark context with this configuration instead of the default one.
sc = SparkContext.getOrCreate(conf=conf)

# SQL context wrapped around the Spark context.
# NOTE(review): SQLContext is a legacy entry point in Spark 3.x; SparkSession is the
# recommended replacement — confirm whether `sqlContext` is still needed.
sqlContext = SQLContext(sc)

# Obtain the SparkSession through the SQL context's session builder.
spark = sqlContext.sparkSession.builder.getOrCreate()



In [7]:
df = spark.createDataFrame(data)
df.show(2, vertical = True)

-RECORD 0----------------------------
 age            | 39                 
 workclass      | State-gov          
 fnlwgt         | 77516              
 education      | Bachelors          
 education-num  | 13                 
 marital-status | Never-married      
 occupation     | Adm-clerical       
 relationship   | Not-in-family      
 race           | White              
 sex            | Male               
 capital-gain   | 2174               
 capital-loss   | 0                  
 hours-per-week | 40                 
 native-country | United-States      
 income         | <=50K              
-RECORD 1----------------------------
 age            | 50                 
 workclass      | Self-emp-not-inc   
 fnlwgt         | 83311              
 education      | Bachelors          
 education-num  | 13                 
 marital-status | Married-civ-spouse 
 occupation     | Exec-managerial    
 relationship   | Husband            
 race           | White              
 sex        

In [8]:
df.schema

StructType([StructField('age', LongType(), True), StructField('workclass', StringType(), True), StructField('fnlwgt', LongType(), True), StructField('education', StringType(), True), StructField('education-num', LongType(), True), StructField('marital-status', StringType(), True), StructField('occupation', StringType(), True), StructField('relationship', StringType(), True), StructField('race', StringType(), True), StructField('sex', StringType(), True), StructField('capital-gain', LongType(), True), StructField('capital-loss', LongType(), True), StructField('hours-per-week', LongType(), True), StructField('native-country', StringType(), True), StructField('income', StringType(), True)])

根据census_income.variables中missing_values这一列可知: workclass, occupation和native-country这三列存在缺失值，我们首先删除有缺失值的行

In [9]:
def check_distinct_values(col_name, frame=None):
    """Print and return the distinct values found in one column.

    Args:
        col_name: Name of the column to inspect.
        frame: DataFrame to read from. When omitted, the notebook-level ``df``
            as it exists at call time is used, which is what existing calls
            rely on.

    Returns:
        list: The distinct values of ``col_name``, in the order returned by
        ``distinct().collect()``.
    """
    # Fall back to the global only when no frame is given, so the helper can
    # also be used on intermediate frames without reassigning `df`.
    source = df if frame is None else frame
    values = [row[col_name] for row in source.select(col_name).distinct().collect()]
    print(values)
    return values

In [10]:
check_distinct_values("workclass")
check_distinct_values("occupation")
check_distinct_values("native-country")

['Self-emp-not-inc', 'Local-gov', 'State-gov', 'Private', 'Without-pay', 'Federal-gov', 'Never-worked', 'NaN', '?', 'Self-emp-inc']
['Sales', 'Exec-managerial', 'Prof-specialty', 'Handlers-cleaners', 'Farming-fishing', 'Craft-repair', 'Transport-moving', 'Priv-house-serv', 'Protective-serv', 'Other-service', 'NaN', 'Tech-support', 'Machine-op-inspct', 'Armed-Forces', '?', 'Adm-clerical']
['Philippines', 'Germany', 'Cambodia', 'France', 'Greece', 'Taiwan', 'Ecuador', 'Nicaragua', 'Hong', 'Peru', 'India', 'China', 'Italy', 'Holand-Netherlands', 'Cuba', 'South', 'Iran', 'Ireland', 'Thailand', 'Laos', 'El-Salvador', 'Mexico', 'Guatemala', 'Honduras', 'Yugoslavia', 'NaN', 'Puerto-Rico', 'Jamaica', 'Canada', 'United-States', 'Dominican-Republic', 'Outlying-US(Guam-USVI-etc)', 'Japan', 'England', 'Haiti', 'Poland', 'Portugal', '?', 'Columbia', 'Scotland', 'Hungary', 'Vietnam', 'Trinadad&Tobago']


['Philippines',
 'Germany',
 'Cambodia',
 'France',
 'Greece',
 'Taiwan',
 'Ecuador',
 'Nicaragua',
 'Hong',
 'Peru',
 'India',
 'China',
 'Italy',
 'Holand-Netherlands',
 'Cuba',
 'South',
 'Iran',
 'Ireland',
 'Thailand',
 'Laos',
 'El-Salvador',
 'Mexico',
 'Guatemala',
 'Honduras',
 'Yugoslavia',
 'NaN',
 'Puerto-Rico',
 'Jamaica',
 'Canada',
 'United-States',
 'Dominican-Republic',
 'Outlying-US(Guam-USVI-etc)',
 'Japan',
 'England',
 'Haiti',
 'Poland',
 'Portugal',
 '?',
 'Columbia',
 'Scotland',
 'Hungary',
 'Vietnam',
 'Trinadad&Tobago']

发现缺失值以"?"和"NaN"形式存在

In [11]:
# Map both missing-value markers ("NaN" and "?") to real nulls, then re-check
# one affected column to confirm they now show up as None.
missing_markers = {"NaN": None, "?": None}
df = df.replace(missing_markers)
check_distinct_values("workclass")

['Self-emp-not-inc', None, 'Local-gov', 'State-gov', 'Private', 'Without-pay', 'Federal-gov', 'Never-worked', 'Self-emp-inc']


['Self-emp-not-inc',
 None,
 'Local-gov',
 'State-gov',
 'Private',
 'Without-pay',
 'Federal-gov',
 'Never-worked',
 'Self-emp-inc']

In [12]:
# Drop every row that contains a null in any column
df = df.na.drop()
print(df.count()) # row count goes from 48842 down to 45222

45222


接下来开始处理每一列的数据类型

In [13]:
df.summary().show(truncate=False, vertical=True)

-RECORD 0----------------------------
 summary        | count              
 age            | 45222              
 workclass      | 45222              
 fnlwgt         | 45222              
 education      | 45222              
 education-num  | 45222              
 marital-status | 45222              
 occupation     | 45222              
 relationship   | 45222              
 race           | 45222              
 sex            | 45222              
 capital-gain   | 45222              
 capital-loss   | 45222              
 hours-per-week | 45222              
 native-country | 45222              
 income         | 45222              
-RECORD 1----------------------------
 summary        | mean               
 age            | 38.547941267524656 
 workclass      | null               
 fnlwgt         | 189734.7343107337  
 education      | null               
 education-num  | 10.118460041572686 
 marital-status | null               
 occupation     | null               
 relationshi

In [14]:
# The string-typed categorical columns — "workclass", "education", 'marital-status', 'occupation',
# 'relationship', 'race', 'sex', 'native-country', 'income' — still have to be converted to numbers.
df.dtypes

[('age', 'bigint'),
 ('workclass', 'string'),
 ('fnlwgt', 'bigint'),
 ('education', 'string'),
 ('education-num', 'bigint'),
 ('marital-status', 'string'),
 ('occupation', 'string'),
 ('relationship', 'string'),
 ('race', 'string'),
 ('sex', 'string'),
 ('capital-gain', 'bigint'),
 ('capital-loss', 'bigint'),
 ('hours-per-week', 'bigint'),
 ('native-country', 'string'),
 ('income', 'string')]

In [15]:
# 'sex' and 'income' are binary variables, so they are recoded as 0/1.
# For 'sex': Female -> 0, Male -> 1.
check_distinct_values("sex")
sex_column = col("sex")
sex_code = when(sex_column == "Female", 0).when(sex_column == "Male", 1)
df = df.withColumn("sex", sex_code.cast("bigint"))  # store the code as bigint
check_distinct_values("sex")

['Female', 'Male']
[0, 1]


[0, 1]

In [16]:
# income: 1 means >50K, 0 means <=50K.
# Each label occurs both with and without a trailing '.', so both spellings are matched.
check_distinct_values("income")
income_column = col("income")
is_low_income = income_column.isin("<=50K", "<=50K.")
is_high_income = income_column.isin(">50K", ">50K.")
income_code = when(is_low_income, 0).when(is_high_income, 1)
df = df.withColumn("income", income_code.cast("bigint"))  # store the code as bigint
check_distinct_values("income")

['<=50K.', '<=50K', '>50K', '>50K.']
[0, 1]


[0, 1]

In [17]:
# The prediction target is "whether this person's marital status is never married",
# so marital-status is treated as a binary variable as well.
# Encoding used here: 'Never-married' -> 0, every other status -> 1.
check_distinct_values("marital-status")
is_never_married = col("marital-status") == 'Never-married'
marital_code = when(is_never_married, 0).otherwise(1)
df = df.withColumn("marital-status", marital_code.cast("bigint"))  # store the code as bigint
check_distinct_values("marital-status")

['Separated', 'Never-married', 'Married-spouse-absent', 'Divorced', 'Widowed', 'Married-AF-spouse', 'Married-civ-spouse']
[0, 1]


[0, 1]

In [18]:
# education-num is already present, so the string column 'education' is simply dropped
df=df.drop("education")
check_distinct_values("education-num")

[7, 6, 9, 5, 1, 10, 3, 12, 8, 11, 2, 4, 13, 14, 15, 16]


[7, 6, 9, 5, 1, 10, 3, 12, 8, 11, 2, 4, 13, 14, 15, 16]

In [19]:
# Remaining categorical columns still stored as strings: print the distinct
# values of each one followed by how many there are.
for categorical_column in ("workclass", "occupation", "relationship", "race", "native-country"):
    print(len(check_distinct_values(categorical_column)))

['Self-emp-not-inc', 'Local-gov', 'State-gov', 'Private', 'Without-pay', 'Federal-gov', 'Self-emp-inc']
7
['Sales', 'Exec-managerial', 'Prof-specialty', 'Handlers-cleaners', 'Farming-fishing', 'Craft-repair', 'Transport-moving', 'Priv-house-serv', 'Protective-serv', 'Other-service', 'Tech-support', 'Machine-op-inspct', 'Armed-Forces', 'Adm-clerical']
14
['Own-child', 'Not-in-family', 'Unmarried', 'Wife', 'Other-relative', 'Husband']
6
['Other', 'Amer-Indian-Eskimo', 'White', 'Asian-Pac-Islander', 'Black']
5
['Philippines', 'Germany', 'Cambodia', 'France', 'Greece', 'Taiwan', 'Ecuador', 'Nicaragua', 'Hong', 'Peru', 'India', 'China', 'Italy', 'Holand-Netherlands', 'Cuba', 'South', 'Iran', 'Ireland', 'Thailand', 'Laos', 'El-Salvador', 'Mexico', 'Guatemala', 'Honduras', 'Yugoslavia', 'Puerto-Rico', 'Jamaica', 'Canada', 'United-States', 'Dominican-Republic', 'Outlying-US(Guam-USVI-etc)', 'Japan', 'England', 'Haiti', 'Poland', 'Portugal', 'Columbia', 'Scotland', 'Hungary', 'Vietnam', 'Trinad

In [20]:
from pyspark.ml.feature import StringIndexer, StringIndexerModel
from pyspark.ml import Pipeline

# None of these columns is ordinal (they are all nominal), so each one gets its own
# StringIndexer that writes a numeric "<column>_index" column.
categorical_columns = ['workclass', 'occupation', 'relationship', 'race', 'native-country']
stage_1, stage_2, stage_3, stage_4, stage_5 = (
    StringIndexer(inputCol=name, outputCol=f'{name}_index')
    for name in categorical_columns
)

# Chain the five indexers into a single pipeline
pipeline = Pipeline(stages=[stage_1, stage_2, stage_3, stage_4, stage_5])

# Fit the pipeline on df, then apply it so df gains the *_index columns
pipeline_model = pipeline.fit(df)
df = pipeline_model.transform(df)

In [21]:
def get_mapping(pipeline_model):
    """Print the label -> index table of each StringIndexerModel stage.

    Stages of ``pipeline_model`` that are not ``StringIndexerModel`` instances
    are skipped. The function only prints; it returns nothing.
    """
    indexer_stages = (
        candidate for candidate in pipeline_model.stages
        if isinstance(candidate, StringIndexerModel)
    )
    for indexer in indexer_stages:
        print(f"Mapping for {indexer.getInputCol()} -> {indexer.getOutputCol()}:")
        # A label's index is its position in the fitted `labels` list.
        for index, label in enumerate(indexer.labels):
            print(f"  {label} -> {index}")

In [22]:
# 打印对应关系
get_mapping(pipeline_model)

Mapping for workclass -> workclass_index:
  Private -> 0
  Self-emp-not-inc -> 1
  Local-gov -> 2
  State-gov -> 3
  Self-emp-inc -> 4
  Federal-gov -> 5
  Without-pay -> 6
Mapping for occupation -> occupation_index:
  Craft-repair -> 0
  Prof-specialty -> 1
  Exec-managerial -> 2
  Adm-clerical -> 3
  Sales -> 4
  Other-service -> 5
  Machine-op-inspct -> 6
  Transport-moving -> 7
  Handlers-cleaners -> 8
  Farming-fishing -> 9
  Tech-support -> 10
  Protective-serv -> 11
  Priv-house-serv -> 12
  Armed-Forces -> 13
Mapping for relationship -> relationship_index:
  Husband -> 0
  Not-in-family -> 1
  Own-child -> 2
  Unmarried -> 3
  Wife -> 4
  Other-relative -> 5
Mapping for race -> race_index:
  White -> 0
  Black -> 1
  Asian-Pac-Islander -> 2
  Amer-Indian-Eskimo -> 3
  Other -> 4
Mapping for native-country -> native-country_index:
  United-States -> 0
  Mexico -> 1
  Philippines -> 2
  Germany -> 3
  Puerto-Rico -> 4
  Canada -> 5
  El-Salvador -> 6
  India -> 7
  Cuba -> 8
  En

In [23]:
df=df.drop("workclass", 'occupation', 'relationship', 'race', 'native-country')
df.show(1,vertical=True)

-RECORD 0---------------------
 age                  | 39    
 fnlwgt               | 77516 
 education-num        | 13    
 marital-status       | 0     
 sex                  | 1     
 capital-gain         | 2174  
 capital-loss         | 0     
 hours-per-week       | 40    
 income               | 0     
 workclass_index      | 3.0   
 occupation_index     | 3.0   
 relationship_index   | 1.0   
 race_index           | 0.0   
 native-country_index | 0.0   
only showing top 1 row



In [24]:
# Move the five continuous features to the end so they are easy to handle together
# (age is treated as continuous for now). Columns are picked by name rather than by
# position, so re-running this cell or changing the upstream column order cannot
# silently shuffle the wrong columns; a missing column makes select() fail instead.
continuous_columns = ["age", "fnlwgt", "capital-gain", "capital-loss", "hours-per-week"]
columns = df.columns
columns_reordered = [name for name in columns if name not in continuous_columns] + continuous_columns
df = df.select(columns_reordered)
df.show(1,vertical=True)

-RECORD 0---------------------
 education-num        | 13    
 marital-status       | 0     
 sex                  | 1     
 income               | 0     
 workclass_index      | 3.0   
 occupation_index     | 3.0   
 relationship_index   | 1.0   
 race_index           | 0.0   
 native-country_index | 0.0   
 age                  | 39    
 fnlwgt               | 77516 
 capital-gain         | 2174  
 capital-loss         | 0     
 hours-per-week       | 40    
only showing top 1 row



拼接所有特征变为一个向量

In [25]:
from copy import deepcopy

feature_list = df.columns
print("feature_list:", feature_list)

# Experiment 1: every column except "income" and "marital-status"
feature_list1 = list(feature_list)
for excluded in ("income", "marital-status"):
    feature_list1.remove(excluded)
print("feature_list for exp1:", feature_list1)

# Experiment 2: every column except "education-num" and "marital-status"
feature_list2 = list(feature_list)
for excluded in ("education-num", "marital-status"):
    feature_list2.remove(excluded)
print("feature_list for exp2:", feature_list2)

feature_list: ['education-num', 'marital-status', 'sex', 'income', 'workclass_index', 'occupation_index', 'relationship_index', 'race_index', 'native-country_index', 'age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
feature_list for exp1: ['education-num', 'sex', 'workclass_index', 'occupation_index', 'relationship_index', 'race_index', 'native-country_index', 'age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
feature_list for exp2: ['sex', 'income', 'workclass_index', 'occupation_index', 'relationship_index', 'race_index', 'native-country_index', 'age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']


In [26]:
from pyspark.ml.feature import VectorAssembler

# Assemble one feature-vector column per experiment from its feature list
assembler_specs = (
    (feature_list1, "vectorized_features_1"),
    (feature_list2, "vectorized_features_2"),
)
for input_columns, output_column in assembler_specs:
    vector_assembler = VectorAssembler(inputCols=input_columns, outputCol=output_column)
    df = vector_assembler.transform(df)

In [27]:
from pyspark.sql.functions import udf
from pyspark.ml.linalg import DenseVector, SparseVector
from pyspark.sql.types import ArrayType, DoubleType

# Convert SparseVector / DenseVector values into plain Python lists
def convert_vector_to_list(vector):
    """Return a DenseVector or SparseVector as a plain Python list.

    Any value that is neither of the two vector types is returned unchanged.
    """
    if not isinstance(vector, (DenseVector, SparseVector)):
        return vector  # not a vector: pass through as-is
    return vector.toArray().tolist()

# Wrap the converter as a UDF whose declared return type is an array of doubles
convert_vector_udf = udf(convert_vector_to_list, ArrayType(DoubleType()))

# Overwrite both assembled vector columns with their list form
for vector_column in ("vectorized_features_1", "vectorized_features_2"):
    df = df.withColumn(vector_column, convert_vector_udf(df[vector_column]))

In [28]:
# Experiment 1 frame: the feature list plus the "income" and "marital-status" columns.
# The feature values are floats because the UDF above returns an array of doubles;
# the two label columns are still displayed as integers after toPandas().
exp1_columns = ["vectorized_features_1", "income", "marital-status"]
panda_df_1 = df.select(*exp1_columns).toPandas()
print(panda_df_1.iloc[0, 0])
panda_df_1

[13.0, 1.0, 3.0, 3.0, 1.0, 0.0, 0.0, 39.0, 77516.0, 2174.0, 0.0, 40.0]


Unnamed: 0,vectorized_features_1,income,marital-status
0,"[13.0, 1.0, 3.0, 3.0, 1.0, 0.0, 0.0, 39.0, 775...",0,0
1,"[13.0, 1.0, 1.0, 2.0, 0.0, 0.0, 0.0, 50.0, 833...",0,1
2,"[9.0, 1.0, 0.0, 8.0, 1.0, 0.0, 0.0, 38.0, 2156...",0,1
3,"[7.0, 1.0, 0.0, 8.0, 0.0, 1.0, 0.0, 53.0, 2347...",0,1
4,"[13.0, 0.0, 0.0, 1.0, 4.0, 1.0, 8.0, 28.0, 338...",0,1
...,...,...,...
45217,"[13.0, 1.0, 0.0, 1.0, 2.0, 0.0, 0.0, 33.0, 245...",0,0
45218,"[13.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 39.0, 215...",0,1
45219,"[13.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 38.0, 374...",0,1
45220,"[13.0, 1.0, 0.0, 3.0, 2.0, 2.0, 0.0, 44.0, 838...",0,1


In [29]:
# Experiment 2 frame: the feature list plus the "education-num" and "marital-status" columns.
exp2_columns = ["vectorized_features_2", "education-num", "marital-status"]
panda_df_2 = df.select(*exp2_columns).toPandas()
print(panda_df_2.iloc[0, 0])
panda_df_2

[1.0, 0.0, 3.0, 3.0, 1.0, 0.0, 0.0, 39.0, 77516.0, 2174.0, 0.0, 40.0]


Unnamed: 0,vectorized_features_2,education-num,marital-status
0,"[1.0, 0.0, 3.0, 3.0, 1.0, 0.0, 0.0, 39.0, 7751...",13,0
1,"[1.0, 0.0, 1.0, 2.0, 0.0, 0.0, 0.0, 50.0, 8331...",13,1
2,"[1.0, 0.0, 0.0, 8.0, 1.0, 0.0, 0.0, 38.0, 2156...",9,1
3,"[1.0, 0.0, 0.0, 8.0, 0.0, 1.0, 0.0, 53.0, 2347...",7,1
4,"[0.0, 0.0, 0.0, 1.0, 4.0, 1.0, 8.0, 28.0, 3384...",13,1
...,...,...,...
45217,"[1.0, 0.0, 0.0, 1.0, 2.0, 0.0, 0.0, 33.0, 2452...",13,0
45218,"[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 39.0, 2154...",13,1
45219,"[1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 38.0, 3749...",13,1
45220,"[1.0, 0.0, 0.0, 3.0, 2.0, 2.0, 0.0, 44.0, 8389...",13,1


In [30]:
import os

# Make sure the output directory of each experiment exists.
# exist_ok=True creates the directory only when it is missing, without a separate
# exists() check that could race with another process creating it in between.
for output_dir in ("data/exp1", "data/exp2"):
    os.makedirs(output_dir, exist_ok=True)

In [31]:
# Write each experiment's full frame to its raw.csv (no index column).
# NOTE(review): the original comment described a 30000 / 7500 / 7500
# train / validation / test split, but no split is performed in this cell —
# presumably it happens downstream from raw.csv; confirm.
csv_targets = (
    (panda_df_1, "data/exp1/raw.csv"),
    (panda_df_2, "data/exp2/raw.csv"),
)
for frame, csv_path in csv_targets:
    frame.to_csv(csv_path, index=False)

In [32]:
df.columns

['education-num',
 'marital-status',
 'sex',
 'income',
 'workclass_index',
 'occupation_index',
 'relationship_index',
 'race_index',
 'native-country_index',
 'age',
 'fnlwgt',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'vectorized_features_1',
 'vectorized_features_2']