## step1：导入必要的库文件

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score
import pandas as pd

## step2：确定文件路径
指定Income.data文件。

In [None]:
# Location of the raw training data, resolved relative to the working directory.
train_file_path = "Income.data"

## step3：为文件添加表头
指定Income.data文件的表头（复制自process_data.py）。

In [None]:
# Column names for Income.data, in file order; the last column ("label") is the
# target that the later cells split off from the features.
header = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "label",
]

## step4：变量划分并预处理
将Income.data文件读入，并消除前导空格。
若文件中出现缺失值标记（"?"），则用pd.NA代替。
求出含有缺失值的行数。
若未超过总数的10%，则删去含有缺失值的行，否则用平均值填充。
按是否为label列（即答案）将训练集划分为自变量（train_X）和因变量（train_Y）。

In [None]:
# Load the raw CSV. Column names are supplied through `header`, and
# skipinitialspace=True strips the whitespace that follows each delimiter so
# that fields compare equal to bare values such as "?".
train_data = pd.read_csv(train_file_path, names=header, delimiter=',', skipinitialspace=True)

# "?" marks a missing value in this file; turn it into a real NA so the pandas
# missing-data helpers below can see it.
train_data.replace("?", pd.NA, inplace=True)

# Number of rows that contain at least one missing value.
missing_values_count = int(train_data.isnull().any(axis=1).sum())

# The threshold is a share of ROWS. DataFrame.size is rows * columns, so dividing
# by it would understate the share by a factor equal to the column count.
# Multiplying instead of dividing also avoids a ZeroDivisionError on an empty frame.
if missing_values_count <= 0.1 * len(train_data):
    print("未超过总数的10%，则删去含有缺失值的行")
    train_data.dropna(inplace=True)
else:
    # A mean only exists for numeric columns; asking for the mean of the text
    # columns as well raises TypeError on current pandas.
    numeric_cols = train_data.select_dtypes(include="number").columns
    train_data[numeric_cols] = train_data[numeric_cols].fillna(train_data[numeric_cols].mean())
    # Text columns cannot be mean-filled, so use each column's most frequent value.
    for col in train_data.columns.difference(numeric_cols):
        modes = train_data[col].mode()
        if not modes.empty:
            train_data[col] = train_data[col].fillna(modes.iloc[0])
    print("超过总数的10%，则删用平均值填充")

# Split into features (every column except "label") and the target column.
train_X = train_data.drop("label", axis=1)
train_Y = train_data["label"]

## step5：数据处理
将数据处理为可以训练的形式。
对因变量（train_Y），取值为">50K"的样本设为True，取值为"<=50K"的样本设为False。

In [None]:
# Show the raw label column before it is encoded.
print(train_Y)
# Boolean target: True where the label is exactly ">50K", False for anything else.
train_Y = train_Y.eq(">50K")
# Class balance after encoding.
print(train_Y.value_counts())

对自变量（train_X）进行处理。

In [None]:
# Collect the object-dtype columns and expand each of them into one indicator
# column per distinct value; the remaining columns pass through unchanged.
object_frame = train_X.select_dtypes(include='object')
categorical_cols = object_frame.columns
train_X = pd.get_dummies(train_X, columns=categorical_cols)
print(train_X)

## step6：训练模型
将训练集的自变量和因变量输入给model，调用fit函数进行模型训练。

In [None]:
# Fit a logistic regression with default settings on the encoded training data.
model = LogisticRegression()
model.fit(train_X, train_Y)
# fit() returns the estimator itself, so both names refer to the same fitted model.
result_model = model

## step7：发现关系和刻画
读取模型的coef_属性，获取各特征的系数（coefficients）。

In [None]:
# Fetch the fitted coefficients from the model.
coefficients = model.coef_

# Use the DataFrame's column labels when they exist; otherwise fall back to
# positional placeholder names, one per entry of the first row.
if hasattr(train_X, 'columns'):
    feature_names = train_X.columns
else:
    feature_names = [f'feature_{i}' for i in range(len(train_X[0]))]

# Print one "name: coefficient" line per feature, using the first coefficient row.
first_row = coefficients[0]
for feature, coefficient in zip(feature_names, first_row):
    print(f"{feature}: {coefficient}")

获取模型的summary（摘要），进行模型刻画。注意：scikit-learn的LogisticRegression本身不提供summary()方法。

In [None]:
# Build a model summary.
# scikit-learn's LogisticRegression has no summary() method (that is a statsmodels
# API), so calling result_model.summary() raises AttributeError. The summary is
# assembled here from the fitted attributes and the metric functions that the
# notebook already imports.

# Predictions on the training data: hard labels, plus the probability of the
# second class in classes_ (True for a boolean target) for the ROC AUC.
train_pred = result_model.predict(train_X)
train_score = result_model.predict_proba(train_X)[:, 1]

# Cast to 0/1 so the metric functions see an unambiguous positive label of 1.
y_true = train_Y.astype(int)
y_pred = train_pred.astype(int)

# These scores are measured on the data the model was fitted on, so they
# describe the fit and are not an estimate of out-of-sample performance.
fit_metrics = pd.Series({
    "n_samples": len(train_X),
    "n_features": train_X.shape[1],
    "intercept": result_model.intercept_[0],
    "accuracy (train)": accuracy_score(y_true, y_pred),
    "recall (train)": recall_score(y_true, y_pred),
    "f1 (train)": f1_score(y_true, y_pred),
    "roc_auc (train)": roc_auc_score(y_true, train_score),
})

# One row per feature, largest absolute coefficient first.
coefficient_table = pd.DataFrame({
    "feature": list(train_X.columns),
    "coefficient": result_model.coef_[0],
})
by_magnitude = coefficient_table["coefficient"].abs().sort_values(ascending=False).index
coefficient_table = coefficient_table.loc[by_magnitude].reset_index(drop=True)

model_summary = fit_metrics.to_string() + "\n\n" + coefficient_table.to_string(index=False)
print(model_summary)