In [149]:
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np
import joblib
from sklearn.model_selection import train_test_split  
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

from sklearn.metrics import confusion_matrix

### 读取数据

In [137]:
# 获得特征
def get_data(filename):
    with open(filename, 'r') as f:
        X = f.read().split('\n')
        X = X[:-1]
        X_npy = np.ones([len(X), 10])
        for i, x in enumerate(X): # 遍历每一行
            for j, h in enumerate(x.split()[1:]): # 遍历每一行中每一列
                X_npy[i][j] = float(h)
    return X_npy

# 获得标签
def get_label(filename):
    with open(filename, 'r') as f:
        y = f.read().split('\n')
        y = y[:-1]
        y_npy = np.ones([len(y), ])
        for i, y_i in enumerate(y): # 遍历每一行
            y_npy[i] = int(y_i.split()[1])
    return y_npy

# 读取数据
train_file = '../data/trainset/train_para_input.txt'
train_label_file = '../data/trainset/train_output.txt'
test_file = '../data/testset/test_para_input.txt'
X_train = get_data(train_file)
y_train = get_label(train_label_file)
X_test = get_data(test_file)

### 缺失值处理

In [138]:
# 均值填充
def fill_data_mean(X_train, X_test):
    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)
    return X_train, X_test

# 中位值填充
def fill_data_median(X_train, X_test):
    imputer = SimpleImputer(strategy='median')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)
    return X_train, X_test

# 众数填充
def fill_data_most(X_train, X_test):
    imputer = SimpleImputer(strategy='most_frequent')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)
    return X_train, X_test

# 取零填充
def fill_data_zero(X_train, X_test):
    imputer = SimpleImputer(strategy='constant', fill_value=0)
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)
    return X_train, X_test

# KNN 缺失值填充
def fill_data_knn(X_train, X_test):
    imputer = KNNImputer()
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)
    return X_train, X_test

# 缺失值填充
X_train_mean, X_test_mean = fill_data_mean(X_train, X_test)
X_train_median, X_test_median = fill_data_median(X_train, X_test)
X_train_most, X_test_most = fill_data_most(X_train, X_test) 
X_train_zero, X_test_zero = fill_data_zero(X_train, X_test)
X_train_knn, X_test_knn = fill_data_knn(X_train, X_test)

In [150]:
# 模型训练及评价
def lr_cf(X, y):
    # 拆分数据集
    # train_size = int(X.shape[0]*0.8)
    # X_train = X[:train_size]
    # X_test = X[train_size:]
    # y_train = y[:train_size]
    # y_test = y[train_size:]
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=99, shuffle=True)
    # 模型训练
    lr = LogisticRegression(random_state=0)
    lr.fit(X_train, y_train)
    pre = lr.predict(X_test)
    # 模型评估
    matrix = confusion_matrix(y_test, pre)
    H = matrix[0][0]
    M = matrix[0][1]
    F = matrix[1][0]
    CN = matrix[1][1]
    recall = H / (H + M)
    precision = H / (H + F)
    f1 = 2 * precision * recall / (precision + recall)
    return f1

In [151]:
# lr_cf(X_train_mean, y_train)
# lr_cf(X_train_median, y_train)
lr_cf(X_train_most, y_train)
# lr_cf(X_train_zero, y_train)
# lr_cf(X_train_knn, y_train)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9755930167717584

In [147]:
X_train_mean

array([[2.0259500e+02, 1.7007400e+22, 3.5754500e+12, ..., 2.6258710e+03,
        0.0000000e+00, 8.0000000e-01],
       [2.2351300e+02, 1.7071700e+22, 3.8818400e+12, ..., 2.7945070e+03,
        0.0000000e+00, 1.0000000e+00],
       [2.4694300e+02, 1.8685100e+22, 4.3308300e+12, ..., 2.8527170e+03,
        1.9380000e+00, 1.8860000e+00],
       ...,
       [2.2886500e+03, 6.9717800e+23, 5.4900200e+13, ..., 9.5006100e+03,
        4.3770000e+00, 3.7198002e+01],
       [2.4291280e+03, 7.1156800e+23, 5.9208500e+13, ..., 9.1546160e+03,
        4.4280000e+00, 3.7312000e+01],
       [2.4601400e+03, 7.1565900e+23, 5.9642400e+13, ..., 9.1641970e+03,
        4.5040000e+00, 3.6699001e+01]])