In [29]:
import numpy as np
import pandas as pd
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [7]:
# Fetch the data.
# The file is read with the column names supplied here (passed to read_csv
# via `names=`). Order matters: later cells select columns by their position
# in this list.
names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]
data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
    names=names,
)

In [42]:
# Show 5 random rows.
# The previous version built positions with random.randint(0, len(data)),
# whose upper bound is inclusive: it could yield len(data), which is out of
# range for .iloc and raised IndexError intermittently. DataFrame.sample
# only ever picks existing rows.
data.sample(n=5)

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
602,1350568,4,1,1,1,2,1,2,1,1,2
236,1241559,10,8,8,2,8,10,4,8,10,4
527,798429,4,1,1,1,2,1,3,1,1,2
467,1299596,6,6,6,5,4,10,7,6,2,4
563,1328755,3,1,1,1,2,1,2,1,1,2


In [8]:
# Print a summary of the frame: index range, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Sample code number           699 non-null    int64 
 1   Clump Thickness              699 non-null    int64 
 2   Uniformity of Cell Size      699 non-null    int64 
 3   Uniformity of Cell Shape     699 non-null    int64 
 4   Marginal Adhesion            699 non-null    int64 
 5   Single Epithelial Cell Size  699 non-null    int64 
 6   Bare Nuclei                  699 non-null    object
 7   Bland Chromatin              699 non-null    int64 
 8   Normal Nucleoli              699 non-null    int64 
 9   Mitoses                      699 non-null    int64 
 10  Class                        699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


In [19]:
# Handle missing values: "?" entries are turned into NaN and every row that
# contains one is dropped.
data = data.replace(to_replace="?", value=np.nan)
data = data.dropna()
# "Bare Nuclei" was loaded as object dtype (see the data.info() output).
# Cast it to integers explicitly once the "?" rows are gone, instead of
# relying on an implicit string-to-number conversion further downstream.
data = data.astype({"Bare Nuclei": "int64"})

# Split the data: columns 1-9 of `names` are the features, column 10
# ("Class") is the target; column 0 ("Sample code number") is left out.
# random_state is pinned so that the split, and every metric computed from
# it, is reproducible after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(
    data[names[1:10]], data[names[10]], test_size=0.25, random_state=42)

In [20]:
# Standardise the features.
# The scaler is fitted on the training features only; the same fitted
# scaler is then applied to both the training and the test features.
std = StandardScaler().fit(x_train)
x_train = std.transform(x_train)
x_test = std.transform(x_test)

In [21]:
# Create the logistic-regression classifier; C is passed explicitly as 1.0.
lg = LogisticRegression(C=1.0)

In [22]:
# Fit the classifier on the standardised training features and their labels.
lg.fit(X=x_train, y=y_train)

LogisticRegression()

In [23]:
# Regression coefficients of the fitted model
lg.coef_

array([[1.59375804, 0.49160839, 0.64700247, 0.94374764, 0.58568457,
        1.54039158, 0.94442355, 0.61239751, 0.76816075]])

In [24]:
# Accuracy on the held-out test set
lg.score(X=x_test, y=y_test)

0.9473684210526315

In [27]:
# Recall (together with precision and F1) per class on the test set.
# Class label 2 is reported under the first target name (benign) and
# label 4 under the second (malignant).
y_pre = lg.predict(x_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pre,
        labels=[2, 4],
        target_names=["良性", "恶性"],
    )
)

              precision    recall  f1-score   support

          良性       0.97      0.95      0.96       115
          恶性       0.90      0.95      0.92        56

    accuracy                           0.95       171
   macro avg       0.94      0.95      0.94       171
weighted avg       0.95      0.95      0.95       171

