In [1]:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Build the dataset
iris = load_iris()
x, y = iris.data, iris.target
# Iris has three classes and four features; keep only the last two features
# and discard class 0 so that a binary problem remains.
x = x[y != 0, 2:]
# The surviving labels are 1 and 2; shift them down to 0 and 1 (convention only).
y = y[y != 0] - 1
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=1
)

# 第 1 题

In [3]:
class MyLogicRegress():
    """Binary logistic regression trained with full-batch gradient ascent.

    The intercept is learned by appending a constant column of ones as the
    LAST feature column, so ``self.w`` is laid out as ``[coef_..., intercept_]``.

    Parameters
    ----------
    max_iter : int
        Number of gradient steps. Every step uses all training samples.
    alpha : float
        Learning rate applied to the mean log-likelihood gradient.
    tol : float
        Stored on the instance only; ``fit`` does not implement early
        stopping, so this value is currently unused.
    n : int
        Stored on the instance only; currently unused.
    random_state : int or None
        Seed for the initial weights. With ``None`` (default) the weights are
        drawn from numpy's global RNG, so ``np.random.seed`` still controls
        them exactly as before this parameter existed.
    """

    def __init__(self, max_iter=1000, alpha=0.01, tol=0.001, n=10, random_state=None):
        self.max_iter = max_iter
        self.alpha = alpha
        self.tol = tol
        self.n = n
        self.random_state = random_state

    @staticmethod
    def _sigmoid(z):
        """Logistic function evaluated without overflowing ``np.exp``."""
        z = np.asarray(z, dtype=float)
        out = np.empty_like(z)
        pos = z >= 0
        # For z >= 0, exp(-z) <= 1, so the textbook form is safe.
        out[pos] = 1.0 / (1.0 + np.exp(-z[pos]))
        # For z < 0, exp(z) < 1; use the algebraically equal form so a very
        # negative z never reaches exp() as a large positive argument.
        exp_z = np.exp(z[~pos])
        out[~pos] = exp_z / (1.0 + exp_z)
        return out

    @staticmethod
    def _with_bias(x):
        """Return ``x`` as a 2-D float array with a trailing column of ones."""
        x = np.asarray(x, dtype=float)
        if x.ndim != 2:
            raise ValueError("x must be 2-dimensional (n_samples, n_features)")
        return np.hstack((x, np.ones((x.shape[0], 1))))

    def fit(self, x, y):
        """Learn the weights from features ``x`` and 0/1 labels ``y``.

        ``x`` and ``y`` may be numpy arrays, pandas objects, or nested lists;
        both are converted to float arrays first, so boolean dummy columns
        are accepted.

        Returns
        -------
        self
            The fitted estimator, with ``w``, ``coef_`` and ``intercept_`` set.

        Raises
        ------
        ValueError
            If ``x`` is not 2-D, is empty, or its row count differs from ``y``.
        """
        x_ = self._with_bias(x)
        y_ = np.asarray(y, dtype=float).ravel()
        m = x_.shape[0]
        if m == 0:
            raise ValueError("cannot fit on an empty dataset")
        if y_.shape[0] != m:
            raise ValueError("x and y must contain the same number of samples")

        if self.random_state is None:
            # Global RNG keeps the historical behaviour.
            self.w = np.random.random(x_.shape[1])
        else:
            self.w = np.random.RandomState(self.random_state).random_sample(x_.shape[1])

        step = self.alpha / m  # loop-invariant: learning rate over sample count
        for _ in range(self.max_iter):
            s = self._sigmoid(x_.dot(self.w))
            # Gradient of the log-likelihood is X^T (y - p); move along it.
            self.w += step * x_.T.dot(y_ - s)

        self.coef_ = self.w[:-1]
        self.intercept_ = self.w[-1]
        return self

    def decision_function(self, x):
        """
        Return the probability that each row of ``x`` belongs to label 1.

        Note that this is the sigmoid of the linear score, not the raw score.
        """
        return self._sigmoid(self._with_bias(x).dot(self.w))

    def predict(self, x):
        """
        Return the predicted label (0 or 1) for each row of ``x``.

        A row is labelled 1 when its label-1 probability is at least 0.5.
        """
        return (self.decision_function(x) >= 0.5).astype(int)

    def predict_proba(self, x):
        """
        Return an ``(n_samples, 2)`` array: column 0 is the probability of
        label 0, column 1 is the probability of label 1.
        """
        p = self.decision_function(x)
        return np.column_stack((1 - p, p))

    def acc(self, x, y):
        """Return the fraction of rows of ``x`` predicted equal to ``y``, rounded to 3 decimals."""
        y_true = np.asarray(y).ravel()
        return round(np.mean(self.predict(x) == y_true), 3)

In [4]:
# Train the hand-written model on the iris subset and report accuracy on the
# training split, the test split, and the full dataset.
# fit() draws its initial weights from numpy's global RNG, so seed it here to
# make the printed numbers reproducible across kernel restarts.
np.random.seed(1)
mlr = MyLogicRegress(max_iter=20000, alpha=0.001)
mlr.fit(x_train, y_train)
acc_train = mlr.acc(x_train, y_train)
acc_test = mlr.acc(x_test, y_test)
acc_all = mlr.acc(x, y)
print(acc_train, acc_test, acc_all)
print(mlr.coef_, mlr.intercept_)

0.813 0.92 0.84
[-0.21611623  1.07867052] -0.6422525918268536


In [5]:
# Baseline for comparison: scikit-learn's LogisticRegression on the same split.
lr = LogisticRegression()
lr.fit(x_train, y_train)
acc_train, acc_test, acc_all = (
    lr.score(features, labels)
    for features, labels in ((x_train, y_train), (x_test, y_test), (x, y))
)
print(acc_train, acc_test, acc_all)
lr.coef_, lr.intercept_

0.96 0.88 0.94


(array([[2.67225111, 2.13911835]]), array([-16.80595479]))

# 第 2 题

In [6]:
# Load the Titanic passenger table, preview the first two rows,
# and print the per-column dtype / non-null summary.
data = pd.read_csv("Titanic.csv")
display(data.iloc[:2])
data.info()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
# List the distinct Cabin values (NaN included) to judge whether the column is usable.
pd.unique(data["Cabin"])

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [8]:
# Cabin is missing for most passengers (only 204 of 891 rows are non-null),
# so remove the column. `columns=` already selects the axis, and
# errors='ignore' lets this cell be re-run without a KeyError once the
# column is gone.
data = data.drop(columns='Cabin', errors='ignore')

In [9]:
# Discard every row that still contains a missing value, then re-check the summary.
data = data.dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Pclass       712 non-null    int64  
 3   Name         712 non-null    object 
 4   Sex          712 non-null    object 
 5   Age          712 non-null    float64
 6   SibSp        712 non-null    int64  
 7   Parch        712 non-null    int64  
 8   Ticket       712 non-null    object 
 9   Fare         712 non-null    float64
 10  Embarked     712 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 66.8+ KB


In [10]:
# Target is the survival flag; the features are the columns selected below.
y = data['Survived']
x = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
display(x.head(2))
# One-hot encode the string columns (Sex, Embarked).
# dtype=int pins the indicator columns to 0/1 integers: pandas >= 2.0 emits
# bool columns by default, and a frame mixing bool with numeric columns
# converts to an object ndarray, which the numpy-based model cannot pass to np.exp.
x = pd.get_dummies(x, dtype=int)
display(x.head(2))

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.25,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,0,1,0,0


In [11]:
# Hold out 25% of the Titanic rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=1,
)

In [12]:
# Fit the hand-written logistic regression class on the Titanic split.
# fit() draws its initial weights from numpy's global RNG, so seed it here to
# make the printed accuracies reproducible across kernel restarts.
np.random.seed(1)
mlr = MyLogicRegress()
mlr.fit(x_train, y_train)
acc_train = mlr.acc(x_train, y_train)
acc_test = mlr.acc(x_test, y_test)
print(acc_train, acc_test)

0.751 0.742


In [13]:
# Baseline for comparison: scikit-learn's LogisticRegression on the same split.
lr = LogisticRegression()
lr.fit(x_train, y_train)
acc_train, acc_test = (
    lr.score(features, labels)
    for features, labels in ((x_train, y_train), (x_test, y_test))
)
print(acc_train, acc_test)

0.8014981273408239 0.8033707865168539
