In [1]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import numpy as np
X,y=make_classification(n_classes=2,n_features=5,n_samples=300)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)

### 使用sklearn求解Logistic Regression

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report
model=LogisticRegression(fit_intercept=True)
model.fit(X_train,y_train)
pre=model.predict(X_test)
print(model.coef_,model.intercept_)
print(classification_report(y_test,pre))

[[ 1.52148023  0.03638994 -0.18944455  0.38930781 -0.20435344]] [-0.16358819]
             precision    recall  f1-score   support

          0       0.84      0.84      0.84        43
          1       0.85      0.85      0.85        47

avg / total       0.84      0.84      0.84        90



### 使用Scipy求解Logistic Regression

In [3]:
from scipy.optimize import minimize
from scipy.special import expit
def lost(w,X,y,c=0.1):
    m,n=X.shape
    j1=np.sum(y*np.log(expit(np.dot(X,w)))/-m)
    j2=np.sum((1-y)*np.log(1-expit(np.dot(X,w)))/-m)
    j3=np.sum(np.square(w))*(c/2*m)
    return j1+j2+j3


class LogisticRegression:
    def __init__(self,c=0.1):
        self.c=c
    
    def fit(self,X,y):
        self.m,self.n=X.shape
        init_w=np.random.randn(self.n)
        res=minimize(lost,init_w,args=(X,y))
        self.res=res
        self.coef_=self.res.x
        
    def predict(self,X):
        prob=expit(np.dot(X,self.res.x.T)+np.sum(np.square(self.res.x))*(0.1/2*self.m))
        pre=np.array([1 if i>0.5 else 0 for i in prob])
        return pre
    
model=LogisticRegression()
model.fit(X_train,y_train)
res=model.predict(X_test)
print(classification_report(y_test,res))

             precision    recall  f1-score   support

          0       0.88      0.81      0.84        43
          1       0.84      0.89      0.87        47

avg / total       0.86      0.86      0.86        90



### 使用梯度下降法

In [4]:
class MyLogisticRegression:
    def __init__(self,alpha=0.007,maxCycles=1800):
        self.alpha=alpha
        self.maxCycles=maxCycles
    def fit(self,X,y):
        X_=np.full((X.shape[0],1),fill_value=1)   
        X=np.mat(np.concatenate([X,X_],axis=1))       
        y = np.mat(y).transpose()
        m,n = X.shape
        weights = np.mat(np.random.randn(n,1))
        for k in range(self.maxCycles):
            h = expit(X*weights)  
            error = h - y            
            weights = weights - self.alpha * X.transpose()* error
        self.coef_=np.array(weights).flatten()[:-1]
        self.intercept_=weights[-1]
        self.weights=weights
    def predict(self,X):
        X_=np.full((X.shape[0],1),fill_value=1)   
        X=np.concatenate([X,X_],axis=1)
        y_value=np.array(expit(np.mat(X)*self.weights)).flatten()
        y_pre=np.array([1 if i>0.5 else 0 for i in y_value])
        return y_pre

In [5]:
model=MyLogisticRegression(alpha=0.004,maxCycles=1000)
model.fit(X_train,y_train)
y_pre=model.predict(X_test)
print(classification_report(y_test,y_pre))

             precision    recall  f1-score   support

          0       0.84      0.84      0.84        43
          1       0.85      0.85      0.85        47

avg / total       0.84      0.84      0.84        90



##### 使用模型预测收入

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
data=pd.read_csv('DecisionTree.csv')
X=data[['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'gender', 'native-country']]
y=[0 if i=='<=50K' else 1 for i in data['income'].tolist()]
X=pd.get_dummies(X)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)
model=MyLogisticRegression()
model.fit(X_train.values,y_train)
y_pre=model.predict(X_test.values)
accuracy_score(y_test,y_pre)

1.0

### 使用随机梯度下降法

In [7]:
class MyLogisticRegression:
    def __init__(self,alpha=0.007,maxCycles=1800):
        self.alpha=alpha
        self.maxCycles=maxCycles
    def fit(self,X, y, numIter=150):
        X_=np.full((X.shape[0],1),fill_value=1)   
        X=(np.concatenate([X,X_],axis=1)) 
        m,n = X.shape
        weights = np.ones(n)   
        for j in range(numIter):
            dataIndex = list(range(m))
            for i in range(m):
                alpha = 4/(1.0+j+i)+0.0001   
                randIndex = int(np.random.uniform(0,len(dataIndex))) 
                h = expit(sum(X[randIndex]*weights))
                error = h-y[randIndex] 
                weights = weights - alpha * error * X[randIndex]
                del(dataIndex[randIndex])
        self.coef_=weights[:-1]
        self.intercept=weights[-1]
        self.weights=np.mat(weights).transpose()
        return weights
    def predict(self,X):
        X_=np.full((X.shape[0],1),fill_value=1)   
        X=(np.concatenate([X,X_],axis=1)) 
        y_value=np.array(expit(np.mat(X)*self.weights)).flatten()
        y_pre=np.array([1 if i>0.5 else 0 for i in y_value])
        return y_pre
    def predict_prob(self,X):
        X_=np.full((X.shape[0],1),fill_value=1)   
        X=(np.concatenate([X,X_],axis=1)) 
        y_value=np.array(expit(np.mat(X)*self.weights)).flatten()
        y_value=np.array(expit(np.mat(X)*self.weights)).flatten()
        return y_value

In [None]:
model=MyLogisticRegression()
model.fit(X_train,y_train)
pre=model.predict(X_test)

In [None]:
print(classification_report(y_test,pre))
