## logistic回归

### 首先查看数据，确定数据情况

In [1]:
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv('spamtrain.csv',header = None)
test = pd.read_csv('spamtest.csv',header = None)

In [3]:
print(train.shape)
print(test.shape)
#查看一行数据
train.head(1)

(2760, 58)
(1841, 58)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1


In [4]:
#数据准备阶段,我们观察到数据之间尺度不一样，将数据作0-1标准化处理
def scale_feature(data):
    n = data.shape[1]
    for i in range(n):
        Max = max(data.iloc[:,i])
        Min = min(data.iloc[:,i])
        data.iloc[:,i] = data.iloc[:,i].apply(lambda x: (x-Min)/(Max-Min+0.0000000000001))
    return data

X_train = scale_feature(train.iloc[:,:-1]).values
y_train = train.iloc[:,57].values.reshape(-1,1)
X_test = scale_feature(test.iloc[:,:-1]).values
y_test = test.iloc[:,57].values.reshape(-1,1)
#我们已经将数据处理为想要的样子了，一行代表一个样本数据，一列代表一个feature

- 我们了解了数据的大小，也知道了训练集和测试集的大小
### 构造Logistic函数

In [8]:
class LogisticRegression():
    def __init__(self,alpha,lmbda,maxiter,epsilon):
        self.alpha = float(alpha) #学习率
        self.lmbda = float(lmbda) #正则化系数
        self.maxiter = int(maxiter)#最大迭代次数
        self.epsilon = 0.001#导数收敛阈值
        self.threshold = 0.5 #分类阈值
    def process_feature(self,X):
        return np.hstack((np.zeros(shape=(X.shape[0],1),dtype='float')+1,X))
    def sigmoid(self,X):
        return 1.0/(1+np.exp(-np.dot(X,self.theta)))
    def fit(self,X_,y):
        X = self.process_feature(X_)
        self.m = X.shape[0]
        self.n = X.shape[1]
        self.theta = np.zeros(shape=(self.n,1),dtype = 'float')
        self.probs = np.zeros(shape=(self.m,1),dtype = 'float')   
        for i in range(self.maxiter):
            self.probs = self.sigmoid(X)
            temp = self.theta
            temp[0,0] = 0
            delta0 = (self.lmbda/self.m)*temp
            delta1 = (1/self.m)*np.dot(X.T,(self.probs-y))
            delta = delta1+delta0
            self.theta -= self.alpha*delta
            loss_deriv = np.linalg.norm(delta)
            loss = (-(1/self.m)*(np.dot(y.T,np.log(self.sigmoid(X)))+ \
                                np.dot((1-y).T,np.log(1-self.sigmoid(X)))- \
                               self.lmbda/2*sum(np.square(self.theta)))).flatten()
            if loss_deriv<self.epsilon:
                break
            print("{}:{}".format(i,loss_deriv))
            print("{}:{}".format(i,loss))
    def predict(self,X_):
        X = self.process_feature(X_)
        probs = self.sigmoid(X)
        y_pred = pd.DataFrame(probs).apply(lambda x:x>self.threshold)  
        return y_pred

In [39]:
model=LogisticRegression(alpha=5,lmbda=0.5,maxiter=1000,epsilon=0.001)

In [40]:
model.fit(X_train,y_train)

0:0.17937565573280914
0:[ 0.6270465]
1:0.056302334057365566
1:[ 0.71186409]
2:0.2225422408079648
2:[ 0.62428282]
3:0.0895683553394884
3:[ 0.75200775]
4:0.27634275032539257
4:[ 0.63765519]
5:0.13272896240215393
5:[ 0.81127884]
6:0.3320993870205052
6:[ 0.66494462]
7:0.1723508171830367
7:[ 0.87289809]
8:0.3772036581740473
8:[ 0.69283883]
9:0.1996107187213665
9:[ 0.91666135]
10:0.40513742097579586
10:[ 0.7089956]
11:0.21410984392432422
11:[ 0.93689844]
12:0.4185447812646755
12:[ 0.71201512]
13:0.2200252082469776
13:[ 0.94007644]
14:0.42310301712563075
14:[ 0.70645086]
15:0.22125621653389058
15:[ 0.93389647]
16:0.42305123051619625
16:[ 0.69653122]
17:0.22011741444824634
17:[ 0.92320732]
18:0.42070055602929346
18:[ 0.68470699]
19:0.21776500510625998
19:[ 0.91046269]
20:0.41716401047450286
20:[ 0.6721804]
21:0.21474201987521244
21:[ 0.89680538]
22:0.41295672434458597
22:[ 0.6595081]
23:0.21130174152872452
23:[ 0.88275574]
24:0.408318392554402
24:[ 0.64694613]
25:0.20756579300682784
25:[ 0.868

256:[ 0.39032203]
257:0.06296291841451007
257:[ 0.39020586]
258:0.06291840247159038
258:[ 0.39009046]
259:0.06287409342275749
259:[ 0.38997587]
260:0.06283010070549293
260:[ 0.38986205]
261:0.062786325823998
261:[ 0.38974902]
262:0.0627428488128401
262:[ 0.38963674]
263:0.06269959660955687
263:[ 0.38952523]
264:0.0626566274775898
264:[ 0.38941448]
265:0.06261388719244942
265:[ 0.38930448]
266:0.06257141793557981
266:[ 0.38919521]
267:0.06252917940869823
267:[ 0.38908668]
268:0.06248720194248014
268:[ 0.38897888]
269:0.062445455515076714
269:[ 0.38887181]
270:0.06240396174614967
270:[ 0.38876545]
271:0.062362698184224914
271:[ 0.3886598]
272:0.06232168006250244
272:[ 0.38855486]
273:0.06228089049786859
273:[ 0.38845061]
274:0.062240340054184114
274:[ 0.38834706]
275:0.06220001593870327
275:[ 0.38824419]
276:0.0621599253115104
276:[ 0.38814201]
277:0.062120058381370385
277:[ 0.3880405]
278:0.062080419835240694
278:[ 0.38793965]
279:0.06204100208285257
279:[ 0.38783947]
280:0.062001808020

569:0.055781230760254666
569:[ 0.37392577]
570:0.055769502179235764
570:[ 0.37390548]
571:0.055757811194419964
571:[ 0.37388528]
572:0.05574615763428579
572:[ 0.37386517]
573:0.05573454132835585
573:[ 0.37384517]
574:0.05572296210719274
574:[ 0.37382525]
575:0.05571141980238655
575:[ 0.37380544]
576:0.05569991424655252
576:[ 0.37378571]
577:0.055688445273317976
577:[ 0.37376609]
578:0.05567701271731968
578:[ 0.37374655]
579:0.0556656164141921
579:[ 0.37372711]
580:0.055654256200563634
580:[ 0.37370776]
581:0.055642931914046345
581:[ 0.37368851]
582:0.055631643393230366
582:[ 0.37366934]
583:0.055620390477676335
583:[ 0.37365027]
584:0.055609173007907665
584:[ 0.37363129]
585:0.055597990825403995
585:[ 0.3736124]
586:0.05558684377259419
586:[ 0.37359359]
587:0.055575731692848775
587:[ 0.37357488]
588:0.055564654430473585
588:[ 0.37355626]
589:0.05555361183070263
589:[ 0.37353772]
590:0.055542603739691526
590:[ 0.37351927]
591:0.055531630004510245
591:[ 0.37350091]
592:0.0555206904731371

880:[ 0.37056403]
881:0.05336250538327546
881:[ 0.37055903]
882:0.05335747433768501
882:[ 0.37055405]
883:0.0533524551262854
883:[ 0.37054909]
884:0.053347447713292966
884:[ 0.37054415]
885:0.05334245206306974
885:[ 0.37053923]
886:0.05333746814012355
886:[ 0.37053434]
887:0.05333249590910642
887:[ 0.37052946]
888:0.05332753533481432
888:[ 0.37052461]
889:0.05332258638218638
889:[ 0.37051977]
890:0.053317649016303724
890:[ 0.37051496]
891:0.05331272320238926
891:[ 0.37051017]
892:0.05330780890580681
892:[ 0.3705054]
893:0.05330290609205941
893:[ 0.37050065]
894:0.0532980147267907
894:[ 0.37049592]
895:0.053293134775781785
895:[ 0.37049121]
896:0.053288266204952056
896:[ 0.37048652]
897:0.05328340898035817
897:[ 0.37048185]
898:0.05327856306819274
898:[ 0.3704772]
899:0.05327372843478445
899:[ 0.37047257]
900:0.05326890504659685
900:[ 0.37046796]
901:0.05326409287022738
901:[ 0.37046337]
902:0.05325929187240799
902:[ 0.3704588]
903:0.05325450202000202
903:[ 0.37045425]
904:0.05324972328

In [41]:
#下面计算准确率
float(1-sum(np.square(y_test-(model.predict(X_test)+0)).values)/1841)

0.8832156436719174

#创建分类器，定义为类，分别完成几项不同的功能
class LogisticRegressin():
    def __init__(self,theta,lmbda,maxiter,epsilon):
        self.theta = float(theta) #学习率
        self.lmbda = float(lmbda) #正则化系数
        self.maxiter = int(maxiter)#最大迭代次数
        self.epsilon = 0.001#导数收敛阈值
        self.threshold = 0.5 #分类阈值
                
#对feature作处理，增加一个为1的特征，将w和b归结为theta
    def process_feature(self,X):
        '''
        这里默认样本数为m， m=X.shape[0]
        这里因为需要增加一列作为X0=1的系数，因此对X作相应的处理
        生成一个m行，1列的值为1的数列，然后和X作水平拼接，hstack真的很形象，水平拼接
        '''
        return np.hstack(np.zeros(shape=(X.shape[0],1),dtype='float')+1,X)
    def sigmoid(self,X):
        return 1.0/(1+np.exp(-np.dot(X,self.theta)))
    #模型用参数取拟合 
    def fit(self,X_,y):
        X = self.process_feature(X_)
        m = X.shape[0]
        n = X.shape[1]
        self.theta = np.zeros(shape=(n,1),dtype = 'float')
        self.probs = np.zeros(shape=(m,1),dtype = 'float')   
        '''
        在GD中，一次迭代使用的样本量为m，样本总量为n。GD的定义可以总结为：
        - m=n时 （就是用了所有的样本）， 就是所谓的batch GD 
        - m=1  （就是用了其中的一个样本）， 就是所谓的SGD
        - 1<m<n， 就是 mini-batch GD.
        所以从expected loss的角度看， 其实batch GD, mini-batch GD, SGD都可以看成SGD的范畴，只不过区别在于每次取多少的样本了。
        ''' 
        #下面给出了批梯度下降的一个实现方式
        for i in range(self.maxiter):
            #利用初始化的theta值进行预测,计算出一次初始概率
            self.probs = self.sigmoid(np.dot(X,theta))
            #计算梯度下降的系数，特别的，因为theta[0]不参与正则化
            temp = self.theta
            temp[0,0] = 0
            delta0 = (self.lmbda/self.m)*temp
            delta1 = (1/self.m)*np.dot(X.T,(self.probs-y))
            delta = delta1+delta0
            self.theta-=self.alpha*delta
            #以上就完成了一次系数的梯度下降
            #下面设置终止迭代的条件,最优点（全局还是局部最优）为下降的幅度已经很小了
        '''
        当然不知道这是不是凸优化问题。弄懂“凸优化”
       迭代的收敛条件（基于训练集的）：
       1.当目标函数的值小于某个阈值认为收敛————最直接（阈值不好确定，基本上弃用）
       2.当目标函数的差值小于某一个阈值认为收敛————最常用
       3.当目标函数的导数值接近0的时候认为收敛————适用情况比较局限
       
       如果有验证集，可以一直迭代，每次的参数将通过验证集去验证其accuraccy，将每次迭代的参数保存起来
       选择最优的参数'''
             #这里选择第三种停止迭代的方式，当导数的L2范式接近0时，取得模型最优
            loss_deriv = np/linalg.norm(delta)
            #这段如此冗长，以后向量内积用@好了
            loss = -(1/self.m)*(np.dot(y.T,np.log(self.sigmoid(np.dot(X,theta))))+ \
                                np.dot((1-y).T,np.log(1-self.sigmoid(np.dot(X,theta))))- \
                               self.lmbda/2*sum(np.squre(theta)))
            if loss_deriv<self.epsilon:
                break
            print("{}:{}".format(i,loss_deriv))
            print("{}:{}".format(i,loss))
            
#预测概率值
    def predict(self,X_):
        X = self.process_feature(X_)
        probs = self.sigmoid(X)
        y_pred = probs.apply(lambda x:x>self.threshold )                       

- 在python中矩阵的内积可以用 @ 这个字符实现