In [19]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# Import from the public package: the private `sklearn.linear_model.logistic`
# module path is not available in current scikit-learn releases.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
import os

# Project root that the relative dataset paths below are resolved against.
# Override with the ML_COURSE_DIR environment variable on other machines;
# the default keeps the original author's location.
PROJECT_DIR = os.environ.get(
    "ML_COURSE_DIR", "c://Users/uscxi/GitProjects/MachineLearningCourse/"
)
os.chdir(PROJECT_DIR)
os.getcwd()

'c:\\Users\\uscxi\\GitProjects\\MachineLearningCourse'

In [15]:
# Load the tab-separated SMS corpus. The file has no header row, so pandas
# labels the columns with the integers 0 and 1.
df = pd.read_csv(
    'dataset/smsspamcollection/SMSSpamCollection',
    sep='\t',
    header=None,
)

In [16]:
# Preview the first rows: column 0 holds the label, column 1 the message text.
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [18]:
# Class balance, displayed as (number of spam, number of ham).
tuple((df[0] == label).sum() for label in ('spam', 'ham'))

(747, 4825)

In [20]:
X = df[1].values  # raw message text
y = df[0].values  # class labels ('ham' / 'spam')

# A fixed seed makes the split (and every result derived from it) reproducible
# across kernel restarts. Stratifying on the label keeps the spam/ham ratio the
# same in the training and test sets, which matters because spam is the
# minority class.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

In [21]:
# Turn raw messages into TF-IDF feature matrices.
# The vectorizer is fitted on the training messages only; the test messages
# are transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

In [32]:
# Illustrate TF-IDF on the first five training messages.
# A throwaway vectorizer is used on purpose: calling fit_transform on the
# shared `vectorizer` would re-fit it on these five messages and discard the
# vocabulary that X_train / X_test were built with.
TfidfVectorizer().fit_transform(X_train_raw[:5])

<5x55 sparse matrix of type '<class 'numpy.float64'>'
	with 58 stored elements in Compressed Sparse Row format>

In [24]:
# Fit a logistic-regression classifier (L-BFGS solver) on the TF-IDF training
# features. The fit call stays last so the notebook shows the fitted estimator.
classifier = LogisticRegression(solver='lbfgs')
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
predictions = classifier.predict(X_test)
# Print the first five predicted labels next to the messages they belong to.
for message, prediction in zip(X_test_raw, predictions[:5]):
    print('Predicted: %s, message: %s' % (prediction, message))

Predicted: ham, message: What time you coming down later? 
Predicted: ham, message: Eh u remember how 2 spell his name... Yes i did. He v naughty make until i v wet.
Predicted: ham, message: Where to get those?
Predicted: ham, message: Ok try to do week end course in coimbatore.
Predicted: ham, message: Jus finished avatar nigro


# 二. 对数几率回归（logit regression）
考虑一个二分类任务，其输出标记$y\in \{0,1\}$，而线性回归模型产生的预测值$z=\mathbf{\omega^Tx}+b$是实数，于是需将实值$z$转换为0/1值。直观地，可以考虑"单位阶跃函数"
$$
\begin{equation}
y=\begin{cases}
0,z<0;\\
0.5,z=0;\\
1,z>0.
\end{cases}
\end{equation}
$$
即若预测值$z$大于0则判为正例，小于0则判为反例，等于0则可任意判别。显然，单位阶跃函数是不连续函数，因此退而使用有更好性质的对数几率函数（logistic function）:
$$
y=\frac{1}{1+e^{-z}}.
$$
显然对数几率函数可以将z值转换为一个接近0或1的值，且在$z=0$附近变化很陡。

In [None]:
%matplotlib inline
from sklearn.linear_model import LinearRegression, LogisticRegression
from torch.utils.data import TensorDataset, DataLoader
from IPython import display
import numpy as np
import torch
import matplotlib.pyplot as plt

In [None]:
def f(z):
    """Unit-step (Heaviside) function with the value 0.5 at the origin.

    Args:
        z: A real scalar.

    Returns:
        float: 0.0 if z < 0, 0.5 if z == 0, otherwise 1.0.

    Every branch returns a float. np.vectorize infers its output dtype from
    the first value the wrapped function returns; a mix of int and float
    results would make the vectorized version produce an integer array and
    truncate 0.5 to 0.
    """
    if z < 0:
        return 0.0
    if z == 0:
        return 0.5
    return 1.0

def g(z):
    """Logistic (sigmoid) function: 1 / (1 + e^(-z)).

    Accepts anything np.exp accepts (a scalar or an array) and returns a
    result of the matching shape.
    """
    decay = np.exp(-z)
    return 1 / (1 + decay)

In [None]:
# Element-wise wrappers so the scalar functions f and g can be applied to
# whole arrays.
v_f, v_g = (np.vectorize(fn) for fn in (f, g))

In [None]:
# 200 evenly spaced inputs on [-5, 5], then both functions evaluated on them.
z = np.linspace(-5, 5, 200)
y1, y2 = v_f(z), v_g(z)

In [None]:
# Render inline figures as SVG.
display.set_matplotlib_formats('svg')

# One axes holding both curves: the step function in red, the logistic in green.
fig, ax = plt.subplots(figsize=(8, 4))
for values, fmt, name in ((y1, 'r-', 'Heaviside'), (y2, 'g-', 'logit')):
    ax.plot(z, values, fmt, label=name)

# Mark the point (0, 0.5).
ax.scatter([0], [0.5], s=50, alpha=0.5)
ax.set_xlim([-5, 5])
ax.set_xlabel("z")
ax.set_ylabel("y")
ax.legend()

将$z=\mathbf{\omega^Tx+b}$代入对数几率函数，可得
$$
y=\frac{1}{1+e^{-(\mathbf{\omega^Tx}+b)}}.
$$
进而转换为
$$
\mathrm{ln}\frac{y}{1-y}=\mathbf{\omega^Tx+b}.
$$
若将$y$视为样本$\mathbf{x}$作为正例的可能性，则$1-y$是其反例可能性，两者比值为
$$
\frac{y}{1-y}
$$
称为几率（odds），反映了$\mathbf{x}$作为正例的相对可能性。对几率取自然对数则可得对数几率（log odds, 也称为logit）
$$
\mathrm{ln}\frac{y}{1-y}
$$
通过“极大似然法”来估计$\omega$和$b$，给定数据集$\{(x_i,y_i)\}^m_{i=1}$，最大化对数似然率
$$
\max_{\omega,b}\ \ell(\omega,b)=\sum_{i=1}^m \ln p(y_i\mid\mathbf{x}_i;\omega,b)
$$
即令每个样本属于其真实标记的概率越大越好。上式又等价于最小化负对数似然率
$$
(\omega, b)^* = \underset{\omega,b}{\arg\min}\ \sum_{i=1}^m\left(-y_i(\omega^Tx_i+b)+\ln\left(1+e^{\omega^Tx_i+b}\right)\right)
$$

In [None]:
# Sigmoid (logistic) function
def logit(w, x, b):
    """Probability of the positive class under a logistic model.

    Computes sigmoid(x @ w + b) = 1 / (1 + exp(-(x @ w + b))). The exponent
    must be negated: without the minus sign the function returns the
    probability of the *negative* class.

    Args:
        w: Weight tensor; `x @ w` must be a valid matrix product.
        x: Input tensor with one sample per row.
        b: Bias tensor, broadcast against `x @ w`.

    Returns:
        Tensor with the same shape as `x @ w + b`, values in [0, 1].
    """
    return torch.sigmoid(x @ w + b)

# Negative log-likelihood
def neglikelihood(x, y, w, b):
    """Negative log-likelihood of a logistic model, summed over the samples.

    Evaluates sum_i ( -y_i * z_i + log(1 + exp(z_i)) ) with z = x @ w + b.

    Args:
        x: Input tensor with one sample per row.
        y: Float tensor of 0/1 labels, one per sample (any shape that
            reshapes to a single row).
        w: Weight tensor; `x @ w` must yield one column.
        b: Bias tensor, broadcast against `x @ w`.

    Returns:
        Tensor of shape (1, 1) holding the summed negative log-likelihood.
    """
    z = x @ w + b
    # softplus(z) == log(1 + exp(z)), but it does not overflow to inf when z
    # is large, which the literal formula does in float32.
    log_partition = torch.sum(torch.nn.functional.softplus(z))
    return -y.reshape(1, -1) @ z + log_partition

In [None]:
# Seed the global RNG so the synthetic data set is the same on every run.
torch.manual_seed(0)

true_w = torch.FloatTensor([0.2, 0.3]).reshape(-1, 1)
true_b = torch.FloatTensor([0.2])
x = torch.randn(size=(1000, 2)).float()

# Generate the data set
z = logit(true_w, x, true_b)  # probability of being a positive example
# Draw each 0/1 label at random with probability z. Thresholding z at 0.5
# instead would make the classes perfectly separable, in which case the
# maximum-likelihood weights grow without bound and cannot match true_w.
# The labels are floats, as the matrix product in the loss requires.
y = torch.bernoulli(z)

In [None]:
# Initial parameter values
w = torch.rand(size=true_w.shape)
b = torch.FloatTensor([0.0])
w.requires_grad_(True)
b.requires_grad_(True)

lr = 0.01
num_epochs = 10
batch_size = 10  # number of samples in each mini-batch
dataset = TensorDataset(x, y)
data_iter = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

for epoch in range(num_epochs):
    # The batch variables get their own names: reusing `x` / `y` here would
    # overwrite the full data set, so the loss below would only see the last
    # mini-batch and re-running this cell would train on ten samples.
    for x_batch, y_batch in data_iter:
        l = neglikelihood(x_batch, y_batch, w, b)
        l.backward()  # gradient of the loss with respect to w and b
        with torch.no_grad():  # parameter updates must not be tracked by autograd
            w -= lr * w.grad / batch_size
            b -= lr * b.grad / batch_size
        w.grad.zero_()
        b.grad.zero_()

    with torch.no_grad():  # evaluation only, no gradients needed
        train_l = neglikelihood(x, y, w, b)  # loss over the full training set
        est_w = w.detach().flatten().tolist()
        est_b = b.detach().tolist()
        print(f'epoch {epoch + 1}, neglikelihood: {train_l.item():.4f}')
        print(f'    w0: {est_w[0]:.4f}, w1: {est_w[1]:.4f},  b: {est_b[0]:.4f}')