# 作业4：线性模型的分布式算法

### 第1题

先利用如下代码生成模拟数据，并写入文件。数据中最后一列代表因变量 $Y$，其余列为自变量 $X$。

In [70]:
import numpy as np
np.set_printoptions(linewidth=100)

np.random.seed(123)
n = 100000
p = 100
x = np.random.normal(size=(n, p))
beta = np.random.normal(size=p)
y = 1.23 + x.dot(beta) + np.random.normal(scale=2.0, size=n)
dat = np.hstack((x, y.reshape(n, 1)))
np.savetxt("reg_data.txt", dat, fmt="%.8f", delimiter=";")

请以单机模式启动 PySpark，使用4个 CPU 核心，并编写分布式程序，实现如下计算：

In [73]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
# 单机模式
spark = SparkSession.builder.master("local[4]").appName("Reading Text").getOrCreate()
sc = spark.sparkContext
# sc.setLogLevel("ERROR")
print(spark)
print(sc)

<pyspark.sql.session.SparkSession object at 0x000001FC82F47AC0>
<SparkContext master=local[4] appName=Reading Text>


1. 打印数据的前5行，并将每行的字符串截断至80个字符：

In [74]:
file1 = sc.textFile("reg_data.txt")

text = file1.map(lambda x: x[:80] + "…").take(5)
print(*text, sep="\n")

-1.08563060;0.99734545;0.28297850;-1.50629471;-0.57860025;1.65143654;-2.42667924…
0.64205469;-1.97788793;0.71226464;2.59830393;-0.02462598;0.03414213;0.17954948;-…
0.70331012;-0.59810533;2.20070210;0.68829693;-0.00630725;-0.20666230;-0.08652229…
0.76505485;-0.82898883;-0.65915131;0.61112355;-0.14401335;1.31660560;-0.70434215…
1.53409029;-0.52991410;-0.49097228;-1.30916531;-0.00866047;0.97681298;-1.7510703…


2. 将读取数据后得到的 RDD 按分区转为矩阵。使用默认分区数，无需重新分区。打印出转换后的第一个非空分区所包含的数据。

In [75]:
def str_to_vec(line):
    str_vec = line.split(";")
    num_vec = map(lambda s: float(s), str_vec) 
    return np.fromiter(num_vec, dtype=float)

def part_to_mat(iterator):	
	iter_arr = map(str_to_vec, iterator)
	dat = list(iter_arr)
	if len(dat) < 1: 
		mat = np.array([])
	else:
		mat = np.vstack(dat)
	yield mat

dat1 = file1.mapPartitions(part_to_mat).filter(lambda x: x.shape[0] > 0)
print(dat1.first())

[[ -1.0856306    0.99734545   0.2829785  ...   0.37940061  -0.37917643   3.72488966]
 [  0.64205469  -1.97788793   0.71226464 ...  -0.34126172  -0.21794626  10.98088055]
 [  0.70331012  -0.59810533   2.2007021  ...   0.16054442   0.81976061 -12.63028846]
 ...
 [ -0.30751248   0.1323937    2.33256448 ...   0.37475498  -1.37608098 -13.52353737]
 [ -0.02266014  -0.3014796    2.34502536 ...  -2.06082696  -1.20995417 -10.00714174]
 [  0.02415432  -0.3896902   -0.07492828 ...  -0.41935638  -1.68496516   8.33748658]]


3. 估计线性回归模型 $Y=X\beta+\varepsilon$ 的回归系数，**同时包含截距项**。要求**只使用一次** `reduce()`。

In [76]:
def insert_int(x):
    return np.insert(x,0,np.ones(x.shape[0]),axis=1)

dat1_int = dat1.map(insert_int)
xtxy = dat1_int.\
    map(lambda part: part[:, :-1].transpose().dot(part)).\
    reduce(lambda x, y: x + y)
beta_hat1 = np.linalg.solve(xtxy[:, :-1], xtxy[:, -1].reshape(p+1, 1))
beta_hat1

array([[ 1.22841355],
       [-0.58056172],
       [-1.12947488],
       [ 1.16031679],
       [ 0.68276231],
       [ 0.64063205],
       [-1.69803101],
       [ 0.87295008],
       [-0.6827681 ],
       [ 1.21323821],
       [-0.18532546],
       [-0.60313748],
       [ 0.45016343],
       [ 1.54732259],
       [ 0.93536575],
       [ 0.33661885],
       [-0.62839196],
       [-0.18223468],
       [ 1.04004336],
       [ 0.99530527],
       [-0.22421889],
       [ 0.26910036],
       [-1.95584105],
       [ 0.93200566],
       [-0.46663344],
       [-1.30308226],
       [-1.07451859],
       [-0.9200001 ],
       [-0.4751849 ],
       [-0.41498631],
       [ 0.0893936 ],
       [ 0.74250157],
       [ 0.44142653],
       [ 0.78310696],
       [ 0.0968675 ],
       [-0.20661749],
       [ 1.36408459],
       [-0.84452182],
       [-1.56303708],
       [-0.03391736],
       [ 0.05672465],
       [-0.01335776],
       [-0.31919022],
       [-1.7366497 ],
       [-1.35682179],
       [-1

4. 设计一个分布式算法，计算回归模型的 $R^2$。

In [77]:
rss, n, s, ss = dat1_int.\
    map(lambda x: (np.sum((x[:, :-1].dot(beta_hat1)-x[:, -1].reshape(x.shape[0], 1))*(x[:, :-1].dot(beta_hat1)-x[:, -1].reshape(x.shape[0], 1)), axis=0)[0], x.shape[0], np.sum(x[:, -1], axis=0), np.sum(x[:, -1]*x[:, -1], axis=0))).\
    reduce(lambda x, y:(x[0] + y[0], x[1] + y[1], x[2] + y[2], x[3] + y[3]))
mean = s / n
tss =  ss - n * mean * mean
r2 = 1 - rss / tss
r2

0.9654396241479573

### 第2题

(a) 考虑 Softplus 函数 $$\mathrm{softplus}(x)=\log(1+e^x)$$

请利用 Numpy 编写一个函数 `softplus(x)`，令其可以接收一个向量或矩阵 `x`，返回 Softplus 函数在 `x` 上的取值。

In [42]:
import numpy as np

def softplus(x):
    return np.log(1.0 + np.exp(x))

一个简单的测试：

In [43]:
x = np.array([-1000.0, -100.0, -10.0, 0.0, 1.0, 10.0, 100.0, 1000.0])

# 上面编写的函数
print(softplus(x))

[0.00000000e+00 0.00000000e+00 4.53988992e-05 6.93147181e-01
 1.31326169e+00 1.00000454e+01 1.00000000e+02            inf]


  return np.log(1.0 + np.exp(x))


(b) 上述结果是否正常？如果出现异常取值，思考可能的原因是什么，并参照课件上的说明再次尝试编写 Softplus 函数。注意尽可能使用 Numpy 提供的向量化函数，避免使用循环。该函数需同时支持向量和矩阵参数。如果一切正常，可忽略此问题。

In [44]:
def softplus(x):
    return np.where(x>=0.0, x+np.log(1.0+np.exp(-x)), np.log(1.0+np.exp(x)))

再次测试：

In [45]:
print(softplus(x))
print()
print(softplus(x.reshape(4, 2)))

[0.00000000e+00 0.00000000e+00 4.53988992e-05 6.93147181e-01
 1.31326169e+00 1.00000454e+01 1.00000000e+02 1.00000000e+03]

[[0.00000000e+00 0.00000000e+00]
 [4.53988992e-05 6.93147181e-01]
 [1.31326169e+00 1.00000454e+01]
 [1.00000000e+02 1.00000000e+03]]


  return np.where(x>=0.0, x+np.log(1.0+np.exp(-x)), np.log(1.0+np.exp(x)))


### 第3题

利用如下代码生成模拟数据，其中数据第一列代表0-1因变量 $Y$，其余列为自变量 $X$。

In [2]:
import numpy as np
from scipy.special import expit

np.random.seed(123)
n = 100000
p = 100
x = np.random.normal(size=(n, p))
beta = np.random.normal(size=p)
prob = expit(-0.5 + x.dot(beta))  # p = 1 / (1 + exp(-x * beta))
y = np.random.binomial(1, prob, size=n)
dat = np.hstack((y.reshape(n, 1), x))
np.savetxt("logistic_data.txt", dat, fmt="%.8f", delimiter="\t")

1. 对上述数据建立 Logistic 回归模型。任选一种算法，估计 Logistic 回归的回归系数，**同时包含截距项**。请利用第2题中编写的 Softplus 函数，编写**数值稳定**的函数计算 Logistic 回归的目标函数和梯度。

In [3]:
file2 = sc.textFile("logistic_data.txt")

text = file2.map(lambda x: x[:80] + "…").take(5)
print(*text, sep="\n")

0.00000000	-1.08563060	0.99734545	0.28297850	-1.50629471	-0.57860025	1.65143654	…
1.00000000	0.64205469	-1.97788793	0.71226464	2.59830393	-0.02462598	0.03414213	0…
0.00000000	0.70331012	-0.59810533	2.20070210	0.68829693	-0.00630725	-0.20666230	…
1.00000000	0.76505485	-0.82898883	-0.65915131	0.61112355	-0.14401335	1.31660560	…
0.00000000	1.53409029	-0.52991410	-0.49097228	-1.30916531	-0.00866047	0.97681298…


In [4]:
def str_to_vec(line):
    str_vec = line.split("\t")
    num_vec = map(lambda s: float(s), str_vec) 
    return np.fromiter(num_vec, dtype=float)

def part_to_mat(iterator):	
	iter_arr = map(str_to_vec, iterator)
	dat = list(iter_arr)
	if len(dat) < 1: 
		mat = np.array([])
	else:
		mat = np.vstack(dat)
	yield mat

dat2 = file2.mapPartitions(part_to_mat).filter(lambda x: x.shape[0] > 0)
print(dat2.first())

[[ 0.         -1.0856306   0.99734545 ... -1.36347154  0.37940061
  -0.37917643]
 [ 1.          0.64205469 -1.97788793 ... -0.11085072 -0.34126172
  -0.21794626]
 [ 0.          0.70331012 -0.59810533 ...  0.41569454  0.16054442
   0.81976061]
 ...
 [ 0.          1.04458986  1.14392554 ... -0.74881808  0.1943679
   1.79074125]
 [ 1.          0.57248339  0.59607894 ...  0.62412782 -0.67295588
   0.63768066]
 [ 1.          0.8264917   1.31059787 ... -1.46128427 -1.11978035
  -0.18945281]]


In [38]:
def sigmoid(x):
    e = np.exp(-np.abs(x))
    numer = np.where(x>=0.0, 1.0, e)
    denom = 1.0 + e
    return numer / denom

In [37]:
def compute_obj_grad(part_mat, beta_old):
    # 提取 X 和 y
    y = part_mat[:, 0]
    x = part_mat[:, 1:]
    # X * beta
    xb = x.dot(beta_old)
    # 目标函数：sum(y * log(prob) + (1 - y) * log(1 - prob))
    obj = -np.sum(y.dot(xb - softplus(xb)) + (1.0 - y).dot(-softplus(xb)))
    # 梯度： prob = rho(X * beta)    grad = X'(prob - y)
    prob = sigmoid(xb)
    grad = x.transpose().dot(prob - y)
    # 该分块的样本量
    ni = x.shape[0]
    return ni, obj, grad

In [39]:
import time
from scipy.optimize import minimize

def logistic_obj_grad(beta, *args): # 两个参数，被优化参数，额外参数，这里是rdd
	dat = args[0]
	n, objfn, grad = dat.map(lambda part: compute_obj_grad(part, beta)).\
		reduce(lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2]))
	objfn /= n
	grad /= n
	return objfn, grad # 返回目标函数值和梯度

In [40]:
def insert_int(x):
    return np.insert(x,1,np.ones(x.shape[0]),axis=1)

dat2_int = dat2.map(insert_int)
# 根据数据动态获取维度，不要使用之前模拟时的变量
p = dat2_int.first().shape[1] - 1
# beta初始化为0向量
beta_init = np.zeros(p)

t1 = time.time()
res = minimize(logistic_obj_grad, beta_init, args=(dat2_int,), method="L-BFGS-B", jac=True, options={"iprint": 1})
t2 = time.time() 
print(f"\nfinished in {t2 - t1} seconds")

beta_hat2 = res["x"]
beta_hat2


finished in 259.05640506744385 seconds


array([-0.52096407, -0.59177173, -1.10420282,  1.15465979,  0.67279837,
        0.63931167, -1.6821693 ,  0.86039376, -0.69835934,  1.22447244,
       -0.21061684, -0.60144532,  0.44210763,  1.57505206,  0.93504873,
        0.34281896, -0.63114241, -0.16737596,  1.0356489 ,  0.98851574,
       -0.21738375,  0.26607561, -1.95464866,  0.93399867, -0.44098173,
       -1.32384665, -1.06956504, -0.93362761, -0.4787895 , -0.40974567,
        0.13046112,  0.72410855,  0.43208806,  0.78064181,  0.12354026,
       -0.20115911,  1.34425446, -0.84670395, -1.57109919, -0.02173033,
        0.04200446,  0.01756201, -0.33733729, -1.74369454, -1.32739765,
       -1.60008818, -1.28377354,  0.93920707,  0.93256773, -0.84856473,
       -1.0869967 , -0.65544294, -1.52634818, -1.4603491 , -1.4154007 ,
        0.06736306, -2.06483804,  0.25381062, -1.44378436, -0.45925271,
       -1.12439444,  1.242744  ,  0.72114862,  0.46169417, -0.20588668,
        1.19789087, -0.17368835,  0.4262061 ,  0.4962246 , -0.29

2. 利用估计得到的 $\hat{\beta}$ 对原始数据进行预测，令 $\hat{\rho}_i$ 表示估计出的每个观测 $Y_i$ 取值为1的概率。为每个观测计算一个预测的0-1标签 $\hat{l}_i$，规则如下：如果 $\hat{\rho}_i\ge 0.5$，则 $\hat{l}_i=1$，反之 $\hat{l}_i=0$。利用分布式算法计算模型的预测准确度，即 $n^{-1}\sum_{i=1}^n I(Y_i=\hat{l}_i)$。$I(Y_i=\hat{l}_i)$ 表示预测对取1，预测错取0。

In [68]:
fore, n = dat2_int.\
    map(lambda x: (np.sum(np.where(np.random.binomial(1, sigmoid(x[:, 1:].dot(beta_hat2)), size=x.shape[0])==x[:, 0], 1, 0)), x.shape[0])).\
    reduce(lambda x, y:(x[0] + y[0], x[1] + y[1]))

indicator = fore / n

In [69]:
indicator

0.92622

In [71]:
sc.stop()