In [1]:
import math
import datetime
import sys
import numpy as np

In [2]:
class LR:
    """Binary logistic regression trained with batch gradient descent.

    Data files are plain text with one comma-separated sample per line. In
    the training file the last column of each line is the label; the test
    file contains feature columns only.
    """

    def __init__(self, train_file_name, test_file_name, predict_result_file_name):
        """Remember the file locations; nothing is read until train_gd/predict.

        Args:
            train_file_name: path of the labelled training file.
            test_file_name: path of the unlabelled file to predict on.
            predict_result_file_name: path the predicted labels are written to.
        """
        self.train_file = train_file_name
        self.predict_file = test_file_name
        self.predict_result_file = predict_result_file_name
        self.feats = []
        self.labels = []
        self.feats_test = []
        self.labels_predict = []
        self.param_num = 0
        self.weight = []

    def loadDataSet(self, file_name, label_existed_flag):
        """Parse a comma-separated numeric text file into NumPy arrays.

        Args:
            file_name: path of the file to read.
            label_existed_flag: 1 if the last column of every line is the
                label; any other value means every column is a feature.

        Returns:
            Tuple ``(feats, labels)``. ``feats`` has one row per non-blank
            line. ``labels`` holds the last column as floats, or is an empty
            array when the file carries no labels.

        Raises:
            ValueError: if a field cannot be converted to ``float``.
        """
        feats = []
        labels = []
        has_label = label_existed_flag == 1
        # The context manager closes the file even if a field fails to parse.
        with open(file_name) as fr:
            for line in fr:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (typically a trailing newline).
                    continue
                values = [float(field) for field in line.split(',')]
                if has_label:
                    feats.append(values[:-1])
                    labels.append(values[-1])
                else:
                    feats.append(values)
        return np.array(feats), np.array(labels)

    def loadTrainData(self):
        """Load the training file into ``self.feats`` and ``self.labels``."""
        self.feats, self.labels = self.loadDataSet(self.train_file, 1)

    def loadTestData(self):
        """Load the test file into ``self.feats_test``.

        ``self.labels_predict`` is reset to the empty label array returned
        for an unlabelled file.
        """
        self.feats_test, self.labels_predict = self.loadDataSet(
            self.predict_file, 0)

    def savePredictResult(self):
        """Print the predicted labels and write them one per line to the result file."""
        print(self.labels_predict)
        with open(self.predict_result_file, 'w') as f:
            for label in self.labels_predict:
                f.write(str(label) + "\n")

    def sigmod(self, x):
        """Return the logistic function ``1 / (1 + exp(-x))`` element-wise."""
        # exp(-x) overflows to inf for very negative x; 1/(1+inf) is still the
        # correct limit 0.0, so only the overflow warning is suppressed.
        with np.errstate(over='ignore'):
            return 1/(1+np.exp(-x))

    def printInfo(self):
        """Print the configured file names and the currently loaded arrays."""
        print(self.train_file)
        print(self.predict_file)
        print(self.predict_result_file)
        print(self.feats)
        print(self.labels)
        print(self.feats_test)
        print(self.labels_predict)

    def initParams(self):
        """Reset ``self.weight`` to a float vector of ones of length ``param_num``."""
        # The builtin float replaces the np.float alias, which NumPy removed.
        self.weight = np.ones((self.param_num,), dtype=float)

    def compute(self, recNum, param_num, feats, w):
        """Return the predicted probabilities ``sigmod(feats . w)``.

        ``recNum`` and ``param_num`` are unused; they are kept so existing
        call sites keep working.
        """
        return self.sigmod(np.dot(feats, w))

    def error_rate(self, recNum, label, preval):
        """Return the sum of squared differences between labels and predictions.

        Despite the name this is a loss value, not a rate. It is only used
        for progress reporting; the update step in ``train_gd`` does not
        differentiate this quantity. ``recNum`` is unused.
        """
        return np.power(label - preval, 2).sum()

    def predict(self):
        """Load the test file, predict a 0/1 label per row, and save the result."""
        self.loadTestData()
        preval = self.compute(len(self.feats_test),
                              self.param_num, self.feats_test, self.weight)
        # Adding 0.5 and truncating rounds the probability to 0 or 1
        # (a probability of exactly 0.5 maps to 1). The builtin int replaces
        # the np.int alias, which NumPy removed.
        self.labels_predict = (preval + 0.5).astype(int)
        self.savePredictResult()

    def train_gd(self, max_iters, rate, error_i):
        """Fit the weights with batch gradient descent.

        Every call reloads the training file and re-initializes the weights,
        so successive calls do not continue from a previous fit.

        Args:
            max_iters: number of full-batch update steps.
            rate: learning rate applied to the averaged gradient.
            error_i: print the loss and a timestamp every ``error_i``
                iterations; 0 or ``None`` disables progress output.
        """
        self.loadTrainData()
        recNum = len(self.feats)
        self.param_num = len(self.feats[0])
        self.initParams()
        # %f is the microsecond field; the previous format ended in a bare
        # "f" and therefore printed a literal ",f" after the seconds.
        ISOTIMEFORMAT = '%Y-%m-%d %H:%M:%S,%f'
        for i in range(max_iters):
            preval = self.compute(recNum, self.param_num,
                                  self.feats, self.weight)
            # Guard against error_i == 0, which would raise ZeroDivisionError.
            if error_i and i % error_i == 0:
                sum_err = self.error_rate(recNum, self.labels, preval)
                print("Iters:" + str(i) + " error:" + str(sum_err))
                theTime = datetime.datetime.now().strftime(ISOTIMEFORMAT)
                print(theTime)
            err = self.labels - preval
            # Step along X^T (y - p) averaged over the samples.
            delt_w = np.dot(self.feats.T, err)
            delt_w /= recNum
            self.weight += rate*delt_w

def print_help_and_exit():
    """Print the command-line usage string and exit with status -1."""
    usage = "usage:python3 main.py train_data.txt test_data.txt predict.txt [debug]"
    print(usage)
    sys.exit(-1)

def parse_args():
    """Return True when the single command-line argument is ``debug``.

    Only the case of exactly one argument is inspected: ``debug`` enables
    test mode, any other single argument prints the usage text and exits.
    Every other argument count returns False.

    NOTE(review): the usage text lists three file arguments, but they are
    never read here - confirm whether that is intended.
    """
    if len(sys.argv) != 2:
        return False
    if sys.argv[1] != 'debug':
        print_help_and_exit()
        return False
    print("test mode")
    return True

In [3]:
# Paths are set directly in the notebook; the script-style parse_args() entry
# point is not called here.
train_file = "../data/train_data.txt"
test_file = "../data/test_data.txt"
# Despite its name, this is the file the predicted labels are written to.
predict_file = "../projects/student/result.txt"

In [4]:
# Build the model; the third argument is the path predictions are written to.
lr = LR(train_file, test_file, predict_file)

## train max_iters=3000, rate=0.1

In [5]:
%%time
# train_gd reloads the training file and resets the weights on every call,
# so each training run in this notebook starts from scratch.
lr.train_gd(max_iters=3000, rate=0.1, error_i=300)

Iters:0 error:5051.0
2020-03-18 20:37:16,f
Iters:300 error:4474.116782414081
2020-03-18 20:37:18,f
Iters:600 error:4356.072640319351
2020-03-18 20:37:21,f
Iters:900 error:3696.0044602134813
2020-03-18 20:37:24,f
Iters:1200 error:3246.050114314364
2020-03-18 20:37:28,f
Iters:1500 error:2477.8451900303603
2020-03-18 20:37:31,f
Iters:1800 error:2235.9457306334252
2020-03-18 20:37:34,f
Iters:2100 error:2083.2450930199657
2020-03-18 20:37:37,f
Iters:2400 error:1941.1157276906101
2020-03-18 20:37:40,f
Iters:2700 error:1834.1491840540473
2020-03-18 20:37:43,f
CPU times: user 1min 1s, sys: 1.55 s, total: 1min 2s
Wall time: 33.6 s


lr.error_rate

In [6]:
# Predict on the test file, print the labels and write them to predict_file.
lr.predict()

[0 1 0 ... 1 1 1]


In [7]:
# Load the reference labels and the predictions written by lr.predict():
# one label per line, parsed via float so values such as "1.0" are accepted.
answer_file = "../projects/student/answer.txt"
# Context managers close each file even if a line fails to parse; blank lines
# (for example a trailing newline) are skipped instead of raising ValueError.
with open(answer_file, 'r') as f_a:
    a = [int(float(line.strip())) for line in f_a if line.strip()]
with open(predict_file, 'r') as f_p:
    p = [int(float(line.strip())) for line in f_p if line.strip()]

In [8]:
# Sanity check: the accuracy loop below indexes p by positions of a, so both
# lists should have the same length.
print("answer lines:%d" % (len(a)))
print("predict lines:%d" % (len(p)))

answer lines:2000
predict lines:2000


In [9]:
# Count the positions where prediction and reference disagree, then report
# the fraction of positions that agree.
errline = sum(1 for i in range(len(a)) if a[i] != p[i])

accuracy = (len(a) - errline) / len(a)
print("accuracy:%f" % accuracy)

accuracy:0.681500


## train max_iters=300, rate=0.01

In [10]:
%%time
# With error_i equal to max_iters, only iteration 0 is logged. The weights
# are reset at the start of the call, so this run does not continue the
# previous one.
lr.train_gd(max_iters=300, rate=0.01, error_i=300)

Iters:0 error:5051.0
2020-03-18 20:37:50,f
CPU times: user 8.97 s, sys: 247 ms, total: 9.21 s
Wall time: 6.31 s


In [11]:
# Predict with the freshly trained weights; this overwrites predict_file.
lr.predict()

[1 1 1 ... 1 1 1]


In [12]:
# Reload the reference labels and the predictions just written by
# lr.predict(): one label per line, parsed via float so "1.0" is accepted.
answer_file = "../projects/student/answer.txt"
# Context managers close each file even if a line fails to parse; blank lines
# (for example a trailing newline) are skipped instead of raising ValueError.
with open(answer_file, 'r') as f_a:
    a = [int(float(line.strip())) for line in f_a if line.strip()]
with open(predict_file, 'r') as f_p:
    p = [int(float(line.strip())) for line in f_p if line.strip()]

In [13]:
# Sanity check: the accuracy loop below indexes p by positions of a, so both
# lists should have the same length.
print("answer lines:%d" % (len(a)))
print("predict lines:%d" % (len(p)))

answer lines:2000
predict lines:2000


In [14]:
# Count the positions where prediction and reference disagree, then report
# the fraction of positions that agree.
errline = sum(1 for i in range(len(a)) if a[i] != p[i])

accuracy = (len(a) - errline) / len(a)
print("accuracy:%f" % accuracy)

accuracy:0.313000


## train max_iters=3000, rate=0.01

In [18]:
%%time
# Same learning rate as the previous run but ten times as many iterations;
# the weights are reset at the start of the call.
lr.train_gd(max_iters=3000, rate=0.01, error_i=300)

Iters:0 error:5051.0
2020-03-18 12:08:38,f
Iters:300 error:5050.999999949141
2020-03-18 12:08:41,f
Iters:600 error:1674.7238493279285
2020-03-18 12:08:44,f
Iters:900 error:1602.9710239762214
2020-03-18 12:08:47,f
Iters:1200 error:1542.4010528668537
2020-03-18 12:08:50,f
Iters:1500 error:1491.02371399723
2020-03-18 12:08:53,f
Iters:1800 error:1447.1673980905393
2020-03-18 12:08:56,f
Iters:2100 error:1409.4679135096148
2020-03-18 12:08:59,f
Iters:2400 error:1376.82774951144
2020-03-18 12:09:02,f
Iters:2700 error:1348.3687038066391
2020-03-18 12:09:05,f
CPU times: user 1min 4s, sys: 609 ms, total: 1min 4s
Wall time: 34.4 s


In [19]:
# Predict with the freshly trained weights; this overwrites predict_file.
lr.predict()

[0 1 0 ... 0 0 0]


In [20]:
# Reload the reference labels and the predictions just written by
# lr.predict(): one label per line, parsed via float so "1.0" is accepted.
answer_file = "../projects/student/answer.txt"
# Context managers close each file even if a line fails to parse; blank lines
# (for example a trailing newline) are skipped instead of raising ValueError.
with open(answer_file, 'r') as f_a:
    a = [int(float(line.strip())) for line in f_a if line.strip()]
with open(predict_file, 'r') as f_p:
    p = [int(float(line.strip())) for line in f_p if line.strip()]

In [21]:
# Sanity check: the accuracy loop below indexes p by positions of a, so both
# lists should have the same length.
print("answer lines:%d" % (len(a)))
print("predict lines:%d" % (len(p)))

answer lines:2000
predict lines:2000


In [22]:
# Count the positions where prediction and reference disagree, then report
# the fraction of positions that agree.
errline = sum(1 for i in range(len(a)) if a[i] != p[i])

accuracy = (len(a) - errline) / len(a)
print("accuracy:%f" % accuracy)

accuracy:0.813500
