In [1]:
import numpy as np
from collections import defaultdict

In [2]:
def get_data(data_path):
    """Parse a sparse feature file into per-document dicts and labels.

    Each non-blank line is expected to look like
    ``<label><fff><idx>:<value> <idx>:<value> ...``: the text before the
    first ``<fff>`` is the label and the text after the last ``<fff>`` is a
    whitespace-separated list of ``index:value`` pairs.

    Parameters
    ----------
    data_path : str
        Path of the text file to read.

    Returns
    -------
    X : list of defaultdict
        One mapping per document from ``int`` feature index to ``float``
        value; missing indices default to 0.
    Y : list of float
        The label of each document, in file order.

    Raises
    ------
    ValueError
        If a label, index or value cannot be converted to a number.
    IndexError
        If a feature token contains no ``:`` separator.
    """
    with open(data_path, 'r') as f:
        lines = f.read().splitlines()

    X = []
    Y = []
    for line in lines:    # one doc
        # Blank lines carry no document; without this guard float('') below
        # would abort the whole load.
        if not line.strip():
            continue

        fields = line.split('<fff>')    # split once, reuse for label and features
        single_dict = defaultdict(int)
        for pair in fields[-1].split():
            parts = pair.split(':')
            single_dict[int(parts[0])] = float(parts[1])

        X.append(single_dict)
        Y.append(float(fields[0]))
    return X, Y

In [3]:
# Load the training file: X_train is a list of per-document {index: value}
# dicts and Y_train a list of float labels (see get_data).
# NOTE(review): absolute, machine-specific Windows path; the notebook will not
# run elsewhere without editing it.
X_train, Y_train = get_data("D:\\movedFromC\\123\\20192\\PRJ2\\Project2\\words_tfidfs.txt")

In [4]:
# Load the test file in the same format as the training file.
# NOTE(review): absolute, machine-specific Windows path.
X_test, Y_test = get_data("D:\\movedFromC\\123\\20192\\PRJ2\\Project2\\words_tfidfs_test.txt")

In [6]:
import scipy.sparse as sp

def convert_to_csr(X_dict, num_of_features):
    """Turn per-document feature dicts into a CSR matrix with a bias column.

    Parameters
    ----------
    X_dict : sequence of dict
        ``X_dict[i]`` maps a feature index (column) to its value for row i.
    num_of_features : int
        Number of feature columns, excluding the bias column.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(len(X_dict), num_of_features + 1)`` with dtype
        float64. Column 0 is all ones (the bias term) and feature ``k`` is
        stored in column ``k + 1``.
    """
    num_rows = len(X_dict)

    # np.float was only an alias of the builtin float and has been removed
    # from NumPy; np.float64 is the dtype it always resolved to.
    X_mat = sp.dok_matrix((num_rows, num_of_features), dtype=np.float64)
    for i in range(num_rows):
        for key, value in X_dict[i].items():
            X_mat[i, key] = value

    # An explicit (num_rows, 1) column keeps its 2-D shape for any row count.
    ones = sp.csr_matrix(np.ones((num_rows, 1), dtype=np.float64))
    X_mat_added_1 = sp.hstack([ones, X_mat.tocsr()]).tocsr()

    return X_mat_added_1

In [7]:
# Build the sparse design matrices. 10272 is passed as num_of_features
# (presumably the vocabulary size; confirm against the data files), so each
# matrix has 10273 columns once convert_to_csr prepends the ones column.
X_train_mat = convert_to_csr(X_train, 10272)
X_test_mat = convert_to_csr(X_test, 10272)

In [10]:
def create_Y_mat(Y, num_of_classes):
    """Build a one-vs-rest indicator matrix from a 1-D sequence of labels.

    Returns a float array of shape ``(len(Y), num_of_classes)`` in which
    entry ``[r, c]`` is 1 when ``Y[r] == c`` and 0 otherwise. Labels outside
    ``0 .. num_of_classes - 1`` produce an all-zero row.
    """
    labels = np.array(Y)
    class_ids = np.arange(num_of_classes)
    # Broadcasting compares every label against every class id at once.
    return (labels[:, None] == class_ids).astype(np.float64)

In [11]:
# Indicator matrices with one column per class (20 classes); column c is the
# binary target for the class-c-versus-rest classifier.
Y_train_mat = create_Y_mat(Y_train, 20)
Y_test_mat = create_Y_mat(Y_test, 20)

In [23]:
def sigmoid(z):
    """Logistic function 1 / (1 + e^-z), applied element-wise to ``z``."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator


In [13]:
def J_reg(x, y, w, LAMBDA):    # for SGD
    """Cross-entropy loss of one sample under the weights ``w``.

    ``x`` is the sample's feature row, ``y`` its 0/1 target and ``w`` the
    weight column. ``LAMBDA`` is accepted but not used: no L2 penalty is
    added to the returned value.
    """
    prob = sigmoid((x*w)[0][0])
    positive_term = y * np.log(prob)
    negative_term = (1-y) * np.log(1-prob)
    return -(positive_term + negative_term)

In [244]:
def LogisticRegression_self(X, Y, w_init, lr, LAMBDA, tol = 1e-4, max_iter = 10000):
    """Fit a binary logistic-regression weight vector with plain SGD.

    Samples are visited in a freshly shuffled order every epoch and the
    weights are updated after each sample. Every 20 updates the current
    weights are compared with the weights from 20 updates earlier; training
    stops early when the norm of their difference is below ``tol``.

    Parameters
    ----------
    X : scipy.sparse.csr_matrix
        Design matrix, one sample per row.
    Y : array-like
        0/1 target of each row of ``X``, indexable by row number.
    w_init : array
        Initial weights as a column; it is not modified.
    lr : float
        Learning rate.
    LAMBDA : float
        Accepted for interface compatibility but not used: the update
        applies no L2 penalty.
    tol : float, optional
        Convergence threshold on the weight change over 20 updates.
    max_iter : int, optional
        Maximum number of single-sample updates.

    Returns
    -------
    (w, converged)
        The final weights and 1 if the ``tol`` criterion was met, or 0 if
        training stopped because ``max_iter`` was reached (or there were no
        samples).
    """
    m = X.shape[0]
    check_w_after = 20

    w = w_init
    if m == 0:
        # Nothing to learn from; without this guard the loop below would
        # never advance `count` and would spin forever.
        return w, 0

    # Only the weights from the last checkpoint are needed for the
    # convergence test, so keep that single snapshot instead of the whole
    # update history.
    w_checkpoint = w_init
    count = 0
    while count < max_iter:
        # mix data
        arr = np.arange(m)
        np.random.shuffle(arr)
        for i in arr:
            xi = X[i, :]
            yi = Y[i]
            zi = sigmoid(sp.csr_matrix.dot(xi, w))
            # zi - yi is a 1x1 array; .item() extracts the scalar without
            # relying on the deprecated float(array) conversion.
            error = np.asarray(zi - yi).item()
            # no L2 reg
            w_new = w - lr * (error * xi.transpose())
            count += 1
            if count % check_w_after == 0:
                if np.linalg.norm(w_new - w_checkpoint) < tol:
                    return w_new, 1
                w_checkpoint = w_new
            w = w_new
            if count >= max_iter:
                # Stop mid-epoch so max_iter is an actual upper bound.
                break

    return w, 0

In [49]:
def find_the_best_LAMBDA(X_train, Y_train, lr):
    """Search for the LAMBDA with the lowest 5-fold cross-validated loss.

    The search has two stages: integer candidates 0..49 first, then a grid
    with step 0.001 around the best integer. Every candidate costs five
    training runs, so the full search is expensive.

    Note that the value of LAMBDA only matters to the extent that
    ``LogisticRegression_self`` and ``J_reg`` actually use it.

    Parameters
    ----------
    X_train : scipy.sparse.csr_matrix
        Design matrix, one sample per row.
    Y_train : array-like
        0/1 target for each row of ``X_train``.
    lr : float
        Learning rate forwarded to ``LogisticRegression_self``.

    Returns
    -------
    int or float
        The candidate LAMBDA with the smallest average validation loss.

    Raises
    ------
    ValueError
        If there are fewer samples than folds.
    """
    labels = np.asarray(Y_train)    # a plain list cannot be fancy-indexed
    num_rows, num_features = X_train.shape

    def cross_validation(num_folds, LAMBDA):
        """Average validation loss of ``LAMBDA`` over ``num_folds`` folds."""
        if num_rows < num_folds:
            raise ValueError("need at least as many samples as folds")

        row_ids = np.arange(num_rows)
        # Equal-sized contiguous folds; leftover rows go to the last fold.
        cut = num_rows - num_rows % num_folds
        valid_ids = np.split(row_ids[:cut], num_folds)
        valid_ids[-1] = np.append(valid_ids[-1], row_ids[cut:])

        aver_loss = 0.0
        for fold in valid_ids:
            train_ids = np.setdiff1d(row_ids, fold)
            # Same deterministic start for every candidate so that runs are
            # comparable.
            w_init = np.zeros((num_features, 1))
            w, _ = LogisticRegression_self(
                X_train[train_ids], labels[train_ids], w_init, lr, LAMBDA)
            # J_reg scores a single sample, so average it over the fold.
            fold_loss = sum(
                np.asarray(J_reg(X_train[j], labels[j], w, LAMBDA)).item()
                for j in fold
            )
            aver_loss += fold_loss / len(fold)
        return aver_loss / num_folds

    def range_scan(best_LAMBDA, minimum_loss, LAMBDA_values):
        """Return the best (LAMBDA, loss) seen so far after trying ``LAMBDA_values``."""
        for curr_LAMBDA in LAMBDA_values:
            curr_loss = cross_validation(5, curr_LAMBDA)
            if (minimum_loss > curr_loss):
                best_LAMBDA = curr_LAMBDA
                minimum_loss = curr_loss
        return best_LAMBDA, minimum_loss

    # Coarse pass over integers, then a 0.001-step pass around the winner.
    best_LAMBDA, minimum_loss = range_scan(0, 1e8, range(50))
    LAMBDA_values = [k * 1. / 1000 for k in range(max(0, best_LAMBDA-1)*1000, (best_LAMBDA+1)*1000, 1)]
    best_LAMBDA, minimum_loss = range_scan(best_LAMBDA, minimum_loss, LAMBDA_values)
    return best_LAMBDA

In [253]:
import time

# Train one binary (class-vs-rest) classifier per column of Y_train_mat and
# collect the weight vectors in `res`, so res[i] belongs to class i.
lr = 0.5
d = X_train_mat.shape[1]
res = []
t_start = time.time()

# NOTE(review): no random seed is set in this cell, so the random w_init and
# the shuffle order inside LogisticRegression_self differ from run to run.
for i in range(20):
    w_init = np.random.randn(d,1)
    # The trailing 1 is the LAMBDA argument. `check` is overwritten on every
    # pass, so after the loop it only reflects the last classifier.
    w, check = LogisticRegression_self(X_train_mat, Y_train_mat[:, i], w_init, lr, 1)
    res.append(w)
    
t_end = time.time()

In [254]:
# Wall-clock seconds spent training all 20 classifiers.
t_end - t_start

314.6293170452118

In [255]:
# Status flag returned for the last classifier trained (class 19) only:
# 1 means the tol criterion was met, 0 means it was not.
check

0

In [256]:
# Learned weight column for class 0.
res[0]

matrix([[-4.1381564 ],
        [ 0.6790387 ],
        [-0.37100309],
        ...,
        [ 0.82636725],
        [ 1.17523116],
        [-1.86885303]])

In [257]:
def predict(test_sample, label):
    """Return the class whose one-vs-rest classifier scores highest.

    Each of the 20 weight vectors in the notebook-level list ``res`` is
    applied to ``test_sample``; the class id with the largest sigmoid output
    is returned as an ``int``. ``label`` is accepted but not used.
    """
    scored = np.zeros((20, 2))
    scored[:, 0] = np.arange(20)    # column 0: class id, column 1: probability
    for class_id in range(20):
        activation = sp.csr_matrix.dot(test_sample, res[class_id])
        scored[class_id, 1] = float(sigmoid(activation))

    # Rank classes from most to least probable and take the winner.
    order = scored[:, 1].argsort()[::-1]
    ranked = scored[order]
    return int(ranked[0, 0])

In [258]:
# Smoke test of predict() on training row 200.
# NOTE(review): the label passed is that of row 0, not row 200; this has no
# effect on the result because predict() never reads `label`.
predict(X_train_mat[200], int(Y_train[0]))

0

In [259]:
def accuracy(X_mat, Y_dict):
    """Percentage of rows of ``X_mat`` for which predict() matches the label.

    ``Y_dict`` is indexed by row number and its entries are compared after
    conversion to ``int``. The result is a float between 0 and 100.
    """
    total = X_mat.shape[0]
    hits = sum(
        1
        for row in range(total)
        if predict(X_mat[row], Y_dict[row]) == int(Y_dict[row])
    )
    return hits * 100.0 / total

In [260]:
# Training-set accuracy (percent) of the hand-written one-vs-rest model.
print(accuracy(X_train_mat, Y_train))

79.79494431677567


In [261]:
# Test-set accuracy (percent) of the hand-written one-vs-rest model.
print(accuracy(X_test_mat, Y_test))

65.08231545406267


In [108]:
from sklearn.linear_model import LogisticRegression

In [239]:
# Baseline: fit scikit-learn's multinomial logistic regression on the same
# design matrix and time it.
# X_train_mat already carries the ones column added by convert_to_csr.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument; confirm against the installed version.
t = time.time()
model = LogisticRegression(solver = 'lbfgs', multi_class="multinomial", max_iter=10000)
model.fit(X_train_mat, Y_train)
# Elapsed fit time in seconds (shown as the cell output).
time.time() - t


18.019732236862183

In [240]:
# Coefficient row at index 19 of the fitted scikit-learn model.
model.coef_[19]


array([ 0.0126037 ,  0.06227873, -0.03081572, ..., -0.00037624,
       -0.00093023,  0.        ])

In [262]:
from sklearn import metrics
# Score the scikit-learn model on the TRAINING data (X_train_mat / Y_train),
# so the figures below are training metrics, not held-out performance.
y_pred = model.predict(X_train_mat)
# how did our model perform?
count_misclassified = (Y_train != y_pred).sum()
print('Misclassified samples: {}'.format(count_misclassified))
# Kept under its own name: binding the score to `accuracy` would replace the
# accuracy() function defined earlier in this notebook with a float and break
# any later call to it.
sklearn_train_accuracy = metrics.accuracy_score(Y_train, y_pred)
print('Accuracy: {:.2f}'.format(sklearn_train_accuracy))

Misclassified samples: 372
Accuracy: 0.97
