In [1]:
import math
import datetime
import sys
import numpy as np

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
def loadDataSet(file_name, label_existed_flag):
    """Load a comma-separated numeric text file into NumPy arrays.

    Every non-blank line is split on ',' and each field is converted with
    float().

    Parameters
    ----------
    file_name : str
        Path of the text file to read.
    label_existed_flag : int
        When equal to 1, the last field of each line is taken as the label
        and the remaining fields as features. For any other value every
        field is a feature and no labels are collected.

    Returns
    -------
    feats : numpy.ndarray
        One row of feature values per data line.
    labels : numpy.ndarray
        One label per data line; empty when label_existed_flag != 1.

    Raises
    ------
    ValueError
        If a field of a non-blank line cannot be converted to float.
    """
    feats = []
    labels = []
    # The context manager closes the file even if float() raises part-way.
    with open(file_name) as fr:
        # Iterate the handle directly instead of materialising every line.
        for line in fr:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no data.
                continue
            values = [float(field) for field in line.split(',')]
            if label_existed_flag == 1:
                feats.append(values[:-1])
                labels.append(values[-1])
            else:
                feats.append(values)
    return np.array(feats), np.array(labels)

In [4]:
# Relative paths of the files this notebook reads and writes.
train_file, test_file = "../data/train_data.txt", "../data/test_data.txt"
predict_file = "../projects/student/result.txt"

In [5]:
%%time
# Training file: the last column of each row is the label (flag = 1).
X_train, y_train = loadDataSet(file_name=train_file, label_existed_flag=1)

CPU times: user 3.26 s, sys: 207 ms, total: 3.46 s
Wall time: 3.46 s


In [6]:
%%time
# Test features are loaded with every column treated as a feature (flag = 0);
# the matching labels are read from a separate answer file (flag = 1).
answer_file = "../projects/student/answer.txt"
X_test, _ = loadDataSet(file_name=test_file, label_existed_flag=0)
_, y_test = loadDataSet(file_name=answer_file, label_existed_flag=1)

CPU times: user 795 ms, sys: 47.9 ms, total: 843 ms
Wall time: 842 ms


**Logistic Regression**

In [7]:
from sklearn.linear_model import LogisticRegression
import time
# Result accumulators; later cells append one entry per solver, in the same
# order as the labels in `solvers`.
#   *_max_i_100 : fits that leave max_iter unset
#   *_conv      : fits that pass an explicit max_iter
score_max_i_100, times_max_i_100 = [], []
score_conv, times_conv = [], []
# Row prefixes used when the summaries are printed.
solvers = ["Newton-cg-> ","L-BFGS-> ", "SAG-> ","SAGA-> "]

Maximum Iterations = 100 (the scikit-learn default, used when `max_iter` is not passed)

In [8]:
# Fit logistic regression with the "newton-cg" solver, leaving max_iter unset,
# and record how long fit() takes.
logisticRegr = LogisticRegression(random_state=0, solver='newton-cg')
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-fit.
start_time = time.perf_counter()
logisticRegr.fit(X_train, y_train)
times_max_i_100.append(time.perf_counter() - start_time)

In [9]:
# Accuracy of the most recently fitted model on the test set; shown and then
# stored for the summary table.
score = logisticRegr.score(X_test, y_test)
print(score)
score_max_i_100.append(score)

0.845


In [10]:
# Fit logistic regression with the "lbfgs" solver, leaving max_iter unset,
# and record how long fit() takes. The recorded output of this cell shows the
# solver hitting the iteration limit before converging.
logisticRegr = LogisticRegression(random_state=0, solver='lbfgs')
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-fit.
start_time = time.perf_counter()
logisticRegr.fit(X_train, y_train)
times_max_i_100.append(time.perf_counter() - start_time)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [11]:
# Accuracy of the most recently fitted model on the test set; shown and then
# stored for the summary table.
score = logisticRegr.score(X_test, y_test)
print(score)
score_max_i_100.append(score)

0.8445


In [12]:
# Fit logistic regression with the "sag" solver, leaving max_iter unset,
# and record how long fit() takes.
logisticRegr = LogisticRegression(random_state=0, solver='sag')
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-fit.
start_time = time.perf_counter()
logisticRegr.fit(X_train, y_train)
times_max_i_100.append(time.perf_counter() - start_time)

In [13]:
# Accuracy of the most recently fitted model on the test set; shown and then
# stored for the summary table.
score = logisticRegr.score(X_test, y_test)
print(score)
score_max_i_100.append(score)

0.8435


In [14]:
# Fit logistic regression with the "saga" solver, leaving max_iter unset,
# and record how long fit() takes.
logisticRegr = LogisticRegression(random_state=0, solver='saga')
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-fit.
start_time = time.perf_counter()
logisticRegr.fit(X_train, y_train)
times_max_i_100.append(time.perf_counter() - start_time)

In [15]:
# Accuracy of the most recently fitted model on the test set; shown and then
# stored for the summary table.
score = logisticRegr.score(X_test, y_test)
print(score)
score_max_i_100.append(score)

0.8435


In [16]:
# One summary row per solver: accuracy as a percentage and fit time in seconds.
for i, solver_label in enumerate(solvers):
    accuracy_pct = round(100*score_max_i_100[i], 2)
    fit_seconds = round(times_max_i_100[i], 4)
    print(solver_label, "Accuracy:", accuracy_pct, " Time: ", fit_seconds)

Newton-cg->  Accuracy: 84.5  Time:  2.4787
L-BFGS->  Accuracy: 84.45  Time:  1.1379
SAG->  Accuracy: 84.35  Time:  2.1967
SAGA->  Accuracy: 84.35  Time:  4.2073


**Minimum Iterations for Convergence**

In [17]:
# Fit logistic regression with the "newton-cg" solver and max_iter=37, and
# record how long fit() takes.
# NOTE(review): 37 is presumably the smallest max_iter at which this solver
# converged on this data — confirm.
logisticRegr = LogisticRegression(random_state=0, max_iter=37, solver='newton-cg')
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-fit.
start_time = time.perf_counter()
logisticRegr.fit(X_train, y_train)
times_conv.append(time.perf_counter() - start_time)

In [18]:
# Accuracy of the most recently fitted model on the test set; shown and then
# stored for the explicit-max_iter summary table.
score = logisticRegr.score(X_test, y_test)
print(score)
score_conv.append(score)

0.845


In [20]:
# Fit logistic regression with the "lbfgs" solver and max_iter=3045, and
# record how long fit() takes.
# NOTE(review): 3045 is presumably the smallest max_iter at which this solver
# converged on this data — confirm.
logisticRegr = LogisticRegression(random_state=0, max_iter=3045, solver='lbfgs')
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-fit.
start_time = time.perf_counter()
logisticRegr.fit(X_train, y_train)
times_conv.append(time.perf_counter() - start_time)

In [21]:
# Accuracy of the most recently fitted model on the test set; shown and then
# stored for the explicit-max_iter summary table.
score = logisticRegr.score(X_test, y_test)
print(score)
score_conv.append(score)

0.8455


In [22]:
# Fit logistic regression with the "sag" solver and max_iter=689, and
# record how long fit() takes.
# NOTE(review): 689 is presumably the smallest max_iter at which this solver
# converged on this data — confirm.
logisticRegr = LogisticRegression(random_state=0, max_iter=689, solver='sag')
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-fit.
start_time = time.perf_counter()
logisticRegr.fit(X_train, y_train)
times_conv.append(time.perf_counter() - start_time)

In [23]:
# Accuracy of the most recently fitted model on the test set; shown and then
# stored for the explicit-max_iter summary table.
score = logisticRegr.score(X_test, y_test)
print(score)
score_conv.append(score)

0.8435


In [24]:
# Fit logistic regression with the "saga" solver and max_iter=1009, and
# record how long fit() takes.
# NOTE(review): 1009 is presumably the smallest max_iter at which this solver
# converged on this data — confirm.
logisticRegr = LogisticRegression(random_state=0, max_iter=1009, solver='saga')
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-fit.
start_time = time.perf_counter()
logisticRegr.fit(X_train, y_train)
times_conv.append(time.perf_counter() - start_time)

In [25]:
# Accuracy of the most recently fitted model on the test set; shown and then
# stored for the explicit-max_iter summary table.
score = logisticRegr.score(X_test, y_test)
print(score)
score_conv.append(score)

0.8435


In [26]:
# One summary row per solver: accuracy as a percentage and fit time in seconds.
for i, solver_label in enumerate(solvers):
    accuracy_pct = round(100*score_conv[i], 2)
    fit_seconds = round(times_conv[i], 4)
    print(solver_label, "Accuracy:", accuracy_pct, " Time: ", fit_seconds)

Newton-cg->  Accuracy: 84.5  Time:  2.4577
L-BFGS->  Accuracy: 84.55  Time:  1.6309
SAG->  Accuracy: 84.35  Time:  2.176
SAGA->  Accuracy: 84.35  Time:  4.1934


**Using Bigger Training Set**

# Split the dataset: 5% held out for testing, 95% used for training.
# NOTE(review): neither `train_test_split` nor `digits` is imported or defined
# in the visible part of this notebook — presumably
# sklearn.model_selection.train_test_split and the result of
# sklearn.datasets.load_digits(); confirm and add them before running.
# NOTE(review): this rebinds y_train and y_test, replacing the labels loaded
# from the data files earlier in the notebook.
x_train, x_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.05, random_state=0)

# Accuracy per solver: index 0 = fit with max_iter unset, index 1 = fit with an
# explicit max_iter.
score_sag = []
score_saga = []

# "sag" solver, multinomial loss, max_iter left unset.
logisticRegr = LogisticRegression(random_state=0, solver='sag',multi_class='multinomial')
logisticRegr.fit(x_train, y_train)

# Accuracy on the held-out 5%.
score = logisticRegr.score(x_test, y_test)
score_sag.append(score)
print(score)

# "sag" solver, multinomial loss, max_iter=930.
# NOTE(review): 930 is presumably the iteration count needed for convergence on
# this split — confirm.
logisticRegr = LogisticRegression(random_state=0, max_iter = 930, solver='sag',multi_class='multinomial')
logisticRegr.fit(x_train, y_train)

# Accuracy on the held-out 5%.
score = logisticRegr.score(x_test, y_test)
score_sag.append(score)
print(score)

# "saga" solver, multinomial loss, max_iter left unset.
logisticRegr = LogisticRegression(random_state=0, solver='saga',multi_class='multinomial')
logisticRegr.fit(x_train, y_train)

# Accuracy on the held-out 5%.
score = logisticRegr.score(x_test, y_test)
score_saga.append(score)
print(score)

# "saga" solver, multinomial loss, max_iter=1173.
# NOTE(review): 1173 is presumably the iteration count needed for convergence
# on this split — confirm.
logisticRegr = LogisticRegression(random_state=0, max_iter = 1173, solver='saga',multi_class='multinomial')
logisticRegr.fit(x_train, y_train)

# Accuracy on the held-out 5%.
score = logisticRegr.score(x_test, y_test)
score_saga.append(score)
print(score)

# Report both accuracies per solver as percentages rounded to two decimals.
print("SAG:")
print("Accuracy(without convergeance):",round(100*score_sag[0],2))
print("Accuracy(with convergeance):",round(100*score_sag[1],2))
print("SAGA:")
print("Accuracy(without convergeance):",round(100*score_saga[0],2))
print("Accuracy(with convergeance):",round(100*score_saga[1],2))