## Using Tensorflow to implement Logistic Regression model

In [1]:
%matplotlib inline

In [2]:
import numpy as np # linear algebra
import seaborn as sns
sns.set(style='whitegrid')
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import tensorflow as tf

 **Step 1: Read the data**

In [3]:
data = pd.read_csv('pulsar_stars.csv')

In [4]:
data.shape

(17898, 9)

In [5]:
data.head()

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


I want to do a binary classification, so keep the first 100 rows of data

Iris-setosa species is linearly separable from the other two, but the other two are not linearly separable from each other.To keep the species blance
Iris-setosa and Iris-versicolor are choosen

In [6]:
data = data[:10000]

In [7]:
data.shape

(10000, 9)

**Step 2:  Numerical processing**

* replace 'Iris-setosa' as 0
* replace 'Iris-versicolor' as 1

In [8]:
# iris.Species = iris.Species.replace(to_replace=['Iris-setosa', 'Iris-versicolor'], value=[0, 1])

In [9]:
# plt.scatter(iris[:50].SepalLengthCm, iris[:50].SepalWidthCm, label='Iris-setosa')
# plt.scatter(iris[51:].SepalLengthCm, iris[51:].SepalWidthCm, label='Iris-versicolo')
# plt.xlabel('SepalLength')
# plt.ylabel('SepalWidth')
# plt.legend(loc='best')

In [10]:
X = data.drop(labels=['target_class'], axis=1).values
y = data['target_class']

**Step 3: Split data** 

* trainset: 80%
* testset: 20%

In [11]:
# set seed for numpy and tensorflow
# set for reproducible results
seed = 5
np.random.seed(seed)
tf.set_random_seed(seed)

In [12]:
# set replace=False, Avoid double sampling
train_index = np.random.choice(len(X), round(len(X) * 0.8), replace=False)

In [13]:
# diff set
test_index = np.array(list(set(range(len(X))) - set(train_index)))
train_X = X[train_index]
train_y = y[train_index]
test_X = X[test_index]
test_y = y[test_index]

In [14]:
# Define the normalized function
def min_max_normalized(data):
    col_max = np.max(data, axis=0)
    col_min = np.min(data, axis=0)
    return np.divide(data - col_min, col_max - col_min)

**Step 4: Normalized processing**

In [15]:
# Normalized processing, must be placed after the data set segmentation, 
# otherwise the test set will be affected by the training set
train_X = min_max_normalized(train_X)
test_X = min_max_normalized(test_X)

**Step 5: Build the model framework**

In [16]:
# Begin building the model framework
# Declare the variables that need to be learned and initialization
# There are 4 features here, A's dimension is (4, 1)
w = tf.get_variable("W",[8,1], initializer= tf.contrib.layers.xavier_initializer())
b = tf.get_variable("b",[1], initializer= tf.zeros_initializer())
# init = tf.global_variables_initializer()
# sess = tf.Session()
# sess.run(init)

In [17]:
# Define placeholders
X = tf.placeholder(dtype=tf.float32, shape=[None, 8])
Y = tf.placeholder(dtype=tf.float32, shape=[None, 1])

In [18]:
# Declare the model you need to learn
mod = tf.matmul(X, w)+b
prediction = tf.nn.sigmoid(mod)

In [19]:
# Declare loss function
# Use the sigmoid cross-entropy loss function,
# first doing a sigmoid on the model result and then using the cross-entropy loss function
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=mod, labels= Y))

In [20]:
# Define the learning rate， batch_size etc.
learning_rate = 0.02
iter_num = 150000

In [21]:
# Define the optimizer
opt = tf.train.AdadeltaOptimizer(learning_rate)

In [22]:
# Define the goal
goal = opt.minimize(loss)

In [23]:
init = tf.global_variables_initializer()
cost_history = np.empty(shape=[1], dtype = float)
# Define the accuracy
# The default threshold is 0.5, rounded off directly
# prediction = tf.round(tf.sigmoid(mod))
# Bool into float32 type
# correct = tf.cast(tf.equal(prediction, target), dtype=tf.float32)
# Average
# accuracy = tf.reduce_mean(correct)
# End of the definition of the model framework

In [24]:
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(iter_num):
        _, c = sess.run([goal, loss], feed_dict={X: train_X , Y: train_y})
        print("Epoch:", '%04d' % (epoch_1), "cost=", "{:.9f}".format(c), 'W=', sess.run(w), 'b=', sess.run(b))
        cost_history = np.append(cost_history, c)
            
    correct_prediction = tf.to_float(tf.greater(mod, 0.5))
        
    accuracy = tf.reduce_mean(tf.to_float(tf.equal(Y, correct_prediction)))
        
    print("Train accuracy:", accuracy.eval({X: train_X , Y: train_y}))
    print("Test accuracy:", accuracy.eval({X: test_X,Y: test_y}))

ValueError: Cannot feed value of shape (8000,) for Tensor 'Placeholder_1:0', which has shape '(?, 1)'

In [None]:
# Start training model
# # Define the variable that stores the result
# loss_trace = []
# train_acc = []
# test_acc = []

**Step 6: Model training**

In [None]:
# # training model
# for epoch in range(iter_num):
#     # Generate random batch index
#     batch_index = np.random.choice(len(train_X), size=batch_size)
#     batch_train_X = train_X[batch_index]
#     batch_train_y = np.matrix(train_y[batch_index]).T
#     sess.run(goal, feed_dict={data: batch_train_X, target: batch_train_y})
#     temp_loss = sess.run(loss, feed_dict={data: batch_train_X, target: batch_train_y})
#     # convert into a matrix, and the shape of the placeholder to correspond
#     temp_train_acc = sess.run(accuracy, feed_dict={data: train_X, target: np.matrix(train_y).T})
#     temp_test_acc = sess.run(accuracy, feed_dict={data: test_X, target: np.matrix(test_y).T})
#     # recode the result
#     loss_trace.append(temp_loss)
#     train_acc.append(temp_train_acc)
#     test_acc.append(temp_test_acc)
#     # output
#     if (epoch + 1) % 300 == 0:
#         print('epoch: {:4d} loss: {:5f} train_acc: {:5f} test_acc: {:5f}'.format(epoch + 1, temp_loss,
#                                                                           temp_train_acc, temp_test_acc))

**Step 7: Visualization**

In [None]:
# Visualization of the results
# loss function
plt.plot(loss_trace)
plt.title('Cross Entropy Loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()

In [None]:
# accuracy
plt.plot(train_acc, 'b-', label='train accuracy')
plt.plot(test_acc, 'k-', label='test accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.title('Train and Test Accuracy')
plt.legend(loc='best')
plt.show()