In [78]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def get_data():
    df=pd.read_csv('ecommerce_data.csv')
    X=df.iloc[:,:-1].values
    y=df.iloc[:,-1].values
    X[:,1]=(X[:,1]-X[:,1].mean())/X[:,1].std()
    X[:,2]=(X[:,2]-X[:,2].mean())/X[:,2].std()
    N,D=X.shape
    # One hot encoding Categorical Variable
    X2=np.zeros((N,D+3)) # as 4 classes in categorical variable
    X2[:,:D-1]=X[:,:D-1] # copy all variables except Categorical variable
    for i in range(N):
        t=int(X[i,D-1]) # find the class of categorical variable for the index i
        X2[i,t+D-1]=1 # assign the ith row of (D-1+class of categorical variable) column as 1
    return X2,y
def get_binary_data():
    X,y=get_data()
    X2=X[y<=1]
    y2=y[y<=1]
    return X2,y2
def sigmoid(z):
    return (1/(1+np.exp(-z)))
def feed_forward(X,w,b):
    return sigmoid(X.dot(w) + b)
def classification_rate(y,y_pred):
    return np.mean(y==y_pred)
def cross_entropy(y,y_pred):
    E=0
    for i in range(len(y)):
        if y[i]==1:
            E-=np.log(y_pred[i])
        else:
            E-=np.log(1-y_pred[i])
    return E
def train_model(X,y,w,b,lr,epoch):
    cost=[]
    for i in range(epoch):
        w=w-lr*(X.T.dot((feed_forward(X,w,b)-y)))
        y_pred=feed_forward(X,w,b)
        cost.append(cross_entropy(y,y_pred))
    return y_pred,w,cost
if __name__=='__main__':
    X,y=get_binary_data()
    N,D=X.shape
    w=np.random.randn(D)
    tr_i=np.random.choice(N,int(np.round(0.7*N)))
    X_train=X[tr_i]
    y_train=y[tr_i]
    t_i=[i for i in range(N) if i not in tr_i]
    X_test=X[t_i]
    y_test=y[t_i]    
    b=0 # bias term
    p_x=feed_forward(X,w,b)
    print('Classification Rate: ',classification_rate(y,np.round(p_x)))
    p,w_n,c=train_model(X,y,w,b,0.001,1000)
    print('Classification Rate: ',classification_rate(y,np.round(p)))
    p_t,w_t,c_t=train_model(X_train,y_train,w,b,0.001,1000)
    print('Classification Rate for Test Sample: ',classification_rate(y_test,np.round(feed_forward(X_test,w_t,b))))



Classification Rate:  0.36683417085427134
Classification Rate:  0.9723618090452262
Classification Rate for Test Sample:  0.9646464646464646


In [80]:
y_test

array([0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
      dtype=int64)

In [81]:
y_train

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0], dtype=int64)

In [3]:
df.head()

Unnamed: 0,is_mobile,n_products_viewed,visit_duration,is_returning_visitor,time_of_day,user_action
0,1,0,0.65751,0,3,0
1,1,1,0.568571,0,2,1
2,1,0,0.042246,1,1,0
3,1,1,1.659793,1,1,2
4,0,1,2.014745,1,1,2


In [4]:
df.describe()

Unnamed: 0,is_mobile,n_products_viewed,visit_duration,is_returning_visitor,time_of_day,user_action
count,500.0,500.0,500.0,500.0,500.0,500.0
mean,0.486,0.854,1.05588,0.518,1.588,0.748
std,0.500305,1.046362,0.976711,0.500176,1.121057,0.89336
min,0.0,0.0,0.000141,0.0,0.0,0.0
25%,0.0,0.0,0.32855,0.0,1.0,0.0
50%,0.0,1.0,0.804717,1.0,2.0,0.0
75%,1.0,1.0,1.499518,1.0,3.0,1.0
max,1.0,4.0,6.368775,1.0,3.0,3.0


In [28]:
Z=np.zeros((N,4))

In [29]:
Z

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [34]:
Z[np.arange(N),X[:,D-1].astype(np.int32)]=1

In [35]:
Z

array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]])

In [37]:
(X2[:,-4:]-Z).sum()

0.0