In [1]:
import numpy as np
import csv
import sys
def import_data_and_weights(test_X_file_path, weights_file_path):
    """Load a feature matrix and a weights array from two CSV files.

    Both files are parsed as comma-separated ``float64`` values. The first
    line of ``test_X_file_path`` is skipped as a header row; the weights file
    is read from its first line.

    Args:
        test_X_file_path: Path of the CSV file holding the feature rows.
        weights_file_path: Path of the CSV file holding the weights.

    Returns:
        A ``(test_X, weights)`` tuple of NumPy arrays.
    """
    shared_options = {"delimiter": ",", "dtype": np.float64}
    feature_matrix = np.genfromtxt(test_X_file_path, skip_header=1, **shared_options)
    weight_values = np.genfromtxt(weights_file_path, **shared_options)
    return feature_matrix, weight_values

def sigmoid(Z):
    """Return the logistic function ``1 / (1 + exp(-Z))`` of ``Z``.

    ``Z`` may be a scalar or a NumPy array; arrays are transformed
    element-wise.
    """
    denominator = 1 + np.exp(-Z)
    return 1 / denominator

def apply_one_hot_encoding(X):
    """One-hot encode a one-dimensional sequence of category values.

    The distinct values of ``X`` are sorted; the value at sorted position
    ``k`` is encoded as a row with a 1 in column ``k`` and 0 elsewhere.

    Args:
        X: Iterable of hashable, mutually comparable values.

    Returns:
        An integer NumPy array with one row per element of ``X`` and one
        column per distinct value.
    """
    categories = sorted(set(X))
    width = len(categories)
    slot_of = {value: slot for slot, value in enumerate(categories)}

    encoded_rows = []
    for value in X:
        row = [0] * width
        row[slot_of[value]] = 1
        encoded_rows.append(row)

    return np.array(encoded_rows, dtype=int)

def convert_given_cols_to_one_hot(X, column_indices):
    """Replace the given columns of ``X`` with their one-hot encodings.

    Columns not listed in ``column_indices`` are copied through unchanged and
    keep their relative order; each listed column is replaced, in place of
    the original column, by the block of columns returned by
    ``apply_one_hot_encoding``.

    Args:
        X: Two-dimensional NumPy array.
        column_indices: Column indices to encode. The slicing below walks
            left to right, so the indices are expected in ascending order.

    Returns:
        A new two-dimensional array; ``X`` itself is not modified.
    """
    # A throwaway zero column seeds the result (it also makes the output at
    # least float64); it is sliced off again before returning.
    pieces = [np.zeros([len(X), 1])]

    next_unprocessed = 0  # first column of X not yet copied into `pieces`
    for encode_at in column_indices:
        # Untouched columns lying between the previous encoded column and this one.
        pieces.append(X[:, next_unprocessed:encode_at])
        # The one-hot block standing in for column `encode_at`.
        pieces.append(apply_one_hot_encoding(X[:, encode_at]))
        next_unprocessed = encode_at + 1

    # Whatever remains to the right of the last encoded column.
    pieces.append(X[:, next_unprocessed:])

    combined = np.concatenate(pieces, axis=1)
    return combined[:, 1:]

def get_correlation_matrix(X):
    """Return the Pearson correlation matrix of the columns of ``X``.

    Entry ``[i][j]`` is ``sum((x_i - mean_i) * (x_j - mean_j))`` divided by
    ``m * std_i * std_j``, where ``m`` is the number of rows and the standard
    deviations are population standard deviations (``np.std`` default).

    The column means and standard deviations are computed once and the sums
    of products for all column pairs are obtained with a single matrix
    product, instead of recomputing the statistics for every pair.

    Args:
        X: Two-dimensional array-like with one row per sample and one column
            per variable.

    Returns:
        A symmetric ``(num_vars, num_vars)`` float array. Rows and columns
        belonging to a zero-variance column are NaN, because the denominator
        is zero for them.

    Raises:
        ValueError: If ``X`` is not two-dimensional.
    """
    data = np.asarray(X, dtype=float)
    if data.ndim != 2:
        raise ValueError("X must be a two-dimensional array")

    sample_count = data.shape[0]
    std_devs = data.std(axis=0)
    centered = data - data.mean(axis=0)

    # (centered.T @ centered)[i, j] == sum over rows of centered_i * centered_j
    product_sums = centered.T @ centered
    return product_sums / (sample_count * np.outer(std_devs, std_devs))

def select_features(corr_mat, T1, T2):
    """Pick feature indices using two correlation thresholds.

    Step 1 keeps every index ``i >= 1`` whose absolute correlation with
    index 0 exceeds ``T1``. Step 2 walks the kept indices in ascending order
    and, for each index that is still kept, drops every later kept index
    whose absolute correlation with it exceeds ``T2``.

    Args:
        corr_mat: Square correlation matrix indexable as ``corr_mat[i][j]``.
        T1: Minimum absolute correlation with index 0 (exclusive).
        T2: Maximum allowed absolute correlation between two selected
            features.

    Returns:
        The list of surviving indices in ascending order.
    """
    candidates = [
        idx for idx in range(1, len(corr_mat)) if abs(corr_mat[idx][0]) > T1
    ]

    dropped = []
    for position, keeper in enumerate(candidates):
        if keeper in dropped:
            continue
        for rival in candidates[position + 1:]:
            if rival in dropped:
                continue
            if abs(corr_mat[keeper][rival]) > T2:
                dropped.append(rival)

    return [idx for idx in candidates if idx not in dropped]

def replace_null_values_with_mean(X):
    """Fill every NaN in ``X`` with the mean of its column.

    Column means are computed while ignoring NaN entries. ``X`` is modified
    in place and also returned.

    Args:
        X: Two-dimensional float NumPy array.

    Returns:
        The same array object with its NaN entries replaced.
    """
    column_means = np.nanmean(X, axis=0)

    # Row and column coordinates of every missing entry.
    nan_rows, nan_cols = np.nonzero(np.isnan(X))

    # Each missing cell receives the mean of the column it sits in.
    X[nan_rows, nan_cols] = column_means[nan_cols]
    return X

def min_max_normalize(X, column_indices):
    """Rescale the given columns of ``X`` to the range [0, 1] in place.

    Each listed column is mapped through ``(value - min) / (max - min)``.
    A column whose values are all equal has ``max - min == 0``; instead of
    dividing by zero (which would fill the column with NaN) such a column is
    set to 0.

    Args:
        X: Two-dimensional NumPy array; it is modified in place.
        column_indices: Indices of the columns to rescale.

    Returns:
        The same array object ``X``.
    """
    for column_index in column_indices:
        column = X[:, column_index]
        lowest = np.min(column, axis=0)
        value_range = np.max(column, axis=0) - lowest
        if value_range == 0:
            # Constant column: nothing to scale, avoid 0/0.
            X[:, column_index] = 0
        else:
            X[:, column_index] = (column - lowest) / value_range
    return X

def compute_cost(X, Y, W, b, Lambda):
    """Return the L2-regularised cross-entropy cost of a logistic model.

    The activations are ``sigmoid(X . W + b)``. Activations that are exactly
    1 or 0 are replaced by 0.99999 / 0.00001 so the logarithms stay finite.

    Args:
        X: Feature matrix with one row per sample.
        Y: Target labels, shaped to broadcast against the activations.
        W: Weight array.
        b: Bias term.
        Lambda: Regularisation strength applied to the squared weights.

    Returns:
        The mean cross-entropy plus ``Lambda * sum(W ** 2) / (2 * M)``, where
        ``M`` is the number of rows of ``X``.
    """
    sample_count = len(X)
    activations = sigmoid(np.dot(X, W) + b)

    # Keep log() away from exact 0 and 1.
    activations[activations == 1] = 0.99999
    activations[activations == 0] = 0.00001

    log_likelihood = np.sum(
        Y * np.log(activations) + (1 - Y) * np.log(1 - activations)
    )
    data_term = (-1 / sample_count) * log_likelihood
    penalty_term = (Lambda * np.sum(np.square(W))) / (2 * sample_count)
    return data_term + penalty_term

def compute_weights_using_normal_equation(X, Y, Lambda):
    """Solve L2-regularised least squares in closed form.

    A column of ones is inserted in front of ``X`` for the intercept, and the
    weights are computed as ``inv(X^T X + Lambda * I') . X^T Y``, where ``I'``
    is the identity matrix with its top-left entry zeroed so the intercept is
    not penalised.

    Args:
        X: Two-dimensional feature array (without an intercept column).
        Y: Target values, one per row of ``X``.
        Lambda: Regularisation strength.

    Returns:
        The solved weights with the intercept first. Because the penalty term
        is built as an ``np.matrix``, the result is an ``np.matrix`` too.
    """
    design = np.insert(X, 0, np.ones(len(X)), axis=1)

    # Identity with a zero in the intercept slot: the bias is left unpenalised.
    penalty_mask = np.identity(len(design[0]))
    penalty_mask[0][0] = 0
    penalty = Lambda * np.matrix(penalty_mask)

    gram = np.dot(design.T, design) + penalty
    moment = np.dot(design.T, Y)
    return np.dot(np.linalg.inv(gram), moment)

def compute_gradients_using_regularization(X, Y, W, b, Lambda):
    """Return the gradients of the regularised logistic cost.

    The activations are the logistic function of ``X . W + b``, which matches
    the cost evaluated by ``compute_cost``. Using the raw linear output here
    would give the gradient of a squared-error objective instead, so gradient
    descent would not be descending the cost it uses as stopping criterion.

    Args:
        X: Feature matrix with one row per sample.
        Y: Target labels with the same shape as the activations.
        W: Weight array.
        b: Bias term.
        Lambda: Regularisation strength applied to the weights (not to ``b``).

    Returns:
        ``(dW, db)``: ``dW`` has the same shape as ``W``; ``db`` is a scalar.
    """
    m = len(X)
    Z = np.dot(X, W) + b
    # Logistic activation, written inline so the function only needs NumPy.
    A = 1 / (1 + np.exp(-Z))
    error = A - Y
    dW = 1 / m * (np.dot(X.T, error) + Lambda * W)
    db = 1 / m * np.sum(error)
    return dW, db

def predict_labels(X, W, b):
    """Predict 0/1 labels by thresholding the logistic output at 0.5.

    Args:
        X: Feature matrix with one row per sample.
        W: Weight array; ``np.dot(X, W)`` must be two-dimensional because the
            first row of the transposed activations is taken.
        b: Bias term.

    Returns:
        A one-dimensional integer array holding 1 where the activation is at
        least 0.5 and 0 elsewhere.
    """
    probabilities = sigmoid(np.dot(X, W) + b)
    first_output = probabilities.T[0]
    return (first_output >= 0.5).astype(int)
def train_data_for_class(train_X,train_Y,class_label):
    """Build a one-vs-rest copy of the training data for ``class_label``.

    Args:
        train_X: Feature data; returned as an independent copy.
        train_Y: Class labels.
        class_label: The label to treat as the positive class.

    Returns:
        ``(class_X, class_Y)`` where ``class_X`` is a copy of ``train_X`` and
        ``class_Y`` holds 1 where ``train_Y`` equals ``class_label`` and 0
        elsewhere. Neither input is modified.
    """
    features_copy = np.array(train_X, copy=True)
    is_target = np.array(train_Y, copy=True) == class_label
    return features_copy, np.where(is_target, 1, 0)
def optimize_weights_using_gradient_descent(X, Y, W, learning_rate , precision,Lambda):
    """Run gradient descent until the cost changes by less than ``precision``.

    The bias starts at 0. Every iteration updates ``W`` and the bias from the
    gradients returned by ``compute_gradients_using_regularization`` and then
    re-evaluates ``compute_cost``. The cost is printed every 10000 iterations
    and once more, with the iteration count, when the loop stops. There is no
    upper bound on the number of iterations.

    Args:
        X: Feature matrix.
        Y: Target labels.
        W: Initial weights.
        learning_rate: Step size applied to both gradients.
        precision: Stop once the absolute change in cost between consecutive
            iterations falls below this value.
        Lambda: Regularisation strength forwarded to the gradient and cost
            functions.

    Returns:
        ``(W, b)``: the final weights and bias.
    """
    bias = 0
    iteration = 0
    last_cost = 0  # the first comparison is made against a cost of 0

    while True:
        iteration += 1
        grad_W, grad_b = compute_gradients_using_regularization(X, Y, W, bias, Lambda)
        W = W - (learning_rate * grad_W)
        bias = bias - (learning_rate * grad_b)

        current_cost = compute_cost(X, Y, W, bias, Lambda)
        if iteration % 10000 == 0:
            print("cost=",current_cost,iteration)

        if abs(last_cost - current_cost) < precision:
            print(iteration,current_cost)
            return (W, bias)
        last_cost = current_cost
def train_model(X,Y,learning_rate , precision,Lambda):
    """Train with gradient descent and return the weights followed by the bias.

    ``Y`` is reshaped into a column vector, the weights start at zero, and
    ``optimize_weights_using_gradient_descent`` performs the optimisation.

    Args:
        X: Two-dimensional feature array.
        Y: Target labels, one per row of ``X``.
        learning_rate: Gradient-descent step size.
        precision: Convergence threshold on the change in cost.
        Lambda: Regularisation strength.

    Returns:
        A one-dimensional array: the flattened weights with the bias appended
        as the last element.
    """
    sample_count = len(X)
    feature_count = X.shape[1]

    column_Y = Y.reshape(sample_count, 1)
    initial_W = np.zeros((feature_count, 1))
    fitted_W, fitted_b = optimize_weights_using_gradient_descent(
        X, column_Y, initial_W, learning_rate, precision, Lambda
    )
    return np.append(fitted_W, fitted_b)
def save_model(weights,weights_filename):
    """Write ``weights`` to ``weights_filename`` as CSV, one row per entry.

    The file is opened with ``newline=''`` as the ``csv`` module requires;
    without it, platforms that translate ``\\n`` on write end up with an
    extra ``\\r`` in every line terminator. The ``with`` block closes the
    file, so no explicit ``close()`` is needed.

    Args:
        weights: Iterable of rows, each row an iterable of values.
        weights_filename: Path of the file to create or overwrite.
    """
    with open(weights_filename, 'w', newline='') as weights_file:
        csv.writer(weights_file).writerows(weights)
def _train_class_weights(train_X, train_Y, class_label):
    """Fit normal-equation weights for one class-vs-rest split.

    Runs the preprocessing and fitting steps the script applies to each
    class: relabel, fill NaNs with column means, one-hot encode columns 0 and
    3, min-max scale, select features, solve with Lambda = 0.1.

    Returns:
        ``(weights, selected, encoded_shape)``: the weights as a flat float64
        array (intercept first), the list of selected column indices, and the
        shape of the encoded matrix before feature selection.
    """
    X, Y = train_data_for_class(train_X, train_Y, class_label)
    X = replace_null_values_with_mean(X)
    X = convert_given_cols_to_one_hot(X, [0, 3])

    # Only columns holding at least one value outside [0, 1] are rescaled.
    out_of_range_cols = [
        col for col in range(X.shape[1])
        if np.any((X[:, col] > 1) | (X[:, col] < 0))
    ]
    X = min_max_normalize(X, out_of_range_cols)

    # NOTE(review): the correlation matrix is built from the features only,
    # so select_features() ranks columns by their correlation with feature
    # column 0 (not with the labels) and never selects column 0 itself.
    # Left as is because the saved weights depend on it; confirm the intent.
    corr_mat = get_correlation_matrix(X)
    selected = select_features(corr_mat, 0, 0.7)
    encoded_shape = X.shape

    # Keep just the selected columns, in ascending index order.
    X = X[:, selected]

    weights = compute_weights_using_normal_equation(X, Y, 0.1)
    # The solver returns a 2-D np.matrix; flatten it directly instead of
    # assigning 1x1 sub-matrices into scalar slots, which NumPy deprecates.
    return np.asarray(weights, dtype=np.float64).ravel(), selected, encoded_shape


if __name__ == "__main__":
    # Despite the loader's parameter names, these are the training features
    # and the training labels.
    train_X, train_Y = import_data_and_weights("train_X_pr.csv", "train_Y_pr.csv")

    weights_class_0, selected, encoded_shape = _train_class_weights(train_X, train_Y, 0)
    print(selected, encoded_shape)

    weights_class_1, selected, _ = _train_class_weights(train_X, train_Y, 1)
    print(selected)

    # One CSV row of weights per class, class 0 first.
    save_model([weights_class_0, weights_class_1], "WEIGHTS_FILE.csv")

[1, 2, 3, 4, 5, 7, 8, 9, 10] (712, 11)
[1, 2, 3, 4, 5, 7, 8, 9, 10]
