In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from time import time
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import matplotlib.animation as animation
import math

In [2]:
%matplotlib qt

In [3]:
def normalize(data):
    """Standardise ``data`` to zero mean and unit standard deviation.

    The mean and standard deviation are taken with ``np.mean`` / ``np.std``
    over the input as given; the input itself is not modified.

    Parameters
    ----------
    data : array-like
        Numeric values to standardise.

    Returns
    -------
    Same container type as ``data - scalar`` produces, holding
    ``(data - mean) / std``.
    """
    centre = np.mean(data)
    spread = np.std(data)
    return (data - centre) / spread

In [4]:
def inputData(x_path="./data/q4/q4x.dat", y_path="./data/q4/q4y.dat"):
    """Load the two-feature data set and its labels as one numeric array.

    Parameters
    ----------
    x_path : str, optional
        Whitespace-separated file whose first two columns are the features.
    y_path : str, optional
        Whitespace-separated file whose first column is the class name.

    Returns
    -------
    numpy.ndarray
        Array with one row per sample and three columns:
        normalised X1, normalised X2, and the label
        (0 for "Alaska", 1 for anything else).
    """
    # The separator is a regex; a raw string avoids Python's
    # invalid-escape-sequence warning while keeping the same pattern.
    whitespace = r"\s+"
    dfX = pd.read_csv(x_path, sep=whitespace, usecols=[0, 1], names=['X1', 'X2'])
    dfY = pd.read_csv(y_path, sep=whitespace, usecols=[0], names=['Y'])

    # Each feature is standardised independently.
    X1 = np.array(normalize(dfX["X1"])).reshape(-1, 1)
    X2 = np.array(normalize(dfX["X2"])).reshape(-1, 1)

    # Alaska is represented as 0, Canada (any other value) as 1.
    Y = np.where(np.asarray(dfY["Y"]) == "Alaska", 0, 1).reshape(-1, 1)

    # Join features and labels column-wise into a single (n, 3) array.
    return np.concatenate([X1, X2, Y], axis=1)

In [5]:
def plotData():
    """Scatter-plot both classes of the data set on the current pyplot figure.

    Returns
    -------
    tuple
        The two line handles returned by ``plt.plot``: (Alaska, Canada).
    """
    samples = inputData()
    labels = samples[:, 2]

    # Split the rows by class label (0 = Alaska, 1 = Canada).
    alaska_rows = samples[labels == 0]
    canada_rows = samples[labels == 1]

    # Alaska points: blue crosses.
    a, = plt.plot(alaska_rows[:, 0], alaska_rows[:, 1], "bX", label="Alaska")

    # Canada points: red circles.
    b, = plt.plot(canada_rows[:, 0], canada_rows[:, 1], "ro", label="Canada")

    # Axis labels, title and legend.
    plt.xlabel("X1 feature", color="r")
    plt.ylabel("X2 feature", color="r")
    plt.title("Data distribution")
    plt.legend()

    # Non-blocking so that subsequent cells keep executing.
    plt.show(block=False)
    return a, b

In [6]:
def cal_MU():
    """Compute the per-class mean of the two features.

    The label column (0 or 1) is used as a weight so that each sum only
    picks up the rows of one class, then the sum is divided by that
    class's sample count.

    Returns
    -------
    tuple of numpy.ndarray
        ``(MU0, MU1)``, each a (2, 1) column vector holding the mean of
        feature X1 and X2 for label 0 and label 1 respectively.
    """
    samples = inputData()
    labels = samples[:, 2]
    n_zero = np.count_nonzero(labels == 0)
    n_one = np.count_nonzero(labels == 1)

    feature_columns = (0, 1)
    # (1 - label) selects class-0 rows; label selects class-1 rows.
    mean_zero = [np.sum(samples[:, col] * (1 - labels)) / n_zero for col in feature_columns]
    mean_one = [np.sum(samples[:, col] * labels) / n_one for col in feature_columns]

    return np.array(mean_zero).reshape(-1, 1), np.array(mean_one).reshape(-1, 1)

In [7]:
def phi():
    """Return the fraction of samples whose label equals 1."""
    labels = inputData()[:, 2]
    positives = np.count_nonzero(labels == 1)
    total = labels.shape[0]
    return positives / total


In [8]:
def covariance():
    """Estimate the covariance matrix shared by both classes.

    Every sample is centred on the mean of its own class (rows labelled 0
    use MU0, all other rows use MU1) and the centred samples are combined
    as ``X.T @ X / n``.

    Returns
    -------
    numpy.ndarray
        The (2, 2) pooled covariance matrix.
    """
    MU0, MU1 = cal_MU()
    X1X2Y = inputData()

    # Column mask so it broadcasts against the (1, 2) row-vector means.
    is_class0 = (X1X2Y[:, 2] == 0).reshape(-1, 1)
    class_means = np.where(is_class0, MU0.T, MU1.T)

    # Vectorised residuals replace the per-row Python loop.
    X = X1X2Y[:, :2] - class_means
    return np.dot(X.T, X) / X.shape[0]
    

In [9]:
def find_X2_point(x1):
    """Return the x2 coordinate of the linear GDA decision boundary at ``x1``.

    With a shared covariance S, the log-odds of class 1 versus class 0 are

        log(phi / (1 - phi))
        + (MU1 - MU0)^T S^-1 x
        + 0.5 * (MU0^T S^-1 MU0 - MU1^T S^-1 MU1)

    The boundary is where this equals zero, i.e. ``a*x1 + b*x2 + c = 0``.
    The 0.5 factor and the sign of the quadratic terms matter whenever the
    two class means are not symmetric about the origin or the classes are
    unbalanced.

    Parameters
    ----------
    x1 : float or numpy.ndarray
        One or more x1 coordinates; arrays are handled by broadcasting.

    Returns
    -------
    numpy.ndarray
        The matching x2 value(s); shape (1, 1) for a scalar ``x1``.
        The result is infinite/NaN if the boundary is vertical (b == 0).
    """
    phiValue = phi()
    (MU0, MU1) = cal_MU()
    COV = covariance()

    # Invert once and reuse for every term.
    cov_inv = np.linalg.inv(COV)

    quad0 = np.dot(np.dot(MU0.T, cov_inv), MU0)  # MU0^T S^-1 MU0, shape (1, 1)
    quad1 = np.dot(np.dot(MU1.T, cov_inv), MU1)  # MU1^T S^-1 MU1, shape (1, 1)

    # a*x1 + b*x2 + c = 0
    c = 0.5 * (quad0 - quad1) + math.log(phiValue / (1 - phiValue))
    ab = np.dot((MU1 - MU0).T, cov_inv)  # shape (1, 2)
    a = ab[:, 0]
    b = ab[:, 1]

    x2 = -(c + a * x1) / b
    return x2

In [10]:
def LinearBoundry():
    """Plot both classes and the linear GDA decision boundary in a new figure.

    Returns
    -------
    tuple
        Line handles for (Alaska points, Canada points, decision boundary).
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)
    X1X2Y = inputData()
    labels = X1X2Y[:, 2]

    # Plotting the Alaska data (label 0).
    alaska = X1X2Y[labels == 0]
    a, = ax.plot(alaska[:, 0], alaska[:, 1], "bX", label="Alaska")

    # Plotting the Canada data (label 1).
    canada = X1X2Y[labels == 1]
    b, = ax.plot(canada[:, 0], canada[:, 1], "ro", label="Canada")

    # Plotting the hypothesis. find_X2_point reloads the data and refits the
    # model on every call, so evaluate it once for the whole x1 column
    # (it broadcasts over arrays) instead of once per sample.
    x1_values = X1X2Y[:, 0]
    x2_values = np.asarray(find_X2_point(x1_values)).reshape(-1)
    c, = ax.plot(x1_values, x2_values, "g", label="Decision Boundary")

    # Labelling the axes.
    ax.set_xlabel("X1 feature", color="r")
    ax.set_ylabel("X2 feature", color="r")
    ax.set_title("GDA Classification")
    ax.legend()

    # Non-blocking so that subsequent cells keep executing.
    plt.show(block=False)
    return a, b, c

In [11]:
# Scatter plot of the raw classes; keeps the two point-series handles.
c, d = plotData()
# Classes plus decision boundary; note that this rebinds `c` to the
# boundary line handle, replacing the handle stored on the line above.
a, b, c = LinearBoundry()