In [None]:

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import math
import random
%matplotlib inline


def split_dataset(dataset, split_ratio):
    """Randomly partition *dataset* into training rows and test rows.

    ``int(len(dataset) * split_ratio)`` rows are drawn one at a time, without
    replacement, using the module-level ``random`` generator.  Whatever is
    left over becomes the test part and keeps its original relative order.
    The input itself is not modified (a shallow list copy is consumed).

    Returns:
        A two-element list ``[training_rows, test_rows]`` of Python lists.
    """
    wanted = int(len(dataset) * split_ratio)
    remaining = list(dataset)
    picked = []
    for _ in range(wanted):
        # Pop a uniformly chosen index so no row can be selected twice.
        picked.append(remaining.pop(random.randrange(len(remaining))))
    return [picked, remaining]



def mean(numbers):
    """Return the arithmetic mean of *numbers* as a float.

    Raises ZeroDivisionError when *numbers* is empty.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

def stdev(numbers):
    """Return the sample standard deviation of *numbers*.

    The squared deviations from the arithmetic mean are summed and divided
    by ``len(numbers) - 1`` (Bessel's correction) before the square root is
    taken, so at least two values are needed for a defined result.
    """
    center = sum(numbers) / float(len(numbers))
    squared_deviations = [(value - center) ** 2 for value in numbers]
    degrees_of_freedom = float(len(numbers) - 1)
    return math.sqrt(sum(squared_deviations) / degrees_of_freedom)


def categorize_into_classes(dataset):
    """Group the feature rows of *dataset* by class label.

    The last column of the 2-D array *dataset* is read as the class label.
    For every distinct label, the rows carrying it are collected and the
    label column is stripped off.

    Returns:
        A dict mapping each distinct label to a freshly copied 2-D array of
        that class's feature columns (everything except the last column).
    """
    labels = dataset[:, -1]
    return {
        label: dataset[labels == label][:, :-1].copy()
        for label in np.unique(labels)
    }




def generate_stats(dataset):
    """Compute ``[mean, standard deviation]`` for every column of *dataset*.

    Every column of the 2-D array is treated as a feature; no column is
    dropped here.  A standard deviation that comes out as exactly zero is
    replaced by a small positive constant so later code can divide by the
    variance.

    Returns:
        A list with one ``[mean, std_dev]`` pair per column, in column order.
    """
    column_stats = []
    for column_index in range(np.size(dataset, 1)):
        values = dataset[:, column_index]
        center = mean(values)
        spread = stdev(values)
        # A constant column has zero spread; substitute a tiny floor value.
        column_stats.append([center, spread if spread != 0 else 0.0000022])
    return column_stats

def generate_stats_by_class(dataset):
    """Compute per-feature statistics separately for every class.

    The rows of *dataset* are grouped by label with
    ``categorize_into_classes`` and each group is summarised with
    ``generate_stats``.

    Returns:
        A dict mapping each class label to that class's list of
        ``[mean, std_dev]`` pairs.
    """
    return {
        label: generate_stats(feature_rows)
        for label, feature_rows in categorize_into_classes(dataset).items()
    }

def calcuate_parameters(sigma,sigma_0,u_0,u_nbar,n):
    """Combine a prior and a sample mean for one feature dimension.

    This has to be called once per dimension.  Both ``sigma`` and
    ``sigma_0`` are squared inside the function, i.e. they are treated as
    standard deviations rather than variances.

    Args:
        sigma: spread of the samples (squared internally).
        sigma_0: spread of the prior (squared internally).
        u_0: prior mean.
        u_nbar: sample mean.
        n: weight applied to the prior variance next to the sample mean.

    Returns:
        A tuple ``(sigma_n_square, u_n)`` where
        ``sigma_n_square = sigma^2 * sigma_0^2 / (sigma^2 + n * sigma_0^2)``
        and
        ``u_n = (n * sigma_0^2 * u_nbar + sigma^2 * u_0) / (sigma^2 + n * sigma_0^2)``.
    """
    sample_var = math.pow(sigma, 2)
    prior_var = math.pow(sigma_0, 2)

    # Both results share the same denominator, so build it once.
    weighted_prior_var = n * prior_var
    denominator = sample_var + weighted_prior_var

    posterior_var = (sample_var * prior_var) / denominator
    posterior_mean = (weighted_prior_var * u_nbar + sample_var * u_0) / denominator
    return posterior_var, posterior_mean


def calc_class_cond_probability(sigma_n_square, u_n, sigma, x_i):
    """Return the Gaussian log-density of ``x_i`` for one feature dimension.

    The density is a normal distribution with mean ``u_n`` and variance
    ``sigma**2 + sigma_n_square``:

        log p(x_i) = -0.5 * log(2 * pi * var) - (x_i - u_n)**2 / (2 * var)

    Args:
        sigma_n_square: variance of the estimated mean (already squared).
        u_n: estimated mean.
        sigma: spread of the samples; squared internally.
        x_i: the feature value being scored.

    Returns:
        The natural-log density as a float.
    """
    variance = math.pow(sigma, 2) + sigma_n_square
    squared_deviation = math.pow(x_i - u_n, 2)

    # The exponent of a normal density is -(x - mu)^2 / (2 * var); the
    # factor of 2 was previously missing, which over-penalised distance
    # from the mean relative to the normalisation term.
    return -0.5 * math.log(2 * math.pi * variance) - squared_deviation / (2 * variance)


def calculate_all_probabilities(class_wise_stats, sigma_0_v, u_0_v, input_vector, n): #the vectors are prior vectors
    """Score *input_vector* against every class.

    For each class, every feature dimension is processed in turn: the
    class's ``[mean, std_dev]`` pair is combined with the prior entries
    ``sigma_0_v[i]`` / ``u_0_v[i]`` via ``calcuate_parameters``, and the
    resulting log-density of ``input_vector[i]`` is added to the class's
    running total.  No class-prior term is added.

    Returns:
        A dict mapping each class label to the sum of its per-feature
        log-density values (0 when a class has no feature statistics).
    """
    scores = {}
    for label, feature_stats in class_wise_stats.items():
        running_total = 0
        for dim, (sample_mean, sample_std) in enumerate(feature_stats):
            posterior_var, posterior_mean = calcuate_parameters(
                sample_std, sigma_0_v[dim], u_0_v[dim], sample_mean, n
            )
            running_total += calc_class_cond_probability(
                posterior_var, posterior_mean, sample_std, input_vector[dim]
            )
        scores[label] = running_total
    return scores
    
def predict(class_wise_stats, sigma_0_v, u_0_v, input_vector, n):
    """Return the class label with the highest score for *input_vector*.

    Scores come from ``calculate_all_probabilities``.  When several classes
    tie for the highest score, the one encountered first wins.  ``None`` is
    returned when there are no classes at all.
    """
    scores = calculate_all_probabilities(class_wise_stats, sigma_0_v, u_0_v, input_vector, n)
    # max() keeps the first maximal key, matching a strict ">" scan.
    return max(scores, key=scores.get, default=None)


def get_predictions(class_wise_stats,sigma_0_v, u_0_v, test_vectors, n=None):
    """Predict a class label for every vector in *test_vectors*.

    Args:
        class_wise_stats: dict mapping class label to a list of
            ``[mean, std_dev]`` pairs, one per feature.
        sigma_0_v: prior spread per feature.
        u_0_v: prior mean per feature.
        test_vectors: sequence of feature vectors to classify.
        n: sample count forwarded to ``predict`` (and from there into the
            parameter update).  In that update ``n`` weights the sample
            mean against the prior, so it should normally be the number of
            observations the class statistics were computed from.  When
            omitted it falls back to ``len(test_vectors)``, which is what
            this function always used before the parameter existed; note
            that with this fallback the predicted labels depend on how many
            vectors are classified in one call.

    Returns:
        A list of predicted labels, in the same order as *test_vectors*.
    """
    if n is None:
        # Backward-compatible default: the size of the batch being scored.
        n = len(test_vectors)
    return [
        predict(class_wise_stats, sigma_0_v, u_0_v, vector, n)
        for vector in test_vectors
    ]
    
    


def getAccuracy(test_dataset, predictions):
    """Return the percentage of rows whose label was predicted correctly.

    The true label of each row is its last element; it is compared with the
    prediction at the same index.

    Returns:
        A float between 0.0 and 100.0.  Raises ZeroDivisionError when
        *test_dataset* is empty.
    """
    hits = sum(
        1
        for position, row in enumerate(test_dataset)
        if row[-1] == predictions[position]
    )
    return (hits / float(len(test_dataset))) * 100.0

# --- Load the data ---------------------------------------------------------
orig_dataset = pd.read_csv("RainInAustralia.csv")
new_dataset = np.array(orig_dataset)

# --- Random train/test split -----------------------------------------------
split_ratio = 0.89
training_set, test_dataset = (
    np.array(part) for part in split_dataset(new_dataset, split_ratio)
)

# --- Prior vectors (one entry per feature column) ----------------------------
# Alternative prior spread; the original author noted ~90% accuracy with it:
# sigma_0_v = [8,8,1,1,8,8,8,8,8,8,18,8,8]
# Active prior spread; the original author noted ~86% accuracy with it.
sigma_0_v = [5,1,1,1,0,8,0,8,8,8,18,8,8]
u_0_v = [5,0,4,0,0,0,10,0,0,20,0,5,0]

# --- Fit per-class statistics, classify the held-out rows, score ------------
class_wise_stats = generate_stats_by_class(training_set)

# The last column holds the label, so it is cut off before prediction.
predictions = get_predictions(
    class_wise_stats,
    sigma_0_v,
    u_0_v,
    test_vectors=test_dataset[:, :-1],
)

getAccuracy(test_dataset, predictions)
