In [166]:
import random
import sys
import matplotlib.pyplot as plt
from numpy.core.numeric import NaN
import pylab
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from scipy.stats import multivariate_normal
import matplotlib.pyplot as plt

This notebook shows the prediction of clusters for rows with missing values.
600 of 2000 rows contain missing values up to 5 dimensions. (70% safe rows / 30% unsafe rows)


In [167]:
MISSING_VALUE = 1000000
CLUSTER_NUMBER = 4
GAMMA = 10

# preprocess data
with open('wifi_localization.txt') as f:
    file = f.readlines()
data = []
for row in file:
    data.append(([int(item) for item in row.split("\t")[:-1]],
                 int(row.split("\t")[-1].split("\n")[0])-1))
random.shuffle(data)
points = np.array([item[0] for item in data])



In [168]:
def delete_specific_values_in_one_dimension(points, column, number_of_deleted_values):
    for i in range(number_of_deleted_values):
        position_of_array_deleted_in_array = random.randint(0, len(points) - 1)
        position_of_value_deleted_in_array = column
        points[position_of_array_deleted_in_array][position_of_value_deleted_in_array] = MISSING_VALUE
    return points


def delete_values_in_data(points, number_of_deleted_values):
    for i in range(number_of_deleted_values):
        position_of_array_deleted_in_array = random.randint(0, len(points) - 1)
        position_of_value_deleted_in_array = random.randint(
        0, len(points[position_of_array_deleted_in_array]) - 1)
        points[position_of_array_deleted_in_array][position_of_value_deleted_in_array] = MISSING_VALUE
    return points

def delete_random_values_in_data(points):
    for i in range(len(points)):
        position_of_value_deleted_in_array = random.randint(
        0, len(points[i]) - 1)
        points[i][position_of_value_deleted_in_array] = MISSING_VALUE
    return points

## this function can be configured to change the number of delete dimensions (standard is deleting up to five dimensions)
def delete_up_to_five_dimensions_in_data(points):
    for i in range(len(points)):
        position_of_value_deleted_in_array = random.randint(
        0, len(points[i]) - 1)
        points[i][position_of_value_deleted_in_array] = MISSING_VALUE

        position_of_value_deleted_in_array = random.randint(
        0, len(points[i]) - 1)
        points[i][position_of_value_deleted_in_array] = MISSING_VALUE

        position_of_value_deleted_in_array = random.randint(
        0, len(points[i]) - 1)
        points[i][position_of_value_deleted_in_array] = MISSING_VALUE

        position_of_value_deleted_in_array = random.randint(
        0, len(points[i]) - 1)
        points[i][position_of_value_deleted_in_array] = MISSING_VALUE

        position_of_value_deleted_in_array = random.randint(
        0, len(points[i]) - 1)
        points[i][position_of_value_deleted_in_array] = MISSING_VALUE
    return points

In [169]:
# delete all data rows with incomplete data from the hole dataset
# data: numpy.array
# returns a numpy.array only with complete data rows
def split_data_in_completes(data):
    # select only rows with complete data and save it in completes
    completes = np.array(data)
    row = 0
    while row < len(completes):
        column = 0
        while column < len(completes[row]):
            if completes[row][column] == MISSING_VALUE:
                completes = np.delete(
                    completes, row, 0)
                row -= 1
            column += 1
        row += 1
    return completes


In [170]:
# delete all data rows with complete data from the hole dataset
# data: numpy.array
# returns a numpy.array only with incomplete data rows
def split_data_in_incompletes(data):
    incompletes = np.array(data)
    row = 0
    while row < len(incompletes):
        column = 0
        delete = True
        # Search for missing value
        while column < len(incompletes[row]):
            if incompletes[row][column] == MISSING_VALUE:
                delete = False
            column += 1
        # If there was a missing value found, delete the row
        if delete:
            incompletes = np.delete(
                incompletes, row, 0)
            row -= 1
        row += 1
    return incompletes

In [171]:
# calculates the means subspace of safe dimensions in a incomplete data row
# row: one row in a numpy.array
# gmm_means: all means of the gmm
# returns a numpy.array with the length of the safe dimensions and their means from the gmm
def calculate_subspace_means_safe_dimensions(row, gmm_means):
    cluster = 0
    while cluster < CLUSTER_NUMBER:
        index_unsafe_dimensions = [index for index in range(len(row)) if MISSING_VALUE == row[index]]
        subspace_means_safe_dimensions = np.delete(gmm_means[cluster],index_unsafe_dimensions, axis=0)
        cluster += 1
    return subspace_means_safe_dimensions


In [172]:
# calculates the covariances subspace of safe dimensions in a incomplete data row
# row: one row in a numpy.array
# gmm_covariances: all covariances of the gmm
# returns a numpy.array with the length of the safe dimensions and their covariances from the gmm
def calculate_subspace_covariances_safe_dimensions(row, gmm_covariances):
    cluster = 0
    while cluster < CLUSTER_NUMBER:
        index_unsafe_dimensions = [index for index in range(len(row)) if MISSING_VALUE == row[index]]
        subspace_covariances_safe_dimensions = np.delete(np.diag(gmm_covariances[cluster]), index_unsafe_dimensions, axis=1)
        subspace_covariances_safe_dimensions = np.delete(subspace_covariances_safe_dimensions, index_unsafe_dimensions, axis=0)
        cluster += 1
    return subspace_covariances_safe_dimensions


In [173]:
# calculates the means subspace of unsafe dimensions in a incomplete data row
# row: one row in a numpy.array
# gmm_means: all means of the gmm
# returns a numpy.array with the length of the unsafe dimensions and their means from the gmm
def calculate_subspace_means_unsafe_dimensions(row, gmm_means):
    cluster = 0
    while cluster < CLUSTER_NUMBER:
        index_safe_dimensions = [index for index in range(len(row)) if MISSING_VALUE != row[index]]
        # neuer Erwartungswert
        subspace_means_unsafe_dimensions = np.delete(gmm_means[cluster], index_safe_dimensions,axis=0)
        cluster += 1
    return subspace_means_unsafe_dimensions


In [174]:
# calculates the covariances subspace of unsafe dimensions in a incomplete data row
# row: one row in a numpy.array
# gmm_covariances: all covariances of the gmm
# returns a numpy.array with the length of the unsafe dimensions and their covariances from the gmm
def calculate_subspace_covariances_unsafe_dimensions(row, gmm_covariances):
    cluster = 0
    while cluster < CLUSTER_NUMBER:
        index_safe_dimensions = [index for index in range(len(row)) if MISSING_VALUE != row[index]]
        # neue Kovarianzmatrix
        subspace_covariances_unsafe_dimensions = np.delete(np.diag(gmm_covariances[cluster]), index_safe_dimensions, axis=1)
        subspace_covariances_unsafe_dimensions = np.delete(subspace_covariances_unsafe_dimensions, index_safe_dimensions, axis=0)
        cluster += 1
    return subspace_covariances_unsafe_dimensions


In [175]:
# calculates the new means for the new gmm (new_means is a list of all subspace_means_unsafe_dimensions)
# row: one row in a numpy.array
# gmm_means: all means of the gmm
# returns a list of numpy.arrays with the length of the safe dimensions and their means from the gmm
def calculate_new_means(row, gmm_means):
    cluster = 0
    new_means = np.zeros((1,np.count_nonzero(row == MISSING_VALUE)))
    while cluster < CLUSTER_NUMBER:
        index_safe_dimensions = [index for index in range(len(row)) if MISSING_VALUE != row[index]]
        # new mean
        subspace_means_unsafe_dimensions = np.delete(gmm_means[cluster], index_safe_dimensions,axis=0)
        new_means = np.append(new_means, [subspace_means_unsafe_dimensions], axis=0)
        cluster += 1
    new_means = np.delete(new_means,0, axis=0)
    return new_means


In [176]:
# calculates the new covariances for the new gmm (new_covariances is a list of all subspace_covariances_unsafe_dimensions)
# row: one row in a numpy.array
# gmm_covariances: all means of the gmm
# returns a list of numpy.arrays with the length of the safe dimensions and their covariances from the gmm
def calculate_new_covariances(row, gmm_covariances):
    cluster = 0
    new_covariances = np.zeros((1,np.count_nonzero(row == MISSING_VALUE),np.count_nonzero(row == MISSING_VALUE)))
    while cluster < CLUSTER_NUMBER:
        index_safe_dimensions = [index for index in range(len(row)) if MISSING_VALUE != row[index]]
        # new covariances
        subspace_covariances_unsafe_dimensions = np.delete(np.diag(gmm_covariances[cluster]), index_safe_dimensions, axis=1)
        subspace_covariances_unsafe_dimensions = np.delete(subspace_covariances_unsafe_dimensions, index_safe_dimensions, axis=0)
        new_covariances = np.append(new_covariances, [subspace_covariances_unsafe_dimensions], axis=0)
        cluster += 1
    new_covariances = np.delete(new_covariances,0, axis=0)
    return new_covariances


In [177]:
# get the values of the safe dimensions
# row: one row in a numpy.array
# returns numpy.array with all safe dimensions
def get_safe_dimensions(row):
    cluster = 0
    while cluster < CLUSTER_NUMBER:
        safe_dimensions = [value for value in row if MISSING_VALUE != value]
        safe_dimensions = np.array(safe_dimensions)
        cluster += 1
    return safe_dimensions


In [178]:
# calculate c of a row with gamma
# row: one row in a numpy.array
# gmm_means: means of the gmm
# covariances: covariances of the gmm
# returns a list of all c's (for example if the gmm contains five clusters the array contains five values) 
def calculate_c_with_gamma(row, gmm_means, gmm_covariances):
    cluster = 0
    c_list_gamma = np.zeros([CLUSTER_NUMBER])
    while cluster < CLUSTER_NUMBER:
        safe_dimensions = [value for value in row if MISSING_VALUE != value]
        safe_dimensions = np.array(safe_dimensions)

        index_unsafe_dimensions = [index for index in range(len(row)) if MISSING_VALUE == row[index]]
        subspace_means_safe_dimensions = np.delete(gmm_means[cluster],index_unsafe_dimensions, axis=0)

        index_unsafe_dimensions = [index for index in range(len(row)) if MISSING_VALUE == row[index]]
        subspace_covariances_safe_dimensions = np.delete(np.diag(gmm_covariances[cluster]), index_unsafe_dimensions, axis=1)
        subspace_covariances_safe_dimensions = np.delete(subspace_covariances_safe_dimensions, index_unsafe_dimensions, axis=0)
        gamma_matrix = np.zeros((len(safe_dimensions),len(safe_dimensions)))
        i = 0
        while i < len(safe_dimensions):
            gamma_matrix[i][i] = GAMMA
            i+= 1
        subspace_covariances_safe_dimensions_with_gamma = gamma_matrix + subspace_covariances_safe_dimensions
        c_i_gamma = multivariate_normal.pdf(safe_dimensions, mean=subspace_means_safe_dimensions, cov=subspace_covariances_safe_dimensions_with_gamma)
        c_list_gamma = np.append(c_list_gamma,c_i_gamma)
        cluster += 1
    c_list_gamma = c_list_gamma[CLUSTER_NUMBER:]
    return c_list_gamma


In [179]:
# calculate c of a row
# row: one row in a numpy.array
# gmm_means: means of the gmm
# covariances: covariances of the gmm
# returns a list of all c's (for example if the gmm contains five clusters the array contains five values) 
def calculate_c(row, gmm_means, gmm_covariances):
    cluster = 0
    c_list = np.zeros([CLUSTER_NUMBER])
    while cluster < CLUSTER_NUMBER:
        safe_dimensions = [value for value in row if MISSING_VALUE != value]
        safe_dimensions = np.array(safe_dimensions)

        index_unsafe_dimensions = [index for index in range(len(row)) if MISSING_VALUE == row[index]]
        subspace_means_safe_dimensions = np.delete(gmm_means[cluster],index_unsafe_dimensions, axis=0)

        index_unsafe_dimensions = [index for index in range(len(row)) if MISSING_VALUE == row[index]]
        subspace_covariances_safe_dimensions = np.delete(np.diag(gmm_covariances[cluster]), index_unsafe_dimensions, axis=1)
        subspace_covariances_safe_dimensions = np.delete(subspace_covariances_safe_dimensions, index_unsafe_dimensions, axis=0)
    
        c_i = multivariate_normal.pdf(safe_dimensions, mean=subspace_means_safe_dimensions, cov=subspace_covariances_safe_dimensions)
        c_list = np.append(c_list,c_i)
        cluster += 1
    c_list = c_list[CLUSTER_NUMBER:]
    
    return c_list

In [180]:
# calculate the new weights for the gmm (r in the paper)
# c_list_gamma: is a list with all c's (c in the paper)
# gmm_weights: weights of the gmm
# returns new weights for the gmm as numpy.array
def calculate_new_weights_gamma(c_list_gamma, gmm_weights):
    multiplicated_list = [c_list_gamma*gmm_weights for c_list_gamma,gmm_weights in zip(c_list_gamma,gmm_weights)]
    sum_multiplicated_list = sum(multiplicated_list)

    # normalize known dimensions
    cluster = 0
    new_weights_gamma = np.zeros([CLUSTER_NUMBER])

    while cluster < CLUSTER_NUMBER:
        r_i = ((c_list_gamma[cluster]*gmm_weights[cluster]) / sum_multiplicated_list)
        new_weights_gamma = np.append(new_weights_gamma,r_i)
        cluster += 1
    # new weights
    new_weights_gamma = new_weights_gamma[CLUSTER_NUMBER:]
    return new_weights_gamma


In [181]:
# calculate the new weights for the gmm (r in the paper)
# c_list: is a list with all c's (c in the paper)
# gmm_weights: weights of the gmm
# returns new weights for the gmm as numpy.array
def calculate_new_weights(c_list, gmm_weights):
    multiplicated_list = [c_list*gmm_weights for c_list,gmm_weights in zip(c_list,gmm_weights)]
    sum_multiplicated_list = sum(multiplicated_list)

    # normalize known dimensions
    cluster = 0
    new_weights = np.zeros([CLUSTER_NUMBER])
    while cluster < CLUSTER_NUMBER:
        r_i = ((c_list[cluster]*gmm_weights[cluster]) / sum_multiplicated_list)
        new_weights = np.append(new_weights,r_i)
        cluster += 1
    # new weights
    new_weights = new_weights[CLUSTER_NUMBER:]
    return new_weights

In [182]:
# main function
# points: numpy.array with missing_values
# returns a list of tupels which has the length of all incomplete data rows (when points has 25 rows with missing values the list of the tupels has the length of 25)
# one tupel contains the new weights, new covariances and new means for the missing dimensions
def gmm_distribution_missing_values(points):
    means_covariances_weights_list = []
    means_covariances_weights_list_gamma = []
    new_weights_list = []
    new_weights_gamma_list = []
    # delete random values 
    
    # delete a certain number of values
    #points = delete_values_in_data(points, 100)

    # delete values in one dimension
    #points = delete_specific_values_in_one_dimension(points, 3, 20)

    completes = split_data_in_completes(points)

    gmm = GaussianMixture(covariance_type="diag", n_components=CLUSTER_NUMBER, random_state = 3).fit(
        completes[:1399])
    gmm_means = np.array(gmm.means_)
    gmm_covariances = np.array(gmm.covariances_)
    gmm_weights = np.array(gmm.weights_)
    
    # initial gmm predicts the clusters with complete values
    pred_cluster = gmm.predict(points[1400:])
    # delete values in 600 rows
    points_incompletes = delete_up_to_five_dimensions_in_data(points[1400:])
    print("Datensatz, der gelöschte Werte beinhaltet: ",points_incompletes)

    incompletes = split_data_in_incompletes(points)

    for row in points_incompletes:
        subspace_means_safe_dimensions = calculate_subspace_means_safe_dimensions(row, gmm_means)
        subspace_covariances_safe_dimensions = calculate_subspace_covariances_safe_dimensions(row, gmm_covariances)
        subspace_means_unsafe_dimensions = calculate_subspace_means_unsafe_dimensions(row, gmm_means)
        subspace_covariances_unsafe_dimensions = calculate_subspace_covariances_unsafe_dimensions(row, gmm_covariances)
        new_means = calculate_new_means(row, gmm_means)
        new_covariances = calculate_new_covariances(row, gmm_covariances)
        safe_dimensions = get_safe_dimensions(row)
        c_list_gamma = calculate_c_with_gamma(row, gmm_means, gmm_covariances)
        c_list = calculate_c(row, gmm_means, gmm_covariances)
        new_weights_gamma = calculate_new_weights_gamma(c_list_gamma,gmm_weights)
        new_weights = calculate_new_weights(c_list,gmm_weights)
        new_weights_list.append(new_weights)
        new_weights_gamma_list.append(new_weights_gamma)

    return new_weights_list, pred_cluster, new_weights_gamma_list


In [183]:
# implemented method predicts the clusters of the missing rows
def pred_cluster_new_gmm(new_weights, pred_cluster):
    new_list = []
    pred_cluster_list = []
    i = 0
    while i < 600:
        max_index = new_weights[i].argmax(axis=0)
        pred_cluster_list.append(max_index)
        i += 1

    new_list = [pred_cluster_list_i - pred_cluster_i for pred_cluster_list_i, pred_cluster_i in zip(pred_cluster_list, pred_cluster)]
    percent_of_precited = round(new_list.count(0) / len(new_list) * 100,2)
    print("Clusterzuordnung ohne Gamma")
    print("Von ",len(new_list), " Reihen, die unvollständig sind, wurden ",new_list.count(0), " in das richtige Cluster zugeordnet. Das entspricht ",percent_of_precited, "%.")

In [184]:
# implemented method predicts the clusters of the missing rows with gamma
def pred_cluster_new_gmm_gamma(new_weights_gamma, pred_cluster):
    new_list = []
    pred_cluster_list = []
    
    i = 0
    while i < 600:
        max_index = new_weights_gamma[i].argmax(axis=0)
        pred_cluster_list.append(max_index)
        i += 1

    new_list = [pred_cluster_list_i - pred_cluster_i for pred_cluster_list_i, pred_cluster_i in zip(pred_cluster_list, pred_cluster)]
    percent_of_precited = round(new_list.count(0) / len(new_list) * 100,2)
    print("Clusterzuordnung mit Gamma")
    print("Von ",len(new_list), " Reihen, die unvollständig sind, wurden ",new_list.count(0), " in das richtige Cluster zugeordnet. Das entspricht ",percent_of_precited, "%.")

In [185]:
new_weights, pred_cluster, new_weights_gamma = gmm_distribution_missing_values(points)
pred_cluster_new_gmm(new_weights, pred_cluster)
pred_cluster_new_gmm_gamma(new_weights_gamma, pred_cluster)

Datensatz, der gelöschte Werte beinhaltet:  [[1000000 1000000     -65 ...     -77     -89     -90]
 [    -42 1000000 1000000 ... 1000000     -77     -73]
 [1000000 1000000 1000000 ...     -65     -79 1000000]
 ...
 [1000000 1000000     -65 ...     -68     -80     -83]
 [    -69     -58 1000000 ...     -73     -81 1000000]
 [1000000 1000000 1000000 ...     -43     -89 1000000]]
Clusterzuordnung ohne Gamma
Von  600  Reihen, die unvollständig sind, wurden  519  in das richtige Cluster zugeordnet. Das entspricht  86.5 %.
Clusterzuordnung mit Gamma
Von  600  Reihen, die unvollständig sind, wurden  515  in das richtige Cluster zugeordnet. Das entspricht  85.83 %.
