In [1]:
import pandas as pd
import numpy as np
np.random.seed(132)
from functools import lru_cache

from sklearn.datasets import load_iris, load_boston, load_breast_cancer, load_wine
from scipy.optimize import minimize
from tqdm import tqdm
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
dataset = load_wine()
df = pd.DataFrame(dataset['data'])
target = dataset['target']
df = (df - df.mean())/(df.max() - df.min())
df0 = df.copy()
print(df.shape)
df.head()

(178, 13)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.323522,-0.123784,0.033948,-0.20077,0.296287,0.174099,0.217454,-0.154441,0.220537,0.049651,0.067114,0.479236,0.226895
1,0.052469,-0.10995,-0.121132,-0.427574,0.002809,0.122375,0.154163,-0.192177,-0.098075,-0.057857,0.075244,0.28876,0.216196
2,0.041943,0.004674,0.16229,-0.046131,0.013679,0.174099,0.255428,-0.116706,0.384574,0.053064,0.058984,0.204511,0.312487
3,0.360364,-0.076353,0.071381,-0.138915,0.144113,0.536168,0.308171,-0.229913,0.185836,0.233951,-0.079227,0.307075,0.522901
4,0.062995,0.050129,0.269242,0.07758,0.198461,0.174099,0.139395,0.053106,0.072272,-0.062977,0.067114,0.116599,-0.008483


In [4]:
num_nan_cols = 3
nan_fraction = 0.10
nan_cols = np.random.random_integers(0, df.shape[1] - 1, num_nan_cols)
# print(df.isnull().mean())
for col in set(nan_cols):
    df.loc[df.sample(int(nan_fraction * len(df))).index, col] = np.nan
# print(df.isnull().mean())
# print(df.isnull().mean())

  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
nan_coords = np.array(np.where(df.isnull().values)).T
print('Num nan places: {}'.format(nan_coords.shape[0]))

Num nan places: 85


Считаем по каждой паре i_1 i_2 из coords градиент

In [6]:
from numpy.linalg import norm

def ex(arr, j, i):
    return np.exp(-norm(arr[i] - arr[j])**2)

def p(arr, j, i):
    a = ex(arr, j, i)
    b = sum(ex(arr, k, i) for k in range(len(arr)) if k!=i)
    return a / b

def d(arr, i, j, i2):
    return np.abs(arr[i, i2] - arr[j, i2])

In [7]:
df1 = df.loc[:, df.isnull().sum() == 0]
df2 = df.fillna(df.mean())
print(df1.shape, df2.shape)

(178, 8) (178, 13)


In [8]:
arr0 = df0.values
arr1 = df1.values
arr2 = df2.values
print(arr0.shape, arr1.shape, arr2.shape)
print(np.isnan(arr0).sum(), np.isnan(arr1).sum(), np.isnan(arr2).sum())

(178, 13) (178, 8) (178, 13)
0 0 0


In [9]:
def norm1(i, j):
    return norm(arr1[i] - arr1[j])**2

In [10]:
def cost(arr1, arr2):
    @lru_cache(maxsize=None)
    def norm1(i, j):
        return norm(arr1[i] - arr1[j])**2
    @lru_cache(maxsize=None)
    def ex1(i, j):
        return np.exp(-norm1(i, j))
    @lru_cache(maxsize=None)
    def p1(j, i):
        a = ex1(j, i)
        b = sum(ex1(k, i) for k in range(len(arr1)) if k!=i)
        return a / b
    @lru_cache(maxsize=None)
    def norm2(i, j):
        return norm(arr2[i] - arr2[j])**2
    @lru_cache(maxsize=None)
    def ex2(i, j):
        return np.exp(-norm2(i, j))
    @lru_cache(maxsize=None)
    def p2(j, i):
        a = ex2(j, i)
        b = sum(ex2(k, i) for k in range(len(arr2)) if k!=i)
        return a / b
    s = 0
    for i in range(len(arr1)):
        for j in range(len(arr1)):
            s += p1(j, i) * np.log(p1(j, i) / p2(j, i))
    return s

In [11]:
c = cost(arr1, arr2)
c

3.1746806447724749

In [13]:
def get_grad(arr1, arr2, i1, i2):
    '''
    arr1 - массив без пропусков(укороченный)
    arr2 - массив с прочерками(удлиенный)
    i1, i2 -  координаты nan
    '''
    @lru_cache(maxsize=None)
    def norm1(i, j):
        return norm(arr1[i] - arr1[j])

    @lru_cache(maxsize=None)
    def ex1(i, j):
        return np.exp(-norm1(i, j))

    @lru_cache(maxsize=None)
    def p1(j, i):
        a = ex1(j, i)
        b = sum(ex1(k, i) for k in range(len(arr1)) if k!=i)
        return a / b

    @lru_cache(maxsize=None)
    def norm2(i, j):
        return norm(arr2[i] - arr2[j])
    @lru_cache(maxsize=None)
    def ex2(i, j):
        return np.exp(-norm2(i, j))
    @lru_cache(maxsize=None)
    def p2(j, i):
        a = ex2(j, i)
        b = sum(ex2(k, i) for k in range(len(arr2)) if k!=i)
        return a / b
    
    @lru_cache(maxsize=None)
    def d(i, j):
        '''
        "Дистанция после дифференцирования" - то же самое, только arr == arr2 и i2 == i2
        '''
        a = np.abs(arr2[i, i2] - arr2[j, i2])
        return a
    def get_i_part(i):
        '''
        считаем i часть суммы
        '''
        s = 0
        s += p1(i1, i) + p1(i, i1)
        s -= p2(i1, i)*(1 + p1(i, i))
        s -= p2(i, i1)*(1 + p1(i1, i1))
        return s * d(i, i1)
    return sum(get_i_part(i) for i in range(len(arr1)) if i!=i1)

def get_full_grad(arr1, arr2, nan_coords):
    '''
    arr1 - массив без пропусков(укороченный)
    arr2 - массив с прочерками(удлиенный)
    i1, i2 -  координаты nan
    '''
    grads = []
    for i1, i2 in nan_coords:
        grads.append(get_grad(arr1, arr2, i1, i2))
    return np.array(grads)

In [14]:
grad = get_full_grad(arr1, arr2, nan_coords)

In [15]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

def get_mae(arr1, arr2, nan_coords):
    vec1 = []
    vec2 = []
    for j, (x,y) in enumerate(nan_coords):
        vec1.append(arr1[x, y])
        vec2.append(arr2[x, y])
    return mean_absolute_error(vec1, vec2)

def get_msqe(arr1, arr2, nan_coords):
    vec1 = []
    vec2 = []
    for j, (x,y) in enumerate(nan_coords):
        vec1.append(arr1[x, y])
        vec2.append(arr2[x, y])
    return mean_squared_error(vec1, vec2)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
def get_acc(arr2, target):
#     df_acc = pd.DataFrame(arr2)
#     df_acc['target'] = target
    forest = RandomForestClassifier()
    return cross_val_score(forest, arr2, target, scoring='accuracy', cv=7).mean()

In [14]:
# test
# def minimize_f(alpha):
#     arr_tmp = arr2.copy()
#     for j, (x,y) in enumerate(nan_coords):
#         arr_tmp[x, y] = arr2[x, y] - alpha*grad[j]
#     c = cost(arr1, arr_tmp)
#     print(alpha, c)
#     return c

# res = minimize(minimize_f, 0.1, method='Nelder-Mead', tol=1e-4, options={'maxiter': 10})
# alpha = res.final_simplex[0][0][0]
# for j, (x,y) in enumerate(nan_coords):
#     arr2[x, y] = arr2[x, y] - alpha*grad[j]

# Iterative process

In [41]:
c = cost(arr1, arr2)
c

In [42]:
mae = get_mae(arr0, arr2, nan_coords)
mae

0.1555597687585078

In [43]:
cs = []
maes = []
msqes = []
accs = []

mae = get_mae(arr0, arr2, nan_coords)
msqe = get_msqe(arr0, arr2, nan_coords)
acc = get_acc(arr2, target)

cs.append(c)
maes.append(mae)
msqes.append(msqe)
accs.append(acc)

In [44]:
max_iter = 30
print('Iteration {} // best c {:.6} // c {:.6} // mae {:.6} // msqe {:.6} // acc {:.6}'.format(0, c, c, mae, msqe, acc))
for i in range(max_iter):
    grad = get_full_grad(arr1, arr2, nan_coords)
    def minimize_f(alpha):
        arr_tmp = arr2.copy()
        for j, (x,y) in enumerate(nan_coords):
            arr_tmp[x, y] = arr2[x, y] - alpha*grad[j]
        c = cost(arr1, arr_tmp)
        print(alpha, c)
        return c

    res = minimize(minimize_f, 0.3, method='TNC', tol=1e-4, options={'maxiter': 10}, bounds=[(0, 4)])
    alpha = res.x[0]
    for j, (x,y) in enumerate(nan_coords):
        arr2[x, y] = arr2[x, y] - alpha*grad[j]
    c = cost(arr1, arr2)
    mae = get_mae(arr0, arr2, nan_coords)
    msqe = get_msqe(arr0, arr2, nan_coords)
    acc = get_acc(arr2, target)
    accs.append(acc)
    cs.append(c)
    maes.append(mae)
    msqes.append(msqe)
    print('Iteration {} // best c {:.6} // c {:.6} // mae {:.6} // msqe {:.6} // acc {:.6}'.format(i+1, min(cs), 
                                                                                                   c, mae, msqe, acc))

Iteration 0 // best c 3.17272 // c 3.17272 // mae 0.15556 // msqe 0.0357938 // acc 0.939748
[ 0.3] 3.17269203741
[ 0.3] 3.17269203741
[ 0.30000001] 3.17269203742
[ 0.29999992] 3.1726920374
[ 0.29999992] 3.1726920374
[ 0.29999993] 3.17269203741
[ 0.] 3.17271939717
[ 0.] 3.17271939717
[  1.00000000e-08] 3.17271939716
[ 0.15] 3.17269003041
[ 0.15] 3.17269003041
[ 0.15000001] 3.17269003041
[ 0.225] 3.17268711716
[ 0.225] 3.17268711716
[ 0.22500001] 3.17268711716
[ 0.225] 3.17268711716
[ 0.225] 3.17268711716
[ 0.22500001] 3.17268711716
Iteration 1 // best c 3.17269 // c 3.17269 // mae 0.15525 // msqe 0.035697 // acc 0.934644
[ 0.3] 3.17276045472
[ 0.3] 3.17276045472
[ 0.30000001] 3.17276045473
[ 0.29999992] 3.17276045468
[ 0.29999992] 3.17276045468
[ 0.29999993] 3.17276045469
[ 0.] 3.17268711716
[ 0.] 3.17268711716
[  1.00000000e-08] 3.17268711716
[ 0.] 3.17268711716
[ 0.] 3.17268711716
[  1.00000000e-08] 3.17268711716
Iteration 2 // best c 3.17269 // c 3.17269 // mae 0.15525 // msqe 0.0356

KeyboardInterrupt: 

In [None]:
plot(фссы)