In [1]:
#Fill in Missing Values

In [5]:
import math
import os

import numpy as np
import pandas as pd

In [3]:
# Load the dataset through an explicit relative path instead of os.chdir(): changing the
# working directory made this cell fail when re-run (a second chdir looks for
# Datasets/Datasets) and silently altered how every later relative path resolves.
# The first CSV column is used as the row index.
data = pd.read_csv(os.path.join("Datasets", "GTPvar.csv"), index_col=0)

In [4]:
data.isna().sum(axis="columns")  # number of missing fields in each row

1      0
2      0
3      0
4      0
5      3
6      1
7      1
8      2
9      1
10     2
11     0
12     0
13     1
14     3
15     0
16     0
17     0
18     3
19     0
20     0
21     0
22     0
23     0
24     0
25     0
26     1
27     0
28     0
29     0
30     2
      ..
71     2
72     1
73     0
74     1
75     0
76     0
77     0
78     0
79     0
80     4
81     0
82     0
83     0
84     0
85     0
86     0
87     3
88     0
89     0
90     0
91     0
92     0
93     0
94     4
95     0
96     0
97     5
98     0
99     0
100    0
Length: 100, dtype: int64

In [5]:
# tag every row with its number of missing fields, then tabulate how many rows share each count
data['NAPresent'] = data.isna().sum(axis="columns")
data['NAPresent'].value_counts()

0    58
1    18
3    11
2    10
4     2
5     1
Name: NAPresent, dtype: int64

In [6]:
# keep only the complete records (no missing fields) and drop the helper count column
df = data.loc[data['NAPresent'] == 0].drop(columns='NAPresent')

In [7]:
# matrix view of the complete records, one row per record, for the linear algebra below
df_mat = np.asarray(df)

In [8]:
# Numerical rank of the complete-record matrix, using matrix_rank's default tolerance
np.linalg.matrix_rank(df_mat) #finding rank of matrix

5

In [9]:
#validating the rank (to check if it is correct)
# np.linalg.svd returns (left singular vectors, singular values, transposed right singular vectors).
# The names below are bound in that order, so `v` holds the LEFT singular vectors of df_mat.T,
# `s` the singular values and `u` the transposed right singular vectors (swapped relative to the
# conventional u, s, vh naming).
v, s, u = np.linalg.svd(df_mat.T) #svd is the method to find the rank above

In [12]:
print(s.size)  # how many singular values were returned

5


In [13]:
print(s)
# s : singular values of df_mat.T (not eigenvalues). Two are significant and three are ~1e-10, i.e.
# numerically zero, so the effective rank should be 2 rather than the 5 reported by matrix_rank.

[1.96033304e+01 2.28167354e+00 2.42447585e-10 2.13787501e-10
 1.94320280e-10]


In [14]:
# singular values below this threshold are treated as numerically zero
tol = 1e-8

# effective rank = number of singular values that are at least `tol`
rank = (np.abs(s) >= tol).sum()

In [15]:
# Null-space relation: the columns of v from index `rank` onwards pair with the (almost) zero
# singular values, so they span the null space. Stored one vector per row, each row of A is one
# linear equation that every complete record satisfies.
A = v[:, rank:].T
print(A)

[[-0.07730698  0.34050231 -0.67413398  0.6508402   0.00567558]
 [ 0.48476979  0.37806431  0.51469795  0.39680398 -0.4468761 ]
 [ 0.55057871 -0.75530287 -0.13744801  0.31747114  0.0818582 ]]


In [16]:
#now, we have 3 equations (one per row of A)
#thus, we can impute values for rows which have at most 3 missing values
#rows with 4 or 5 missing values need to be discarded (here), or computed using a pseudo-inverse algorithm

#Task:
#find the no. of NAs per row
#consider only as many equations (rows of A) as the no. of NAs in that row
#check which fields have NAs. The coefficients of A at those missing locations form our local 'a' matrix
#the remaining (known) fields, multiplied by their corresponding coefficients of A, constitute the constant term b
#that is, the a x = b format

In [17]:
data.shape[0]  # number of records

100

In [18]:
A.shape[0]  # number of null-space equations available

3

In [20]:
# Impute every row that has between 1 and len(A) missing features from the null-space
# relation A @ row = 0.
#
# Missing entries are detected from the NaNs currently present in each row, not from the stored
# 'NAPresent' count (previously read through the magic column index 5). That count is never
# refreshed, so re-running this cell after a successful imputation used to hand np.linalg.solve
# an empty, non-square system and raise LinAlgError; now already-filled rows are simply skipped.
#
# Assumes the columns of A follow the order of the feature columns of `data` (every column
# except 'NAPresent'), which holds because A was derived from `data` with that column dropped.
feature_cols = [col for col in data.columns if col != 'NAPresent']
feature_pos = data.columns.get_indexer(feature_cols)  #position of each feature column inside `data`
missing_mask = data[feature_cols].isna().to_numpy()   #snapshot taken before any row is filled

for i in range(len(data)):
    row_missing = missing_mask[i]
    eqnsneeded = int(row_missing.sum())
    if eqnsneeded == 0 or eqnsneeded > len(A):
        continue #nothing to impute, or more unknowns than null-space equations

    aID = np.flatnonzero(row_missing)   #feature indices of the unknowns ('a' side)
    bID = np.flatnonzero(~row_missing)  #feature indices of the known values ('b' side)

    #use as many equations (rows of A) as there are unknowns, so that `a` is square
    a = A[:eqnsneeded, aID]

    #move the known terms to the right-hand side: a x = -(A_known . x_known)
    x1 = data.iloc[i, feature_pos[bID]].to_numpy(dtype=float)
    b = np.dot(-A[:eqnsneeded, bID], x1)

    #np.linalg.solve raises LinAlgError if `a` happens to be singular
    x = np.linalg.solve(a, b)

    #write the solved values back into the missing positions of this row
    data.iloc[i, feature_pos[aID]] = x

In [21]:
# Dump the frame after imputation. 'NAPresent' still holds the missing-value count recorded
# before the imputation loop (the loop never updates it), so filled-in rows keep a non-zero count.
print(data)

           F1        F2        F3        F4        F5  NAPresent
1    0.266066  0.558480  0.894119  0.644930  2.363595          0
2    0.552035  0.666928  0.616492  0.336270  2.171725          0
3    0.476539  0.634680  0.680674  0.410387  2.202279          0
4    0.670965  0.804444  0.733776  0.396153  2.605338          0
5    0.480306  0.731702  0.917772  0.601049  2.730830          3
6    0.726267  0.781446  0.569351  0.246897  2.323960          1
7    0.633964  0.715634  0.581369  0.283764  2.214731          1
8    0.550642  0.698550  0.698817  0.403266  2.351275          2
9    0.478420  0.589715  0.563806  0.315308  1.947248          1
10   0.477733  0.635127  0.679502  0.409087  2.201449          2
11   0.499508  0.582892  0.506011  0.262359  1.850770          0
12   0.717392  0.814883  0.670655  0.331444  2.534374          0
13   0.337443  0.636942  0.954262  0.672582  2.601228          1
14   0.746236  0.793220  0.560543  0.233899  2.333898          3
15   0.276671  0.558349  

In [24]:
# Per-column non-null counts after imputation; rows skipped by the imputation loop
# (more missing values than rows in A) still contain NaNs.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 1 to 100
Data columns (total 6 columns):
F1           97 non-null float64
F2           98 non-null float64
F3           98 non-null float64
F4           97 non-null float64
F5           97 non-null float64
NAPresent    100 non-null int64
dtypes: float64(5), int64(1)
memory usage: 10.5 KB


In [1]:
#test workout

In [16]:
# scratch check: rank of a 4x4 integer matrix and the SVD of its transpose
A = np.array(
    [[4, 5, 16, 7],
     [2, -3, 2, 3],
     [3, 4, 5, 6],
     [4, 7, 8, 9]],
    dtype=np.int64,
)
rank = np.linalg.matrix_rank(A)
print(rank)

# u: left singular vectors of A.T, s: singular values, v: transposed right singular vectors
u, s, v = np.linalg.svd(A.T)
print(u)

4
[[-0.26155586 -0.10136075  0.36374373 -0.88825955]
 [-0.35413309 -0.59986971 -0.70783255 -0.11712885]
 [-0.73737018  0.64517808 -0.18881252  0.06618379]
 [-0.51231193 -0.46219862  0.57533765  0.43919871]]


In [18]:
# integer test matrices: A is 4x4, B is 4x2, C is 2x4
A = np.array(
    [[1, 0, -1, 2],
     [0, 3, 1, -1],
     [2, 4, 0, 3],
     [-3, 1, -1, 2]],
    dtype=np.int64,
)
B = np.array([[1, 2], [3, -1], [0, -1], [4, 2]], dtype=np.int64)
C = np.array([[3, 8, 0, 5], [1, 0, -4, 8]], dtype=np.int64)

In [20]:
# chained product (C . A) . B, evaluated left to right
D = C @ A @ B
print(D)

[[107 -37]
 [-31 -33]]


In [24]:
# determinant of a 3x3 integer matrix
Z = np.array(
    [[-2, 32, 24],
     [92, 66, 25],
     [-80, 37, 10]],
    dtype=np.int64,
)
print(np.linalg.det(Z))

115506.00000000006


In [38]:
# Conjugate transpose of P divided by det(P), followed by det(P) itself.
# Uses the ndarray API (P.conj().T) instead of calling np.matrix.getH as an unbound method on a
# plain array; the printed values are the same. The determinant is computed once and reused.
# NOTE: the first and third columns of P are identical, so P is singular. det(P) is only
# floating-point round-off around zero and the quotient printed below is numerically meaningless.
P = np.array([[2, 1, 2], [1, 0, 1], [3, 1, 3]], dtype=np.int64)
det_P = np.linalg.det(P)
print(P.conj().T / det_P)
print(det_P)

[[1.20095990e+16 6.00479950e+15 1.80143985e+16]
 [6.00479950e+15 0.00000000e+00 6.00479950e+15]
 [1.20095990e+16 6.00479950e+15 1.80143985e+16]]
1.665334536937729e-16


In [50]:
# B is the transpose of A and C is the inverse of A. Prints det(B), det(C) and the reciprocal
# of det(C), then the eigenvalues (w) and eigenvectors (columns of v) of A.
# `math` is imported in the notebook's import cell rather than in the middle of the notebook.
A = np.array([[1, 6, 1], [1, 2, 3], [0, 0, 3]], dtype=np.int64)
B = A.T
C = np.linalg.inv(A)
print("B det: ",np.linalg.det(B))
print("C det: ",np.linalg.det(C))
print("Inv of C det: ",math.pow(np.linalg.det(C), -1))
w,v = np.linalg.eig(A)
print(w,v)

B det:  -12.0
C det:  -0.08333333333333333
Inv of C det:  -12.0
[-1.  4.  3.] [[-0.9486833  -0.89442719 -0.92055301]
 [ 0.31622777 -0.4472136  -0.33915111]
 [ 0.          0.          0.19380063]]


In [60]:
# inner product of two integer vectors; it prints 0, i.e. v and w are orthogonal
v = np.array([5, -2, 3], dtype=np.int64)
w = np.array([-2, 4, 6], dtype=np.int64)
print(np.dot(v, w))

0
