In [17]:
import time
import numpy as np
import pandas as pd
%matplotlib notebook
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from scipy.special import entr

def calcprecall(cf):
    classes = cf.shape[0]
    TP = np.diag(cf)
    FP = np.sum(cf, axis=0) - TP
    FN = np.sum(cf, axis=1) - TP

    accuracy = np.trace(cf)/(np.sum(np.sum(cf)))*100
    precision = (np.sum(TP / (TP + FP))/classes)*100
    recall = (np.sum(TP / (TP + FN))/classes)*100
    F1score = (2*precision*recall)/(precision+recall)
    
    return accuracy, precision, recall, F1score

In [24]:
alpha = 1/1.5
tdata = np.loadtxt("punjab_data.txt")

X = tdata[:,0:15]
Y = tdata[:,15]
m = X.shape[0]
n = X.shape[1]
skf = StratifiedKFold(n_splits=10)
classes = 9
CfGNB =np.zeros((classes,classes))
probs = np.array([]).reshape(0,classes)
entropy = np.array([]).reshape(0,1)
Xkf = np.array([]).reshape(0,n)
Ykf = np.array([]).reshape(0,1)
YNewkf = np.array([]).reshape(0,1)
pgnb = np.array([]).reshape(0,1)

names = ['B1','B2', 'B3', 'B4', 'B5', 'B6','B7','B8','B8A','B9','B10','B11', 'B12','VH','VV'];

for train_index, test_index in skf.split(X,Y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    m_test = X_test.shape[0]
    y_test_new  = np.copy(y_test)
    
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    predictGNB = gnb.predict(X_test)
    p = gnb.predict_proba(X_test)
    D = np.zeros((m_test,1))
    for i in range(p.shape[0]):
        for j in range(p.shape[1]):
            if(p[i,j] >= 10**-300):
                D[i] = D[i]-p[i,j]*np.log(p[i,j])
                
    cf = confusion_matrix(y_test, predictGNB)
    CfGNB = CfGNB + cf
    probs = np.concatenate([probs,p])
    entropy = np.concatenate([entropy,D])
    
    Xkf = np.concatenate([Xkf,X_test])
    Ykf = np.concatenate([Ykf,y_test.reshape(m_test,1)])
    YNewkf = np.concatenate([YNewkf,y_test_new.reshape(m_test,1)])
    pgnb = np.concatenate([pgnb,predictGNB.reshape(m_test,1)])
    
print(CfGNB)
[acc3, pre3, rec3, f3] = calcprecall(CfGNB)
print("GNB:\n accuracy = {}, precision = {}, recall = {}, f1-score= {}".format(acc3,pre3,rec3,f3))
indexArraySorted = np.argsort(np.squeeze(entropy))
index = int(alpha*(m))
T = entropy[indexArraySorted[index]]
print(T)
for k in range(m):
    if((entropy[k]<T) & (pgnb[k] != Ykf[k])):
        #relabel
        YNewkf[k] = pgnb[k] 

print("number of labels updated: {}".format(np.sum(YNewkf!=Ykf)))

filteredIndices = np.where(entropy<T)[0]
filteredX = np.take(Xkf, filteredIndices, axis = 0)
filteredY = np.take(YNewkf,filteredIndices).reshape(filteredX.shape[0],1)
filteredData = np.concatenate([filteredX,filteredY],axis = 1)
print(filteredData.shape)
print("land: {}".format(np.sum(filteredY == 0)))
print("fallow: {}".format(np.sum(filteredY == 1)))
print("water: {}".format(np.sum(filteredY == 2)))
print("fodder: {}".format(np.sum(filteredY == 3)))
print("wheat: {}".format(np.sum(filteredY == 4)))
print("gram: {}".format(np.sum(filteredY == 5)))
print("maize: {}".format(np.sum(filteredY == 6)))
print("vegetable: {}".format(np.sum(filteredY == 7)))
print("trees: {}".format(np.sum(filteredY == 8)))

filteredData = filteredData[filteredData[:,15].argsort()]

wholeData = np.concatenate([Xkf,Ykf,YNewkf,pgnb,entropy],axis = 1)
wholeData = wholeData[wholeData[:,15].argsort()]
np.savetxt("data1.csv", wholeData, delimiter=",",fmt='%1.3f',header="B1,B2,B3,B4,B5,B6,B7,B8,B8A,B9,B10,B11,B12,VV,VH,Y,Y_new,predictions, entropy")

[[50.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0. 50.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. 50.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. 27.  5.  3.  6.  1.  8.]
 [ 0.  0.  0.  1. 23.  5. 10.  4.  7.]
 [ 0.  0.  0.  1.  1. 45.  2.  0.  1.]
 [ 0.  0.  0.  0.  4.  0. 35.  6.  5.]
 [ 0.  0.  0.  8. 11.  0. 13. 12.  6.]
 [ 0.  0.  0.  9. 11.  3. 19.  1.  7.]]
GNB:
 accuracy = 66.44444444444444, precision = 65.84840919239896, recall = 66.44444444444444, f1-score= 66.14508412298535
[0.26709421]
number of labels updated: 48
(300, 16)
land: 50
fallow: 50
water: 50
fodder: 29
wheat: 17
gram: 52
maize: 38
vegetable: 14
trees: 0


In [25]:
m = filteredData.shape[0]
subsets = 10
classes = 9
a = m

for i in range(1,classes):
    c = np.sum(filteredY == i)
    if(c < subsets):
        a = a - c
        classes = classes - 1
        print("{} th class rejected due to lower number of samples".format(i))


X = filteredData[0:a,0:15]
Y = filteredData[0:a,15]
skf = StratifiedKFold(n_splits=subsets)
Y = Y.reshape(X.shape[0],)
CfGNB =np.zeros(classes)
names = ['B1','B2', 'B3', 'B4', 'B5', 'B6','B7','B8','B8A','B9','B10','B11', 'B12','VH','VV'];

for train_index, test_index in skf.split(X,Y):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    # training and testing with baysian
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    predictGNB = gnb.predict(X_test)
    cf = confusion_matrix(y_test, predictGNB)
    CfGNB = CfGNB + cf

print(CfGNB)
[acc3, pre3, rec3, f3] = calcprecall(CfGNB)
print("GNB:\n accuracy = {}, precision = {}, recall = {}, f1-score= {}".format(acc3,pre3,rec3,f3))

8 th class rejected due to lower number of samples
[[50.  0.  0.  0.  0.  0.  0.  0.]
 [ 0. 50.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. 50.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. 28.  0.  0.  0.  1.]
 [ 0.  0.  0.  0. 16.  0.  1.  0.]
 [ 0.  0.  0.  0.  0. 51.  0.  1.]
 [ 0.  0.  0.  1.  0.  0. 37.  0.]
 [ 0.  0.  0.  0.  0.  1.  2. 11.]]
GNB:
 accuracy = 97.66666666666667, precision = 96.46800397877983, recall = 95.58576798721722, f1-score= 96.02485962252449


In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
gnb = GaussianNB()
gnb.fit(X_train, y_train)
predictGNB = gnb.predict(X_test)
cf = confusion_matrix(y_test, predictGNB)
CfGNB = CfGNB + cf
print(CfGNB)
[acc3, pre3, rec3, f3] = calcprecall(CfGNB)
print("GNB:\n accuracy = {}, precision = {}, recall = {}, f1-score= {}".format(acc3,pre3,rec3,f3))

[[68.  0.  0.  0.  0.  0.  0.  0.]
 [ 0. 70.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. 65.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. 35.  0.  0.  0.  2.]
 [ 0.  0.  0.  0. 23.  0.  2.  0.]
 [ 0.  0.  0.  0.  0. 68.  0.  1.]
 [ 0.  0.  0.  2.  0.  0. 44.  0.]
 [ 0.  0.  0.  0.  0.  1.  3. 15.]]
GNB:
 accuracy = 97.24310776942356, precision = 95.7843213666195, recall = 94.96810769579649, f1-score= 95.37446827623313


In [52]:
alpha = 1
tdata = np.loadtxt("punjab_data.txt")

X = tdata[:,0:15]
Y = tdata[:,15]
m = X.shape[0]
n = X.shape[1]
skf = StratifiedKFold(n_splits=10)
classes = 9
CfGNB =np.zeros((classes,classes))
probs = np.array([]).reshape(0,classes)
entropy = np.array([]).reshape(0,1)
Xkf = np.array([]).reshape(0,n)
Ykf = np.array([]).reshape(0,1)
YNewkf = np.array([]).reshape(0,1)
pgnb = np.array([]).reshape(0,1)

names = ['B1','B2', 'B3', 'B4', 'B5', 'B6','B7','B8','B8A','B9','B10','B11', 'B12','VH','VV'];

for train_index, test_index in skf.split(X,Y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    m_test = X_test.shape[0]
    y_test_new  = np.copy(y_test)
    
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    predictGNB = gnb.predict(X_test)
    p = gnb.predict_proba(X_test)
    logp = gnb.predict_log_proba(X_test)
    D = np.sum(p*logp,axis=1).reshape(m_test,1)
    
    cf = confusion_matrix(y_test, predictGNB)
    CfGNB = CfGNB + cf
    probs = np.concatenate([probs,p])
    entropy = np.concatenate([entropy,D])
    
    Xkf = np.concatenate([Xkf,X_test])
    Ykf = np.concatenate([Ykf,y_test.reshape(m_test,1)])
    YNewkf = np.concatenate([YNewkf,y_test_new.reshape(m_test,1)])
    pgnb = np.concatenate([pgnb,predictGNB.reshape(m_test,1)])
    
print(CfGNB)
[acc3, pre3, rec3, f3] = calcprecall(CfGNB)
print("GNB:\n accuracy = {}, precision = {}, recall = {}, f1-score= {}".format(acc3,pre3,rec3,f3))
indexArraySorted = np.argsort(np.squeeze(entropy))
print(entropy[indexArraySorted])
index = int(alpha*(m-1))
T = entropy[indexArraySorted[index]]
print(T)
for k in range(m):
    if((entropy[k]<T) & (pgnb[k] != Ykf[k])):
        #relabel
        YNewkf[k] = pgnb[k] 

print("number of labels updated: {}".format(np.sum(YNewkf!=Ykf)))

filteredIndices = np.where(entropy<T)[0]
filteredX = np.take(Xkf, filteredIndices, axis = 0)
filteredY = np.take(YNewkf,filteredIndices).reshape(filteredX.shape[0],1)
filteredData = np.concatenate([filteredX,filteredY],axis = 1)
print(filteredData.shape)
print("land: {}".format(np.sum(filteredY == 0)))
print("fallow: {}".format(np.sum(filteredY == 1)))
print("water: {}".format(np.sum(filteredY == 2)))
print("fodder: {}".format(np.sum(filteredY == 3)))
print("wheat: {}".format(np.sum(filteredY == 4)))
print("gram: {}".format(np.sum(filteredY == 5)))
print("maize: {}".format(np.sum(filteredY == 6)))
print("vegetable: {}".format(np.sum(filteredY == 7)))
print("trees: {}".format(np.sum(filteredY == 8)))

filteredData = filteredData[filteredData[:,15].argsort()]

wholeData = np.concatenate([Xkf,Ykf,YNewkf,pgnb,entropy],axis = 1)
wholeData = wholeData[wholeData[:,15].argsort()]
np.savetxt("data1.csv", wholeData, delimiter=",",fmt='%1.3f',header="B1,B2,B3,B4,B5,B6,B7,B8,B8A,B9,B10,B11,B12,VV,VH,Y,Y_new,predictions, entropy")

[[50.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0. 50.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. 50.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. 27.  5.  3.  6.  1.  8.]
 [ 0.  0.  0.  1. 23.  5. 10.  4.  7.]
 [ 0.  0.  0.  1.  1. 45.  2.  0.  1.]
 [ 0.  0.  0.  0.  4.  0. 35.  6.  5.]
 [ 0.  0.  0.  8. 11.  0. 13. 12.  6.]
 [ 0.  0.  0.  9. 11.  3. 19.  1.  7.]]
GNB:
 accuracy = 66.44444444444444, precision = 65.84840919239896, recall = 66.44444444444444, f1-score= 66.14508412298535
[[-1.32749747e+000]
 [-1.32025521e+000]
 [-1.25400577e+000]
 [-1.24408056e+000]
 [-1.23899826e+000]
 [-1.22700405e+000]
 [-1.22151168e+000]
 [-1.21022686e+000]
 [-1.18899576e+000]
 [-1.18035595e+000]
 [-1.17680258e+000]
 [-1.14386284e+000]
 [-1.11728202e+000]
 [-1.11432517e+000]
 [-1.10049884e+000]
 [-1.09956846e+000]
 [-1.09098413e+000]
 [-1.09064366e+000]
 [-1.09020632e+000]
 [-1.07946006e+000]
 [-1.07856966e+000]
 [-1.07755863e+000]
 [-1.07565666e+000]
 [-1.07336186e+000]
 [-1.05019107e+000]
 [-1.04492181e+000]
 [-1.

In [53]:
m = filteredData.shape[0]
subsets = 10
classes = 9
a = m

for i in range(1,classes):
    c = np.sum(filteredY == i)
    if(c < subsets):
        a = a - c
        classes = classes - 1
        print("{} th class rejected due to lower number of samples".format(i))


X = filteredData[0:a,0:15]
Y = filteredData[0:a,15]
skf = StratifiedKFold(n_splits=subsets)
Y = Y.reshape(X.shape[0],)
CfGNB =np.zeros(classes)
names = ['B1','B2', 'B3', 'B4', 'B5', 'B6','B7','B8','B8A','B9','B10','B11', 'B12','VH','VV'];

for train_index, test_index in skf.split(X,Y):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    # training and testing with baysian
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    predictGNB = gnb.predict(X_test)
    cf = confusion_matrix(y_test, predictGNB)
    CfGNB = CfGNB + cf

print(CfGNB)
[acc3, pre3, rec3, f3] = calcprecall(CfGNB)
print("GNB:\n accuracy = {}, precision = {}, recall = {}, f1-score= {}".format(acc3,pre3,rec3,f3))

[[50.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0. 49.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. 50.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. 36.  0.  0.  3.  4.  3.]
 [ 0.  0.  0.  0. 40.  3.  6.  2.  4.]
 [ 0.  0.  0.  0.  3. 53.  0.  0.  0.]
 [ 0.  0.  0.  3.  0.  0. 75.  0.  7.]
 [ 0.  0.  0.  1.  5.  1.  2. 14.  1.]
 [ 0.  0.  0.  2.  5.  0.  4.  0. 23.]]
GNB:
 accuracy = 86.8596881959911, precision = 85.33645434340569, recall = 84.42740952331745, f1-score= 84.8794980747473
