In [1]:
import pandas as pd
import math

In [2]:
data_training = {
                    'doc1' : ['Chinese', 'Beijing', 'Chinese'], 
                    'doc2' : ['Chinese', 'Chinese', 'Shanghai', 'Tokyo'], 
                    'doc3' : ['Chinese', 'Macao'],
                    'doc4' : ['Tokyo', 'Japan', 'Chinese'],
                }
data_testing = {
                    'doc5' : ['Chinese', 'Chinese', 'Chinese', 'Japan', 'Tokyo'],
               }


In [3]:
data_training

{'doc1': ['Chinese', 'Beijing', 'Chinese'],
 'doc2': ['Chinese', 'Chinese', 'Shanghai', 'Tokyo'],
 'doc3': ['Chinese', 'Macao'],
 'doc4': ['Tokyo', 'Japan', 'Chinese']}

In [4]:
data_testing

{'doc5': ['Chinese', 'Chinese', 'Chinese', 'Japan', 'Tokyo']}

In [5]:
data_full = data_training | data_testing
data_full

{'doc1': ['Chinese', 'Beijing', 'Chinese'],
 'doc2': ['Chinese', 'Chinese', 'Shanghai', 'Tokyo'],
 'doc3': ['Chinese', 'Macao'],
 'doc4': ['Tokyo', 'Japan', 'Chinese'],
 'doc5': ['Chinese', 'Chinese', 'Chinese', 'Japan', 'Tokyo']}

In [6]:
list_of_terms = list(set([t for doc, term in data_full.items() for t in term]))
list_of_terms

['Macao', 'Tokyo', 'Shanghai', 'Beijing', 'Japan', 'Chinese']

In [7]:
tdm = {}
for docs, terms in data_full.items():
    term_counts = {term: 0 for term in list_of_terms}
    for term in terms:
        if term in term_counts: 
            term_counts[term] += 1
    tdm[docs] = term_counts
tdm

{'doc1': {'Macao': 0,
  'Tokyo': 0,
  'Shanghai': 0,
  'Beijing': 1,
  'Japan': 0,
  'Chinese': 2},
 'doc2': {'Macao': 0,
  'Tokyo': 1,
  'Shanghai': 1,
  'Beijing': 0,
  'Japan': 0,
  'Chinese': 2},
 'doc3': {'Macao': 1,
  'Tokyo': 0,
  'Shanghai': 0,
  'Beijing': 0,
  'Japan': 0,
  'Chinese': 1},
 'doc4': {'Macao': 0,
  'Tokyo': 1,
  'Shanghai': 0,
  'Beijing': 0,
  'Japan': 1,
  'Chinese': 1},
 'doc5': {'Macao': 0,
  'Tokyo': 1,
  'Shanghai': 0,
  'Beijing': 0,
  'Japan': 1,
  'Chinese': 3}}

In [8]:
df_tdm = pd.DataFrame(tdm)
df_tdm

Unnamed: 0,doc1,doc2,doc3,doc4,doc5
Macao,0,0,1,0,0
Tokyo,0,1,0,1,1
Shanghai,0,1,0,0,0
Beijing,1,0,0,0,0
Japan,0,0,0,1,1
Chinese,2,2,1,1,3


In [9]:
df_tdm_transpose = df_tdm.T
df_tdm_transpose

Unnamed: 0,Macao,Tokyo,Shanghai,Beijing,Japan,Chinese
doc1,0,0,0,1,0,2
doc2,0,1,1,0,0,2
doc3,1,0,0,0,0,1
doc4,0,1,0,0,1,1
doc5,0,1,0,0,1,3


In [10]:
df_tdm_transpose['Class'] = ['Yes', 'Yes', 'Yes', 'No', None]
df_tdm_transpose

Unnamed: 0,Macao,Tokyo,Shanghai,Beijing,Japan,Chinese,Class
doc1,0,0,0,1,0,2,Yes
doc2,0,1,1,0,0,2,Yes
doc3,1,0,0,0,0,1,Yes
doc4,0,1,0,0,1,1,No
doc5,0,1,0,0,1,3,


In [11]:
def cosine_similarity(data_x, data_y) -> float:
    numerator = sum(x * y for x, y in zip(data_x, data_y))
    denominator = math.sqrt(sum([x ** 2 for x in data_x])) * math.sqrt(sum([y ** 2 for y in data_y]))
    return numerator / denominator if denominator != 0 else 0

def manhattan_distance(data_x, data_y) -> float:
    return float(sum(abs(x - y) for x, y in zip(data_x, data_y)))

def euclidean_distance(data_x, data_y) -> float:
    return math.sqrt(sum((x - y) ** 2 for x, y in zip(data_x, data_y)))

def simple_weight(d_i, d_1, d_k) -> float:
    if d_k != d_1:
        return (d_k - d_i) / (d_k - d_1)
    else:
        return 1.0

def compute_distance(row_test, row_train, distance):
    if distance == "manhattan":
        return manhattan_distance(row_test, row_train)
    elif distance == "euclidean":
        return euclidean_distance(row_test, row_train)
    elif distance == "cosine":
        return cosine_similarity(row_test, row_train)
    else:
        raise NotImplementedError(f"Distance measure '{distance}' is not implemented")

def compute_weights(k_nearest, weighted):
    d_1 = k_nearest[0][1]
    d_k = k_nearest[-1][1]
    if weighted == 'dwknn':
        return [dual_weight(k_nearest[i][1], d_1, d_k) for i in range(len(k_nearest))]
    elif weighted == 'wknn':
        return [simple_weight(k_nearest[i][1], d_1, d_k) for i in range(len(k_nearest))]
    else:
        return [1] * len(k_nearest)

def knn_classifier(df_train, df_test, k, distance, weighted='none'):
    predictions = []
    
    for _, row_test in df_test.iterrows():
        
        # LANGKAH ke-1: Hitung jarak antara data testing ke setiap data training
        distances = [
            (index_train, compute_distance(row_test.drop('Class'), row_train.drop('Class'), distance))
            for index_train, row_train in df_train.iterrows()
        ]
        
        print("D (Before Sorting): ", distances)
        
        # LANGKAH 2: Urutkan hasil perhitungan jarak terdeket (pengecualian untuk metode cosine)
        distances.sort(key=lambda x: x[1], reverse=(distance == "cosine"))
        print("D (After sorting): ", distances)
        
        # LANGKAH 3: Pilih jumlah k nearest neigbors (k jumlah tetangga terdekat)
        k_nearest = distances[:k]
        print(f"K ({len(k_nearest)}): {k_nearest}")
        
        # LANGKAH 4: HITUNG BOBOT (Langkah ini opsional untuk KNN Original)
        weights = compute_weights(k_nearest, weighted)
        print("Weights: ", weights)
        
        # LANGKAH 5: AMbil label (y) dari setiap tetangga terdekat dan implementasikan weight jika langkah 4 dilakukan
        nearest_labels = [(df_train.loc[index]['Class'], weight) for (index, _), weight in zip(k_nearest, weights)]
        print("Nearest Labels: ", nearest_labels)
        
        # LANGKAH 6: Buat prediksi berdasarkan majority voting (penjumlahan bobot dilakukan jika menggunakan Weighted KNN)
        class_votes = {}
        for label, weight in nearest_labels:
            class_votes[label] = class_votes.get(label, 0) + weight
        print("Voting: ", class_votes)
        
        # LAGKAH 7: Mencari argmax/label dengan jarak terkecil atau jika mengaplikasikan WKNN, maka yg dicari adalah label yg memiliki jumlah bobot terbanyak dijadkan sebagai label prediksi
        argmax = max(class_votes.items(), key=lambda x: x[1])
        print("Argmax: ", argmax)
        
        prediction = argmax[0]        
        
        predictions.append((row_test.name, prediction))
    
    return predictions

In [12]:
data1 = [1, 2, 2, 3, -3]
data2 = [1, 2, 2, 3, -3]

print(f"Cosine Simmilarity: {cosine_similarity(data1, data2)}") # mencari kemiripan, semakin mendekati nilai 1 maka tingkat kemiripannya tinggi
print(f"Manhattan Distance: {manhattan_distance(data1, data2)}") # mencari jarak/beda/selisih, semakin mendekati nilai 0 maka jaraknya semakin dekat\
print(f"Euclidean Distance: {euclidean_distance(data1, data2)}") # mencari jarak/beda/selisih, semakin mendekati nilai 0 maka jaraknya semakin dekat\

Cosine Simmilarity: 1.0
Manhattan Distance: 0.0
Euclidean Distance: 0.0


In [13]:
df_train = df_tdm_transpose.iloc[:4]
df_train

Unnamed: 0,Macao,Tokyo,Shanghai,Beijing,Japan,Chinese,Class
doc1,0,0,0,1,0,2,Yes
doc2,0,1,1,0,0,2,Yes
doc3,1,0,0,0,0,1,Yes
doc4,0,1,0,0,1,1,No


In [14]:
df_test =  df_tdm_transpose.iloc[4:]
df_test

Unnamed: 0,Macao,Tokyo,Shanghai,Beijing,Japan,Chinese,Class
doc5,0,1,0,0,1,3,


## KNN

In [15]:
k = [x for x in range(1, len(df_train) + 1)]
distance = ['manhattan', 'euclidean', 'cosine']
for x in k:
    for y in distance:
        predictions = knn_classifier(df_train, df_test, x, y)
        print(f"\tK: {x} | Distance: {y} | Predictions: {predictions}\n")
    print()

D (Before Sorting):  [('doc1', 4.0), ('doc2', 3.0), ('doc3', 5.0), ('doc4', 2.0)]
D (After sorting):  [('doc4', 2.0), ('doc2', 3.0), ('doc1', 4.0), ('doc3', 5.0)]
K (1): [('doc4', 2.0)]
Weights:  [1]
Nearest Labels:  [('No', 1)]
Voting:  {'No': 1}
Argmax:  ('No', 1)
	K: 1 | Distance: manhattan | Predictions: [('doc5', 'No')]

D (Before Sorting):  [('doc1', 2.0), ('doc2', 1.7320508075688772), ('doc3', 2.6457513110645907), ('doc4', 2.0)]
D (After sorting):  [('doc2', 1.7320508075688772), ('doc1', 2.0), ('doc4', 2.0), ('doc3', 2.6457513110645907)]
K (1): [('doc2', 1.7320508075688772)]
Weights:  [1]
Nearest Labels:  [('Yes', 1)]
Voting:  {'Yes': 1}
Argmax:  ('Yes', 1)
	K: 1 | Distance: euclidean | Predictions: [('doc5', 'Yes')]

D (Before Sorting):  [('doc1', 0.8090398349558905), ('doc2', 0.8616404368553293), ('doc3', 0.6396021490668313), ('doc4', 0.8703882797784892)]
D (After sorting):  [('doc4', 0.8703882797784892), ('doc2', 0.8616404368553293), ('doc1', 0.8090398349558905), ('doc3', 0.6

 ## Weighted KNN (WKNN)

In [16]:
k = [x for x in range(1, len(df_train) + 1)]
distance = ['manhattan', 'euclidean', 'cosine']
for x in k:
    for y in distance:
        predictions = knn_classifier(df_train, df_test, x, y, weighted='wknn')
        print(f"\tK: {x} | Distance: {y} | Predictions: {predictions}\n")
    print()

D (Before Sorting):  [('doc1', 4.0), ('doc2', 3.0), ('doc3', 5.0), ('doc4', 2.0)]
D (After sorting):  [('doc4', 2.0), ('doc2', 3.0), ('doc1', 4.0), ('doc3', 5.0)]
K (1): [('doc4', 2.0)]
Weights:  [1.0]
Nearest Labels:  [('No', 1.0)]
Voting:  {'No': 1.0}
Argmax:  ('No', 1.0)
	K: 1 | Distance: manhattan | Predictions: [('doc5', 'No')]

D (Before Sorting):  [('doc1', 2.0), ('doc2', 1.7320508075688772), ('doc3', 2.6457513110645907), ('doc4', 2.0)]
D (After sorting):  [('doc2', 1.7320508075688772), ('doc1', 2.0), ('doc4', 2.0), ('doc3', 2.6457513110645907)]
K (1): [('doc2', 1.7320508075688772)]
Weights:  [1.0]
Nearest Labels:  [('Yes', 1.0)]
Voting:  {'Yes': 1.0}
Argmax:  ('Yes', 1.0)
	K: 1 | Distance: euclidean | Predictions: [('doc5', 'Yes')]

D (Before Sorting):  [('doc1', 0.8090398349558905), ('doc2', 0.8616404368553293), ('doc3', 0.6396021490668313), ('doc4', 0.8703882797784892)]
D (After sorting):  [('doc4', 0.8703882797784892), ('doc2', 0.8616404368553293), ('doc1', 0.809039834955890