In [2]:
import gzip
import os.path as op
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

df = pd.read_csv("./osr_tweets_without_S_T_U_U.csv", engine='python')

#convert to list
docs = df.text

labels = df.topic

id2label = {}
label2id = {}
id_counter = 0
for i in range(len(labels)):
    label = labels[i]
    if label not in label2id:
        label2id[label] = id_counter
        id_counter += 1

for label, id in label2id.items():
    id2label[id] = label


for i in range(len(labels)):
    topic = labels[i]
    id = label2id[topic]
    labels[i] = id

df.text = df.text.astype(str)
df = df[df['text'].map(len) >= 10]
df.rename(columns={'topic':'label'}, inplace = True)
df_train, df_test = train_test_split(df, test_size=0.2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labels[i] = id


In [3]:
from tqdm import tqdm
from collections import Counter

k = 2

predicted_classes = []

for row_test in tqdm(df_test.iterrows(), total=df_test.shape[0]):
    test_text = row_test[1]["text"]
    test_label = row_test[1]["label"]
    c_test_text = len(gzip.compress(test_text.encode()))
    distance_from_test_instance = []
    
    for row_train in df_train.iterrows():
        train_text = row_train[1]["text"]
        train_label = row_train[1]["label"]
        c_train_text = len(gzip.compress(train_text.encode()))
        
        train_plus_test = " ".join([test_text, train_text])
        c_train_plus_test = len(gzip.compress(train_plus_test.encode()))
        
        ncd = ( (c_train_plus_test - min(c_train_text, c_test_text))
                / max(c_test_text, c_train_text) )
        distance_from_test_instance.append(ncd)
        
    sorted_idx = np.argsort(np.array(distance_from_test_instance))
    top_k_class = np.array(df_train["label"])[sorted_idx[:k]]
    predicted_class = Counter(top_k_class).most_common()[0][0]
    
    predicted_classes.append(predicted_class)
        
print("Accuracy:", np.mean(np.array(predicted_classes) == df_test["label"].values))

100%|███████████████████████████████████████| 2035/2035 [10:37<00:00,  3.19it/s]

Accuracy: 0.596068796068796





In [4]:
from tqdm import tqdm
from collections import Counter

k = 1

predicted_classes = []

for row_test in tqdm(df_test.iterrows(), total=df_test.shape[0]):
    test_text = row_test[1]["text"]
    test_label = row_test[1]["label"]
    c_test_text = len(gzip.compress(test_text.encode()))
    distance_from_test_instance = []
    
    for row_train in df_train.iterrows():
        train_text = row_train[1]["text"]
        train_label = row_train[1]["label"]
        c_train_text = len(gzip.compress(train_text.encode()))
        
        train_plus_test = " ".join([test_text, train_text])
        c_train_plus_test = len(gzip.compress(train_plus_test.encode()))
        
        ncd = ( (c_train_plus_test - min(c_train_text, c_test_text))
                / max(c_test_text, c_train_text) ) 
        
        distance_from_test_instance.append(ncd)
    
    #distance_from_test_instance = [1/dis for dis in distance_from_test_instance]
    sorted_idx = np.argsort(np.array(distance_from_test_instance))
    top_k_class = np.array(df_train["label"])[sorted_idx[:k]]
    predicted_class = Counter(top_k_class).most_common()[0][0]
    
    predicted_classes.append(predicted_class)
        
print("Accuracy:", np.mean(np.array(predicted_classes) == df_test["label"].values))

100%|███████████████████████████████████████| 2035/2035 [10:33<00:00,  3.21it/s]

Accuracy: 0.596068796068796





In [11]:
# set k from [2,8], with weighted distance
import math
for k in range(2,9):
    predicted_classes = []

    for row_test in tqdm(df_test.iterrows(), total=df_test.shape[0]):
        test_text = row_test[1]["text"]
        test_label = row_test[1]["label"]
        c_test_text = len(gzip.compress(test_text.encode()))
        distance_from_test_instance = []

        for row_train in df_train.iterrows():
            train_text = row_train[1]["text"]
            train_label = row_train[1]["label"]
            c_train_text = len(gzip.compress(train_text.encode()))

            train_plus_test = " ".join([test_text, train_text])
            c_train_plus_test = len(gzip.compress(train_plus_test.encode()))

            ncd = ( (c_train_plus_test - min(c_train_text, c_test_text))
                    / max(c_test_text, c_train_text) ) 

            # weighted distance
            weight = class_weight[train_label]
            ncd = ncd * math.log(1+weight)
            distance_from_test_instance.append(ncd)

        sorted_idx = np.argsort(np.array(distance_from_test_instance))
        top_k_class = np.array(df_train["label"])[sorted_idx[:k]]
        predicted_class = Counter(top_k_class).most_common()[0][0]

        predicted_classes.append(predicted_class)

    print("k:",k," Accuracy:", np.mean(np.array(predicted_classes) == df_test["label"].values))

100%|███████████████████████████████████████| 2037/2037 [12:26<00:00,  2.73it/s]


k: 2  Accuracy: 0.24104074619538537


100%|███████████████████████████████████████| 2037/2037 [11:39<00:00,  2.91it/s]


k: 3  Accuracy: 0.1134020618556701


100%|███████████████████████████████████████| 2037/2037 [12:04<00:00,  2.81it/s]


k: 4  Accuracy: 0.10652920962199312


100%|███████████████████████████████████████| 2037/2037 [11:36<00:00,  2.92it/s]


k: 5  Accuracy: 0.06823760432007854


100%|███████████████████████████████████████| 2037/2037 [11:09<00:00,  3.04it/s]


k: 6  Accuracy: 0.06578301423662249


100%|███████████████████████████████████████| 2037/2037 [11:36<00:00,  2.93it/s]


k: 7  Accuracy: 0.054000981836033385


100%|███████████████████████████████████████| 2037/2037 [11:37<00:00,  2.92it/s]

k: 8  Accuracy: 0.05351006381934217



