# Encode 

In [2]:
! python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [3]:
# 📦 Import needed package
from pathlib import Path
from typing import Tuple

import numpy as np
import regex as re
import pandas as pd

import spacy

In [4]:
# Loading training data
training_data_labels = pd.read_csv(
    r"/kaggle/input/fake-or-real-the-impostor-hunt/data/train.csv"
)
training_data = pd.DataFrame(columns=["article_id", "file_id", "text", "is_real"])
for i, row in training_data_labels.iterrows():
    article_id = int(row.id)
    real_text_id = row.real_text_id
    fake_text_id = 1 if real_text_id == 2 else 2

    # Get file paths to text
    files_path = Path(
        rf"/kaggle/input/fake-or-real-the-impostor-hunt/data/train/article_{str(article_id).zfill(4)}"
    )
    real_text_path = files_path / f"file_{real_text_id}.txt"
    fake_text_path = files_path / f"file_{fake_text_id}.txt"

    # file 1
    file_1_path = files_path / "file_1.txt"
    file_1 = file_1_path.read_text()
    is_real = 1 if real_text_id == 1 else 0

    training_data = pd.concat(
        [
            pd.DataFrame(
                [[article_id, 1, file_1, is_real]], columns=training_data.columns
            ),
            training_data,
        ],
        ignore_index=True,
    )

    # file 2
    file_2_path = files_path / "file_2.txt"
    file_2 = file_2_path.read_text()
    is_real = 1 if real_text_id == 2 else 0

    training_data = pd.concat(
        [
            pd.DataFrame(
                [[article_id, 2, file_2, is_real]], columns=training_data.columns
            ),
            training_data,
        ],
        ignore_index=True,
    )
# Show the first few rows of the training data
training_data.head()

Unnamed: 0,article_id,file_id,text,is_real
0,94,2,The new detector system was first tested on 30...,0
1,94,1,The new detector system was first tested on 30...,1
2,93,2,The observations of the Pluto-Charon binary an...,1
3,93,1,The observations of the Pluto-Charon system an...,0
4,92,2,FORS1 and FORS2 are early instruments of the V...,1


In [5]:
# Load test data
test_path = Path(r"/kaggle/input/fake-or-real-the-impostor-hunt/data/test")
test_data = pd.DataFrame(columns=["article_id", "file_id", "text"])
for test_path_article in test_path.glob("**/article_*"):
    article_id = int(re.findall("\d+", test_path_article.name)[0])

    # File 1
    file_1_path = test_path_article / "file_1.txt"
    file_1 = file_1_path.read_text()
    test_data = pd.concat(
        [pd.DataFrame([[article_id, 1, file_1]], columns=test_data.columns), test_data],
        ignore_index=True,
    )

    # File 2
    file_2_path = test_path_article / "file_2.txt"
    file_2 = file_2_path.read_text()
    test_data = pd.concat(
        [pd.DataFrame([[article_id, 2, file_2]], columns=test_data.columns), test_data],
        ignore_index=True,
    )

test_data.head()

  article_id = int(re.findall("\d+", test_path_article.name)[0])


Unnamed: 0,article_id,file_id,text
0,38,2,variegated functionalities provided by starga ...
1,38,1,The XClass software package allows astronomers...
2,880,2,The formal partnership between ESO and Chile b...
3,880,1,The formal relationship between ESO and Chile ...
4,491,2,India's burgeoning aerospace program is making...


In [6]:
NLP_EN = spacy.load("en_core_web_lg")
LEN_VECTOR = 300


def encode_text(text: str) -> np.ndarray:
    """
    Encode the text using spaCy's language model.
    Returns a 300-dimensional vector.
    """
    tokens = NLP_EN(text)
    tot_vector = np.zeros((LEN_VECTOR), dtype=np.float32)
    num_tokens = 0
    for token in tokens:
        if token.is_stop:
            continue
        if token.pos_ not in ["ADJ", "ADV", "NOUN", "PROPN"]:
            continue
        if not token.has_vector or token.vector_norm == 0.0:
            continue
        tot_vector += token.vector
        num_tokens += 1

    if num_tokens == 0:
        return np.zeros((LEN_VECTOR), dtype=np.float32)
    else:
        return tot_vector / num_tokens


token_vectors = [f"avg_token_vector_{i}" for i in range(LEN_VECTOR)]

for i, row in training_data.iterrows():
    avg_token_vector = encode_text(row.text)
    training_data.loc[i, token_vectors] = avg_token_vector

training_data.to_csv("training_with_tokens.csv", index=False)
training_data.head()

Unnamed: 0,article_id,file_id,text,is_real,avg_token_vector_0,avg_token_vector_1,avg_token_vector_2,avg_token_vector_3,avg_token_vector_4,avg_token_vector_5,...,avg_token_vector_290,avg_token_vector_291,avg_token_vector_292,avg_token_vector_293,avg_token_vector_294,avg_token_vector_295,avg_token_vector_296,avg_token_vector_297,avg_token_vector_298,avg_token_vector_299
0,94,2,The new detector system was first tested on 30...,0,-0.077769,0.252392,-0.014818,-0.000713,-0.141581,0.045727,...,-0.184035,0.122638,0.152933,0.055888,0.14945,-0.070476,-0.061196,0.024034,-0.012061,0.13358
1,94,1,The new detector system was first tested on 30...,1,-0.074893,0.26291,-0.03037,-0.002416,-0.104175,0.058519,...,-0.171178,0.102983,0.131961,0.046443,0.140925,-0.07078,-0.061558,0.028634,-0.027038,0.107565
2,93,2,The observations of the Pluto-Charon binary an...,1,0.022929,0.105878,-0.01268,0.036975,0.006194,0.201155,...,-0.040706,-0.020288,0.014817,0.115498,0.196464,0.038277,-0.001662,0.040387,-0.037965,0.112413
3,93,1,The observations of the Pluto-Charon system an...,0,0.032472,0.14041,-0.00782,0.03182,-0.014877,0.143297,...,-0.073805,-0.017368,0.045915,0.107447,0.164487,0.033983,-0.002963,0.016147,-0.001357,0.065037
4,92,2,FORS1 and FORS2 are early instruments of the V...,1,-0.139084,0.078683,-0.013101,-0.015956,-0.091714,0.057531,...,-0.270561,0.105031,0.09374,0.094304,0.067767,-0.002802,0.06245,-0.047142,-0.03005,0.081389


In [7]:
for i, row in test_data.iterrows():
    avg_token_vector = encode_text(row.text)
    test_data.loc[i, token_vectors] = avg_token_vector

test_data.to_csv("test_with_tokens.csv", index=False)
test_data.head()

Unnamed: 0,article_id,file_id,text,avg_token_vector_0,avg_token_vector_1,avg_token_vector_2,avg_token_vector_3,avg_token_vector_4,avg_token_vector_5,avg_token_vector_6,...,avg_token_vector_290,avg_token_vector_291,avg_token_vector_292,avg_token_vector_293,avg_token_vector_294,avg_token_vector_295,avg_token_vector_296,avg_token_vector_297,avg_token_vector_298,avg_token_vector_299
0,38,2,variegated functionalities provided by starga ...,-0.072686,0.077947,-0.095928,-0.006755,-0.16621,0.076054,-0.008783,...,-0.16265,0.136994,0.181699,0.082742,0.119881,-0.132444,-0.088043,0.019951,0.087751,0.104871
1,38,1,The XClass software package allows astronomers...,-0.13174,0.018881,-0.119064,-0.056425,-0.191263,0.206301,-0.012274,...,-0.357625,0.102077,0.0955,0.214108,0.044338,-0.063704,-0.108328,-0.033336,-0.001915,0.122139
2,880,2,The formal partnership between ESO and Chile b...,-0.064031,-0.08008,0.189729,-0.046376,0.036825,0.025267,-0.099314,...,-0.035031,0.121565,0.090863,0.093841,0.164084,-0.12057,0.052982,-0.099918,-0.011719,0.050613
3,880,1,The formal relationship between ESO and Chile ...,-0.060878,-0.091044,0.191662,-0.040135,0.04972,0.042741,-0.114613,...,-0.010813,0.12397,0.073517,0.082049,0.188433,-0.11697,0.048559,-0.086427,-0.02203,0.035631
4,491,2,India's burgeoning aerospace program is making...,-0.105118,0.173255,0.018863,-0.020586,-0.067639,0.004712,-0.065683,...,-0.170255,0.178009,-0.018299,0.064377,0.070335,-0.095826,-0.018371,0.072857,-0.040606,0.103267


In [8]:
# Nearest neighbor search
from sklearn.neighbors import NearestNeighbors

for num_neight in range(2, 96, 2):
    # Fit the model
    train_array = np.array(training_data[token_vectors])
    nbrs = NearestNeighbors(n_neighbors=num_neight, algorithm="ball_tree").fit(
        train_array
    )

    # Find the nearest neighbors for each training data point
    distances, indices = nbrs.kneighbors(train_array)

    all_correct = 0
    for indic in indices:
        prediction = training_data.iloc[indic[1:]].is_real.mode()[0]
        # First neighbor it always itself
        true_y = training_data.iloc[indic[0]].is_real
        all_correct += prediction == true_y

    print(
        f"acc training: {all_correct/len(training_data):.3f} with {num_neight-1} neighbors"
    )

acc training: 0.463 with 1 neighbors
acc training: 0.595 with 3 neighbors
acc training: 0.668 with 5 neighbors
acc training: 0.674 with 7 neighbors
acc training: 0.753 with 9 neighbors
acc training: 0.732 with 11 neighbors
acc training: 0.753 with 13 neighbors
acc training: 0.737 with 15 neighbors
acc training: 0.742 with 17 neighbors
acc training: 0.747 with 19 neighbors
acc training: 0.753 with 21 neighbors
acc training: 0.758 with 23 neighbors
acc training: 0.768 with 25 neighbors
acc training: 0.768 with 27 neighbors
acc training: 0.763 with 29 neighbors
acc training: 0.758 with 31 neighbors
acc training: 0.753 with 33 neighbors
acc training: 0.758 with 35 neighbors
acc training: 0.758 with 37 neighbors
acc training: 0.768 with 39 neighbors
acc training: 0.784 with 41 neighbors
acc training: 0.784 with 43 neighbors
acc training: 0.795 with 45 neighbors
acc training: 0.795 with 47 neighbors
acc training: 0.789 with 49 neighbors
acc training: 0.774 with 51 neighbors
acc training: 0.7

In [9]:
nbrs_test = NearestNeighbors(n_neighbors=45, algorithm="ball_tree").fit(train_array)

In [43]:
submission = pd.DataFrame(columns=["id", "real_text_id"])
test_array = np.array(test_data[token_vectors])

distances, indices = nbrs_test.kneighbors(test_array)
for article_id in range(test_data.article_id.max()):
    index_file_1 = test_data[
        (test_data.article_id == article_id) & (test_data.file_id == 1)
    ].index
    index_file_2 = test_data[
        (test_data.article_id == article_id) & (test_data.file_id == 2)
    ].index

    indic_1 = indices[index_file_1]
    indic_2 = indices[index_file_2]

    pred_1 = training_data.iloc[indic_1[0]].is_real.mean()
    pred_2 = training_data.iloc[indic_2[0]].is_real.mean()

    # Get the highest prediction
    if pred_1 > pred_2:
        submission = pd.concat(
            [pd.DataFrame([{"id": article_id, "real_text_id": 1}]), submission]
        )
        continue
    elif pred_2 > pred_1:
        submission = pd.concat(
            [pd.DataFrame([{"id": article_id, "real_text_id": 2}]), submission]
        )
        continue

    # If prediction are equal get the lowest distant
    dist_1 = distances[index_file_1].mean()
    dist_2 = distances[index_file_2].mean()
    if dist_2 > dist_1:
        submission = pd.concat(
            [pd.DataFrame([{"id": article_id, "real_text_id": 1}]), submission]
        )
        continue
    else:
        submission = pd.concat(
            [pd.DataFrame([{"id": article_id, "real_text_id": 2}]), submission]
        )
        continue

submission = submission.sort_values(by="id")
submission.to_csv("submission.csv", index=False)
submission.head()

Unnamed: 0,id,real_text_id
0,0,2
0,1,2
0,2,1
0,3,1
0,4,2
