# Encode 

In [1]:
! python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
# 📦 Import needed package
from pathlib import Path
from typing import Tuple

import numpy as np
import regex as re
import pandas as pd

import spacy

In [3]:
# Loading training data
training_data_labels = pd.read_csv(
    r"/kaggle/input/fake-or-real-the-impostor-hunt/data/train.csv"
)
training_data = pd.DataFrame(columns=["article_id", "file_id", "text", "is_real"])
for i, row in training_data_labels.iterrows():
    article_id = int(row.id)
    real_text_id = row.real_text_id
    fake_text_id = 1 if real_text_id == 2 else 2

    # Get file paths to text
    files_path = Path(
        rf"/kaggle/input/fake-or-real-the-impostor-hunt/data/train/article_{str(article_id).zfill(4)}"
    )
    real_text_path = files_path / f"file_{real_text_id}.txt"
    fake_text_path = files_path / f"file_{fake_text_id}.txt"

    # file 1
    file_1_path = files_path / "file_1.txt"
    file_1 = file_1_path.read_text()
    is_real = 1 if real_text_id == 1 else 0

    training_data = pd.concat(
        [pd.DataFrame([[article_id, 1, file_1, is_real]], columns=training_data.columns), training_data],
        ignore_index=True,
    )

    # file 2
    file_2_path = files_path / "file_2.txt"
    file_2 = file_2_path.read_text()
    is_real = 1 if real_text_id == 2 else 0

    training_data = pd.concat(
        [pd.DataFrame([[article_id, 2, file_2, is_real]], columns=training_data.columns), training_data],
        ignore_index=True,
    )
# Show the first few rows of the training data
training_data.head()

Unnamed: 0,article_id,file_id,text,is_real
0,94,2,The new detector system was first tested on 30...,0
1,94,1,The new detector system was first tested on 30...,1
2,93,2,The observations of the Pluto-Charon binary an...,1
3,93,1,The observations of the Pluto-Charon system an...,0
4,92,2,FORS1 and FORS2 are early instruments of the V...,1


In [4]:
# Load test data
test_path = Path(r"/kaggle/input/fake-or-real-the-impostor-hunt/data/test")
test_data = pd.DataFrame(columns=["article_id", "file_id", "text"])
for test_path_article in test_path.glob("**/article_*"):
    article_id = int(re.findall("\d+", test_path_article.name)[0])

    # File 1
    file_1_path = test_path_article / "file_1.txt"
    file_1 = file_1_path.read_text()
    test_data = pd.concat(
        [pd.DataFrame([[article_id, 1, file_1]], columns=test_data.columns), test_data],
        ignore_index=True,
    )

    # File 2
    file_2_path = test_path_article / "file_2.txt"
    file_2 = file_2_path.read_text()
    test_data = pd.concat(
        [pd.DataFrame([[article_id, 2, file_2]], columns=test_data.columns), test_data],
        ignore_index=True,
    )

test_data.head()

Unnamed: 0,article_id,file_id,text
0,499,2,We have analyzed a group of stars similar to t...
1,499,1,"For centuries, humans have been fascinated by ..."
2,472,2,## Operationally Challenged: The Saga of the ...
3,472,1,A call for proposals to conduct scientific ver...
4,821,2,A key historical role of AGN (Active Galactic ...


In [5]:
NLP_EN = spacy.load("en_core_web_lg")
LEN_VECTOR = 300
def encode_text(text: str) -> np.ndarray:
    """
    Encode the text using spaCy's language model.
    Returns a 300-dimensional vector.
    """
    tokens = NLP_EN(text)
    tot_vector = np.zeros((LEN_VECTOR), dtype=np.float32)
    num_tokens = 0
    for token in tokens:
        if token.is_stop:
            continue
        if token.pos_ not in ["ADJ", "ADV", "NOUN", "PROPN"]:
            continue
        if not token.has_vector or token.vector_norm == 0.0:
            continue
        tot_vector += token.vector
        num_tokens += 1

    if num_tokens == 0:
        return np.zeros((LEN_VECTOR), dtype=np.float32)
    else:
        return tot_vector / num_tokens

token_vectors = [f"avg_token_vector_{i}" for i in range(LEN_VECTOR)]

for i, row in training_data.iterrows():
    avg_token_vector = encode_text(row.text)
    training_data.loc[i, token_vectors] = avg_token_vector

training_data.head()

Unnamed: 0,article_id,file_id,text,is_real,avg_token_vector_0,avg_token_vector_1,avg_token_vector_2,avg_token_vector_3,avg_token_vector_4,avg_token_vector_5,...,avg_token_vector_290,avg_token_vector_291,avg_token_vector_292,avg_token_vector_293,avg_token_vector_294,avg_token_vector_295,avg_token_vector_296,avg_token_vector_297,avg_token_vector_298,avg_token_vector_299
0,94,2,The new detector system was first tested on 30...,0,-0.077769,0.252392,-0.014818,-0.000713,-0.141581,0.045727,...,-0.184035,0.122638,0.152933,0.055888,0.14945,-0.070476,-0.061196,0.024034,-0.012061,0.13358
1,94,1,The new detector system was first tested on 30...,1,-0.074893,0.26291,-0.03037,-0.002416,-0.104175,0.058519,...,-0.171178,0.102983,0.131961,0.046443,0.140925,-0.07078,-0.061558,0.028634,-0.027038,0.107565
2,93,2,The observations of the Pluto-Charon binary an...,1,0.022929,0.105878,-0.01268,0.036975,0.006194,0.201155,...,-0.040706,-0.020288,0.014817,0.115498,0.196464,0.038277,-0.001662,0.040387,-0.037965,0.112413
3,93,1,The observations of the Pluto-Charon system an...,0,0.032472,0.14041,-0.00782,0.03182,-0.014877,0.143297,...,-0.073805,-0.017368,0.045915,0.107447,0.164487,0.033983,-0.002963,0.016147,-0.001357,0.065037
4,92,2,FORS1 and FORS2 are early instruments of the V...,1,-0.139084,0.078683,-0.013101,-0.015956,-0.091714,0.057531,...,-0.270561,0.105031,0.09374,0.094304,0.067767,-0.002802,0.06245,-0.047142,-0.03005,0.081389


In [6]:
# for i, row in test_data.iterrows():
#     avg_token_vector = encode_text(row.text)
#     test_data.loc[i, token_vectors] = avg_token_vector

# test_data.head()

In [10]:
np.array(training_data[token_vectors]).shape

(190, 300)

In [13]:
# Nearest neighbor search
from sklearn.neighbors import NearestNeighbors

# Fit the model
train_array = np.array(training_data[token_vectors])
nbrs = NearestNeighbors(n_neighbors=6, algorithm='ball_tree').fit(train_array)

# Find the nearest neighbors for each training data point
distances, indices = nbrs.kneighbors(train_array)

In [17]:
training_data.iloc[indices[0][1:]].is_real

1     1
97    1
96    0
68    1
22    1
Name: is_real, dtype: object