# Encode 

In [7]:
! python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
# 📦 Import needed package
import regex as re
from typing import Tuple

import pandas as pd
from pathlib import Path

import spacy

In [2]:
# Loading training data
training_data = pd.read_csv(
    r"/kaggle/input/fake-or-real-the-impostor-hunt/data/train.csv"
)

for i, row in training_data.iterrows():
    id = int(row.id)
    real_text_id = row.real_text_id
    fake_text_id = 1 if real_text_id == 2 else 2

    # Get file paths to text
    files_path = Path(
        rf"/kaggle/input/fake-or-real-the-impostor-hunt/data/train/article_{str(id).zfill(4)}"
    )
    real_text_path = files_path / f"file_{real_text_id}.txt"
    fake_text_path = files_path / f"file_{fake_text_id}.txt"

    # Load texts
    real_text = real_text_path.read_text()
    fake_text = fake_text_path.read_text()
    training_data.loc[i, "real_text"] = real_text
    training_data.loc[i, "fake_text"] = fake_text

training_data.head()

Unnamed: 0,id,real_text_id,real_text,fake_text
0,0,1,The VIRSA (Visible Infrared Survey Telescope A...,The China relay network has released a signifi...
1,1,2,The project aims to achieve an accuracy level ...,China\nThe goal of this project involves achie...
2,2,1,Scientists can learn about how galaxies form a...,Dinosaur eggshells offer clues about what dino...
3,3,2,The importance for understanding how stars evo...,China\nThe study suggests that multiple star s...
4,4,2,Analyzing how fast stars rotate within a galax...,Dinosaur Rex was excited about his new toy set...


In [3]:
# Load test data
test_path = Path(r"/kaggle/input/fake-or-real-the-impostor-hunt/data/test")
test_data = pd.DataFrame(columns=["article_id", "file_id", "text"])
for test_path_article in test_path.glob("**/article_*"):
    article_id = int(re.findall("\d+", test_path_article.name)[0])

    # File 1
    file_1_path = test_path_article / "file_1.txt"
    file_1 = file_1_path.read_text()
    test_data = pd.concat(
        [pd.DataFrame([[article_id, 1, file_1]], columns=test_data.columns), test_data],
        ignore_index=True,
    )

    # File 2
    file_2_path = test_path_article / "file_2.txt"
    file_2 = file_2_path.read_text()
    test_data = pd.concat(
        [pd.DataFrame([[article_id, 2, file_2]], columns=test_data.columns), test_data],
        ignore_index=True,
    )

test_data.head()

Unnamed: 0,article_id,file_id,text
0,499,2,We have analyzed a group of stars similar to t...
1,499,1,"For centuries, humans have been fascinated by ..."
2,472,2,## Operationally Challenged: The Saga of the ...
3,472,1,A call for proposals to conduct scientific ver...
4,821,2,A key historical role of AGN (Active Galactic ...


In [8]:
nlp = spacy.load("en_core_web_lg")

for i, row in training_data.iterrows():
    tokens = nlp(row.real_text)

    for token in tokens:
        print(token.text, token.has_vector, token.vector_norm, token.pos_)
    break

The True 4.70935 DET
VIRSA False 0.0 PROPN
( True 5.781955 PUNCT
Visible True 6.0000844 PROPN
Infrared True 7.4647846 PROPN
Survey True 6.39114 PROPN
Telescope True 7.4394827 PROPN
Array True 6.708615 PROPN
) True 5.489806 PUNCT
project True 6.227214 NOUN
produces True 6.013707 VERB
vast True 5.9050612 ADJ
amounts True 6.317014 NOUN
of True 4.97793 ADP
high True 5.846882 ADJ
quality True 6.5330544 NOUN
astronomical True 6.719604 ADJ
datasets True 6.549685 NOUN
used True 5.209864 VERB
extensively True 5.6134586 ADV
across True 5.783371 ADP
various True 5.5481405 ADJ
fields True 6.7900715 NOUN
within True 5.622528 ADP
astronomy True 7.342365 NOUN
due True 5.6916223 ADP
to True 4.74484 ADP
its True 5.6994023 PRON
rigorous True 6.538415 ADJ
quality True 6.5330544 NOUN
control True 6.4051557 NOUN
procedures True 6.9442024 NOUN
involving True 5.854047 VERB
continuous True 6.230595 ADJ
monitoring True 6.7755203 NOUN
system True 6.458484 NOUN
stability True 6.836838 NOUN
checks True 5.7522135 