In [4]:
import spacy
from spacy import displacy
import pandas as pd
import re

In [5]:
from spacy.cli import download
download("en_core_web_sm")

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [6]:
nlp = spacy.load("en_core_web_sm")

In [7]:
text = ''
with open('resources/hp_genz.txt', 'r', encoding='utf-8') as file:
    text = file.read()
# cleanse, remove punctuation
text_no_newline = re.sub('\n', ' ', text)
text_no_punc = re.sub(r'[^\w\s]', '', text_no_newline)
print(text_no_punc)

 Chapter One  THE BOY WHO WASNT UNALIVED  Mr and Mrs Dursley of number four Privet Drive liked flexing that they were very basic thank u Tbh they were the last people youd think would be sus because they were all fax no printer  Mr Dursley was adulting at a firm called Grunnings which made drills  He was a dummy thiccc w three Cs man with hardly any neck although he had an absolute unit of a mustache Mrs Dursley was a total Karen with zero chill and had hella neck which came in very useful when she was stalking her neighbours and not minding her own  The Dursleys had a future incel of a son named Dudley who they thought was the main character The Dursleys were mostly thriving but they also had lowkey tea which didnt pass the vibe check and their greatest fear was to get called out and cancelled They were girlbossing too close to the sun and didnt think their clout could bounce back if their fam the Potters were revealed Milf Lily Potter was Mrs Dursleys sis but Mrs D had gone ghost irl

In [8]:
spacy_doc = nlp(text_no_punc)

In [9]:
for token in spacy_doc[:10]:
    print(token.text, token.pos_)

  SPACE
Chapter PROPN
One NUM
  SPACE
THE DET
BOY PROPN
WHO PRON
WASNT VERB
UNALIVED ADJ
  SPACE


In [10]:
pos_df = pd.DataFrame(columns=['token', 'pos_tag', 'pos'])

In [11]:
# get pos tag for each token in text
for token in spacy_doc:
    pos_df = pd.concat([pos_df, pd.DataFrame.from_records([{'token': token.text, 'pos_tag': token.pos_, 'pos': token.pos}])] , ignore_index=True)

In [12]:
pos_df.head(20)

Unnamed: 0,token,pos_tag,pos
0,,SPACE,103
1,Chapter,PROPN,96
2,One,NUM,93
3,,SPACE,103
4,THE,DET,90
5,BOY,PROPN,96
6,WHO,PRON,95
7,WASNT,VERB,100
8,UNALIVED,ADJ,84
9,,SPACE,103


In [13]:
count_by_token = pos_df.groupby(['token', 'pos_tag']).size().reset_index(name='count').sort_values('count', ascending=False)

In [14]:
count_by_token.head(20)

Unnamed: 0,token,pos_tag,count
1225,the,DET,161
0,,SPACE,132
256,and,CCONJ,87
230,a,DET,84
680,he,PRON,71
1332,was,AUX,62
706,his,PRON,54
929,of,ADP,52
924,nt,PART,46
737,in,ADP,43


In [15]:
count_by_pos = pos_df.groupby(['pos_tag'])['token'].count().sort_values(ascending=False)

In [16]:
count_by_pos.head(20)

pos_tag
NOUN     661
VERB     601
PRON     503
PROPN    395
ADP      354
DET      329
AUX      262
ADV      191
ADJ      185
SPACE    135
CCONJ    127
SCONJ    106
PART      96
NUM       23
INTJ      20
PUNCT      2
Name: token, dtype: int64

In [17]:
for token in spacy_doc.ents[:10]:
    print(token.text, token.label_)

Chapter One LAW
Mrs Dursley PERSON
four CARDINAL
Privet Drive FAC
Dursley PERSON
Grunnings ORG
three CARDINAL
Cs ORG
Mrs Dursley PERSON
Karen PERSON


In [18]:
# labeling
from spacy import displacy
#from IPython.display import display_pdf
from IPython.core.display import *
displacy.render(spacy_doc, style="ent", jupyter=True)