# Natural Language Processing with spaCy

This notebook will walk through basic spaCy operations for NLP, including tokenization, POS tagging, Named Entity Recognition, dependency parsing, and more.

## Libraries and settings

In [1]:
# Libraries
import os
import spacy
import pandas as pd
from spacy import displacy
from spacy.language import Language

# Download spaCy's pre-trained language model
def install_spacy_model(model_name):
    """Make sure the spaCy pipeline package `model_name` is installed.

    Tries to load the model first. If spaCy raises ``OSError`` (the model is
    not available), the package is downloaded with ``spacy download`` run by
    the same interpreter that executes this notebook.

    Args:
        model_name: Name of the spaCy pipeline package, e.g. "en_core_web_sm".

    Raises:
        subprocess.CalledProcessError: If the download command exits with a
            non-zero status.
    """
    # Imported locally so the helper carries its own dependencies.
    import subprocess
    import sys

    try:
        spacy.load(model_name)
    except OSError:
        print(f"Model '{model_name}' not found. Installing...")
        # sys.executable targets the kernel's own environment (a bare "python"
        # on PATH may be a different interpreter), and passing an argument list
        # keeps model_name away from shell interpretation.
        subprocess.run(
            [sys.executable, "-m", "spacy", "download", model_name],
            check=True,  # fail loudly here instead of at the next spacy.load()
        )
    else:
        print(f"Model '{model_name}' is already installed.")

# Make sure the English pipeline package is present before any cell tries to load it
install_spacy_model("en_core_web_sm")

# Suppress every Python warning for the rest of the session
import warnings
warnings.filterwarnings("ignore")

# Show the directory this notebook is running from
working_dir = os.getcwd()
print(working_dir)

Model 'en_core_web_sm' not found. Installing...
Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
/workspaces/python_data_preparation



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## Loading the spaCy Model
We'll load the English language model using the `spacy.load()` function.

In [2]:
# Load the English language model; `nlp` is the pipeline object that the
# following cells call on raw text to obtain a parsed document.
nlp = spacy.load("en_core_web_sm")

## Basic Text Processing
Let's process some text and extract individual tokens.

In [3]:
import pandas as pd
import spacy

# Pipeline used to parse the descriptions below
nlp = spacy.load("en_core_web_sm")

# Single-column DataFrame holding one sample apartment listing
data = {
    'description': [
        "Luxurious 3-bedroom apartment with mountain view and high-end finishes in the city of Zürich, Switzerland."
    ]
}
df = pd.DataFrame(data)

# Parse every description and list its tokens, one per line.
# `doc` is left in scope on purpose: the later cells inspect the last parsed document.
for description in df['description'].tolist():
    doc = nlp(description)
    print(f"Processing description: {description}")
    for token in doc:
        print(token.text)
    # Blank lines separate one description's tokens from the next
    print("\n")

Processing description: Luxurious 3-bedroom apartment with mountain view and high-end finishes in the city of Zürich, Switzerland.
Luxurious
3
-
bedroom
apartment
with
mountain
view
and
high
-
end
finishes
in
the
city
of
Zürich
,
Switzerland
.




## Part-of-Speech (POS) Tagging
Now, we can look at the POS tags for each token.

In [4]:
# Display tokens with their POS tags.
# The text column is sized to the longest token so that words longer than a
# fixed width (e.g. "Switzerland") no longer push their tag out of alignment.
text_width = max((len(token.text) for token in doc), default=0)
for token in doc:
    print(f'{token.text.ljust(text_width)} {token.pos_}')

Luxurious  ADJ
3          NUM
-          PUNCT
bedroom    NOUN
apartment  NOUN
with       ADP
mountain   NOUN
view       NOUN
and        CCONJ
high       ADJ
-          PUNCT
end        NOUN
finishes   NOUN
in         ADP
the        DET
city       NOUN
of         ADP
Zürich     PROPN
,          PUNCT
Switzerland PROPN
.          PUNCT


## Named Entity Recognition (NER)
Next, we will use spaCy to identify entities in a text, such as names, dates, organizations, etc.

In [5]:
# Named entity recognition: print each detected entity span followed by its label
entities = doc.ents
for ent in entities:
    print(f'{ent.text} {ent.label_}')

3 CARDINAL
Zürich GPE
Switzerland GPE


## Dependency Parsing
We'll also explore dependency parsing, which analyzes the grammatical relationships between words.

In [6]:
# Display the syntactic dependency structure: token, relation, and head token.
# Column widths come from the data so long tokens (e.g. "Switzerland") stay
# aligned; the last column is left unpadded to avoid trailing whitespace.
text_width = max((len(token.text) for token in doc), default=0)
dep_width = max((len(token.dep_) for token in doc), default=0)
for token in doc:
    print(f'{token.text.ljust(text_width)} {token.dep_.ljust(dep_width)} {token.head.text}')

Luxurious  amod       apartment 
3          nummod     bedroom   
-          punct      bedroom   
bedroom    compound   apartment 
apartment  ROOT       apartment 
with       prep       apartment 
mountain   compound   view      
view       pobj       with      
and        cc         view      
high       amod       end       
-          punct      end       
end        compound   finishes  
finishes   conj       view      
in         prep       apartment 
the        det        city      
city       pobj       in        
of         prep       city      
Zürich     pobj       of        
,          punct      Zürich    
Switzerland appos      Zürich    
.          punct      apartment 


## Visualization of entities and dependencies
We can use spaCy's `displacy` visualizer to display the entities and dependencies in a visually intuitive format.

In [7]:
# Visualize named entities.
# jupyter=True is passed to both calls so each one renders inline the same
# way, instead of one relying on auto-detection and the other forcing it.
displacy.render(doc, style="ent", jupyter=True)

# Visualize dependency parsing
displacy.render(doc, style="dep", jupyter=True)

### Jupyter notebook --footer info-- (please always provide this at the end of each notebook)

In [8]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Environment summary: OS family, platform and kernel release, timestamp, Python version
separator = '-----------------------------------'
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

print(separator)
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', timestamp)
print('Python Version:', python_version())
print(separator)

-----------------------------------
POSIX
Linux | 6.5.0-1025-azure
Datetime: 2024-11-10 20:31:05
Python Version: 3.11.10
-----------------------------------
