# spaCy NLP

In [30]:
import pandas as pd
import spacy
from spacy import displacy
from spacy.matcher import Matcher
import nltk

In [2]:
# Read the tab-delimited training file. header=None tells pandas the file has
# no header row, so the columns get integer labels (0, 1, ...).
df = pd.read_csv(
    'data/train.dat',
    delimiter="\t",
    header=None,
)

In [3]:
# Give the two integer-labelled columns readable names, rebinding df to the
# renamed frame rather than mutating it in place.
column_names = {0: 'condition', 1: 'abstract'}
df = df.rename(columns=column_names)
df.head()

Unnamed: 0,condition,abstract
0,4,Catheterization laboratory events and hospital...
1,5,Renal abscess in children. Three cases of rena...
2,2,Hyperplastic polyps seen at sigmoidoscopy are ...
3,5,Subclavian artery to innominate vein fistula a...
4,4,Effect of local inhibition of gamma-aminobutyr...


In [4]:
# Load the spaCy pipeline package "en_core_web_sm"; the cells below call the
# resulting `nlp` object on every abstract.
nlp = spacy.load("en_core_web_sm")

In [17]:
# Work on a copy of df so the feature columns added in later cells
# (sentences, words, ...) are not written to df itself.
lexical_df = df.copy()

In [19]:
# Sentence segmentation: build one list of sentence strings per abstract.
# nlp.pipe streams the texts through the pipeline in batches, which is faster
# than calling nlp() once per record, and yields the Docs in input order so
# the result stays aligned with the rows of lexical_df.
sentences = [
    [sentence.text for sentence in doc.sents]
    for doc in nlp.pipe(lexical_df['abstract'])
]

In [20]:
# Attach the per-abstract sentence lists as a new 'sentences' column.
lexical_df = lexical_df.assign(sentences=sentences)

In [21]:
# Preview the first rows to check the new 'sentences' column.
lexical_df.head()

Unnamed: 0,condition,abstract,sentences
0,4,Catheterization laboratory events and hospital...,[Catheterization laboratory events and hospita...
1,5,Renal abscess in children. Three cases of rena...,"[Renal abscess in children., Three cases of re..."
2,2,Hyperplastic polyps seen at sigmoidoscopy are ...,[Hyperplastic polyps seen at sigmoidoscopy are...
3,5,Subclavian artery to innominate vein fistula a...,[Subclavian artery to innominate vein fistula ...
4,4,Effect of local inhibition of gamma-aminobutyr...,[Effect of local inhibition of gamma-aminobuty...


In [27]:
# Token-level lexical features, one list per abstract:
#   words         - text of every token (tokenization)
#   lemmatization - spaCy lemma of every token (token.lemma_)
#   stemming      - NLTK Porter stem of every token's text
#   pos_tag       - token.tag_ of every token (values such as NN, NNS, IN)
words = []
lemmatization = []
stemming = []
pos_tag = []

porterStemmer = nltk.PorterStemmer()

# Read the texts from lexical_df, the frame these lists are attached to
# afterwards, and stream them through nlp.pipe so spaCy processes them in
# batches instead of one nlp() call per record. Docs come back in input order.
for doc in nlp.pipe(lexical_df["abstract"]):
    words.append([token.text for token in doc])
    lemmatization.append([token.lemma_ for token in doc])
    stemming.append([porterStemmer.stem(token.text) for token in doc])
    pos_tag.append([token.tag_ for token in doc])

In [28]:
# Attach the four token-level feature lists as new columns, in the same
# left-to-right order as before.
lexical_df = lexical_df.assign(
    words=words,
    lemmatization=lemmatization,
    stemming=stemming,
    pos_tag=pos_tag,
)

In [29]:
# Preview the first rows with the token-level feature columns added.
lexical_df.head()

Unnamed: 0,condition,abstract,sentences,words,lemmatization,stemming,pos_tag
0,4,Catheterization laboratory events and hospital...,[Catheterization laboratory events and hospita...,"[Catheterization, laboratory, events, and, hos...","[catheterization, laboratory, event, and, hosp...","[catheter, laboratori, event, and, hospit, out...","[NN, NN, NNS, CC, NN, NN, IN, JJ, NN, IN, JJ, ..."
1,5,Renal abscess in children. Three cases of rena...,"[Renal abscess in children., Three cases of re...","[Renal, abscess, in, children, ., Three, cases...","[Renal, abscess, in, child, ., three, case, of...","[renal, abscess, in, children, ., three, case,...","[NNP, NN, IN, NNS, ., CD, NNS, IN, JJ, NNS, IN..."
2,2,Hyperplastic polyps seen at sigmoidoscopy are ...,[Hyperplastic polyps seen at sigmoidoscopy are...,"[Hyperplastic, polyps, seen, at, sigmoidoscopy...","[hyperplastic, polyp, see, at, sigmoidoscopy, ...","[hyperplast, polyp, seen, at, sigmoidoscopi, a...","[JJ, NNS, VBN, IN, NN, VBP, NNS, IN, JJ, NN, V..."
3,5,Subclavian artery to innominate vein fistula a...,[Subclavian artery to innominate vein fistula ...,"[Subclavian, artery, to, innominate, vein, fis...","[subclavian, artery, to, innominate, vein, fis...","[subclavian, arteri, to, innomin, vein, fistul...","[JJ, NN, TO, VB, JJ, NN, IN, NN, IN, DT, NN, N..."
4,4,Effect of local inhibition of gamma-aminobutyr...,[Effect of local inhibition of gamma-aminobuty...,"[Effect, of, local, inhibition, of, gamma, -, ...","[effect, of, local, inhibition, of, gamma, -, ...","[effect, of, local, inhibit, of, gamma, -, ami...","[NN, IN, JJ, NN, IN, NN, HYPH, JJ, NN, NN, IN,..."


In [31]:
# Make a copy of the dataframe for the syntactic analysis.
syntax_df = lexical_df.copy()

In [32]:
# Chunking pattern: two consecutive proper nouns (tokens whose POS is PROPN).
pattern = [{"POS": "PROPN"}, {"POS": "PROPN"}]

# Build the matcher on the pipeline's vocabulary and register the pattern
# under the key "NP".
matcher = Matcher(nlp.vocab)
matcher.add("NP", [pattern])

# Apply the matcher to every abstract and keep the text of each matched span.
# nlp.pipe processes the texts in batches (faster than one nlp() call per
# record) and yields the Docs in input order, so chunking[i] belongs to row i.
chunking = []
for doc in nlp.pipe(lexical_df['abstract']):
    chunking.append(
        [doc[start:end].text for _match_id, start, end in matcher(doc)]
    )

In [33]:
# Store the chunking results in the dataframe as the 'shallow_parsing' column.
syntax_df = syntax_df.assign(shallow_parsing=chunking)

In [34]:
# Display the updated dataframe.
syntax_df.head()

Unnamed: 0,condition,abstract,sentences,words,lemmatization,stemming,pos_tag,shallow_parsing
0,4,Catheterization laboratory events and hospital...,[Catheterization laboratory events and hospita...,"[Catheterization, laboratory, events, and, hos...","[catheterization, laboratory, event, and, hosp...","[catheter, laboratori, event, and, hospit, out...","[NN, NN, NNS, CC, NN, NN, IN, JJ, NN, IN, JJ, ...",[]
1,5,Renal abscess in children. Three cases of rena...,"[Renal abscess in children., Three cases of re...","[Renal, abscess, in, children, ., Three, cases...","[Renal, abscess, in, child, ., three, case, of...","[renal, abscess, in, children, ., three, case,...","[NNP, NN, IN, NNS, ., CD, NNS, IN, JJ, NNS, IN...",[S. aureus]
2,2,Hyperplastic polyps seen at sigmoidoscopy are ...,[Hyperplastic polyps seen at sigmoidoscopy are...,"[Hyperplastic, polyps, seen, at, sigmoidoscopy...","[hyperplastic, polyp, see, at, sigmoidoscopy, ...","[hyperplast, polyp, seen, at, sigmoidoscopi, a...","[JJ, NNS, VBN, IN, NN, VBP, NNS, IN, JJ, NN, V...",[]
3,5,Subclavian artery to innominate vein fistula a...,[Subclavian artery to innominate vein fistula ...,"[Subclavian, artery, to, innominate, vein, fis...","[subclavian, artery, to, innominate, vein, fis...","[subclavian, arteri, to, innomin, vein, fistul...","[JJ, NN, TO, VB, JJ, NN, IN, NN, IN, DT, NN, N...","[subclavian vein, subclavian vein]"
4,4,Effect of local inhibition of gamma-aminobutyr...,[Effect of local inhibition of gamma-aminobuty...,"[Effect, of, local, inhibition, of, gamma, -, ...","[effect, of, local, inhibition, of, gamma, -, ...","[effect, of, local, inhibit, of, gamma, -, ami...","[NN, IN, JJ, NN, IN, NN, HYPH, JJ, NN, NN, IN,...",[pmol/250 nl]
