In [1]:
import spacy
import pandas as pd

from spacy import displacy
from spacy.matcher import Matcher
from spacy.tokens import Span

In [2]:
#read in the labeled dataset
df = pd.read_csv('data/all_scripts.csv')

In [5]:
#slice examples with words to learn only (label=1)
df1 = df.loc[df['label']==1]

In [6]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 210 entries, 5 to 1902
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   210 non-null    int64 
 1   script  210 non-null    object
dtypes: int64(1), object(1)
memory usage: 4.9+ KB


## Call SpaCy

In [8]:
#instantiate an empty nlp with no pipeline components
nlp = spacy.blank('en')

In [12]:
#check if it's indeed empty
#and it is
nlp.pipeline

[]

In [9]:
#and naturally there is no entity to render
displacy.render(df1['script'][6:10].apply(nlp), style='ent')



## Create Match Patterns

In [14]:
#type 1. tournament titles
fedcup_pattern = [{'IS_TITLE': True},{'LOWER': 'cup'}]
grand_slam_pattern = [{'LOWER': 'grand'}, {'LOWER': {'IN': ['slam', 'slams']}}]
tournament_pattern = [{'IS_ALPHA': True, 'POS': 'PROPN'}, {'LOWER': 'open'}]

#type 2. player names
first_name_pattern = [{'POS': 'PROPN', 'IS_TITLE': True}]
full_name_pattern = [{'IS_TITLE': True}, {'IS_TITLE': True}]

#type 3. tennis terms
double_fault_pattern = [{'LOWER': 'double'}, {'LEMMA': 'fault'}]
rally_pattern = [{'LEMMA': 'rally'}]
set_pattern = [{'POS': 'ADJ'}, {'LOWER': 'set'}]
#dropshot_pattern = [{'LOWER': 'dropshot'}]
serve_pattern = [{'LEMMA': 'serve'}]
timeout_pattern = [{'LOWER': 'medical'}, {'LOWER': 'timeout'}]
break_pattern = [{'LEMMA': 'break'}]
round_robin_pattern = [{'LOWER': 'round'}, {'LEMMA': 'robin'}]
ace_pattern = [{'LEMMA': {'IN': ['ace', 'volley', 'dropshot']}}]
winner_pattern = [{'LEMMA': 'winner'}]
break_point_pattern = [{'LOWER': 'break'}, {'LEMMA': 'point'}]
three_setter_pattern = [{'POS': 'NUM'}, {'IS_PUNCT': True}, {'LEMMA': 'set'}]

#type 4. contextual words
agressive_pattern = [{'LEMMA': 'aggressive'}]

#trial
stroke_pattern = [{'TEXT': {"REGEX": "\w*hand?"}}]

In [16]:
#instantiate a matcher
matcher = Matcher(nlp.vocab, validate=True)

In [17]:
#add the matche patterns
matcher.add('网球比赛名称', [fedcup_pattern, grand_slam_pattern, tournament_pattern])
# matcher.add('打得很凶', [agressive_pattern])
# matcher.add('球员名字', [proper_name_pattern])
# matcher.add('回合', [rally_pattern])
# matcher.add('双误', [doublefault_pattern])

In [30]:
#有match的
#code credit: https://stackoverflow.com/questions/51037383/how-do-i-add-matches-as-entities-and-visualize-it-in-spacy
for sent in sent_list:
    doc = nlp(sent)
    if matcher(doc):
        span = [Span(doc, start, end, label=match_id) for match_id, start, end in matcher(doc)]
        doc.ents = span
    #         doc.ents = list(doc.ents) + [span]
        #print([(ent.text, ent.label_) for ent in doc.ents])
        displacy.render(doc, style='ent')

ValueError: [E1010] Unable to set entity information for token 39 which is included in more than one span in entities, blocked, missing or outside.

In [13]:
#实验二
test = """
So I was a little bit trying to get the rhythm to play a little bit more aggressive, 
maybe a little bit longer rallies, that I can go to the game and be able to play.
"""
test_doc = nlp(test)
matches = matcher(test_doc)

In [27]:
span = [Span(test_doc, start, end, label=match_id) for match_id, start, end in matches]

In [28]:
test_doc.ents = span

In [29]:
displacy.render(test_doc, style='ent')

In [None]:
from spacy.tokens.span_group import SpanGroup

In [None]:
sg = SpanGroup(all_doc)

In [None]:
type(sg)

In [None]:
sg.

In [None]:
for sent in sent_list:
    doc = nlp(sent)
    if matcher(doc):
        for match_id, start, end in matcher(doc):
            span = Span(doc, start, end, label=match_id)
            doc.ents = [span]
        displacy.render(doc, style='ent')

In [None]:
all_para = ' '.join(read_docx('scripts/script02.docx'))

In [None]:
all_doc = nlp(all_para)

In [None]:
for match in matcher(all_doc):
    span = Span(doc, start, end, label=match_id)
    sg.append(span)
#     for match_id, start, end in match:
#         span = Span(doc, start, end, label=match_id)
#         print(span)

# all_match = matcher(all_doc)

In [None]:
#没有match
for sent in sent_list:
    if not matcher(nlp(sent)):
        print(sent)