In [1]:
import spacy
import pandas as pd
import matplotlib.pyplot as plt
import random
import pickle

from spacy import displacy
from spacy.matcher import Matcher

In [12]:
# Start from a blank English pipeline: only the language code is passed, and
# the following cell shows that its component list is empty.
nlp = spacy.blank('en')

In [13]:
# Sanity check: display the components of the blank `nlp` pipeline.
# The recorded output is an empty list, i.e. no components are installed.
nlp.pipeline

[]

In [14]:
# Load the scripts table; the path is relative to the notebook's working directory.
# The info() output further down shows the columns `label` and `script`.
df = pd.read_csv('data/all_scripts.csv')

In [15]:
# Keep only the rows whose label is 1 (the examples containing words to learn).
is_word_to_learn = df['label'] == 1
df1 = df[is_word_to_learn]

In [16]:
# Renumber the filtered rows 0..n-1 and discard the old index.
# Rebinding instead of `inplace=True` avoids mutating a frame that was derived
# by slicing `df`, and leaves the cell safe to re-run.
df1 = df1.reset_index(drop=True)

In [17]:
# Summarise the filtered frame: index range, columns, non-null counts and dtypes.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   208 non-null    int64 
 1   script  208 non-null    object
dtypes: int64(1), object(1)
memory usage: 3.4+ KB


In [18]:
# Run the first script through `nlp` and ask displaCy for the entity view.
# With the blank pipeline created above there are no entities to highlight.
first_doc = nlp(df1['script'][0])
displacy.render(first_doc, style='ent')



## Create Match Patterns

In [3]:
# Rebind `nlp` to the pretrained 'en_core_web_sm' pipeline; every later cell
# that refers to `nlp` (matcher vocabulary, training-data generation) uses
# whichever binding was executed last.
# NOTE(review): this cell's execution count (3) is out of sequence with the
# surrounding cells (12-24), so the saved results may have been produced with
# the blank pipeline instead. Presumably the POS/LEMMA-based patterns below
# need this pretrained pipeline — restart and run all cells in order to confirm.
nlp = spacy.load('en_core_web_sm')

In [20]:
# Token-level patterns for spaCy's Matcher. Every pattern is a list of dicts,
# one dict per consecutive token, keyed by the token attribute to test.

# type 1. tournament titles
# a title-cased token followed by "cup" (any casing)
fedcup_pattern = [{'IS_TITLE': True}, {'LOWER': 'cup'}]
# "grand" followed by "slam" or "slams" (any casing)
grand_slam_pattern = [{'LOWER': 'grand'}, {'LOWER': {'IN': ['slam', 'slams']}}]
# an alphabetic proper noun followed by "open" (any casing)
tournament_pattern = [{'IS_ALPHA': True, 'POS': 'PROPN'}, {'LOWER': 'open'}]

# type 2. player names
# a single title-cased proper noun
first_name_pattern = [{'POS': 'PROPN', 'IS_TITLE': True}]
# two consecutive title-cased tokens (no part-of-speech constraint)
full_name_pattern = [{'IS_TITLE': True}, {'IS_TITLE': True}]

# type 3. tennis terms
double_fault_pattern = [{'LOWER': 'double'}, {'LEMMA': 'fault'}]
rally_pattern = [{'LEMMA': 'rally'}]
# any adjective followed by "set"
set_pattern = [{'POS': 'ADJ'}, {'LOWER': 'set'}]
dropshot_pattern = [{'LOWER': 'dropshot'}]
serve_pattern = [{'LEMMA': 'serve'}]
timeout_pattern = [{'LOWER': 'medical'}, {'LOWER': 'timeout'}]
break_pattern = [{'LEMMA': 'break'}]
round_robin_pattern = [{'LOWER': 'round'}, {'LEMMA': 'robin'}]
# one token whose lemma is any of the listed shot names
ace_pattern = [{'LEMMA': {'IN': ['ace', 'volley', 'dropshot']}}]
winner_pattern = [{'LEMMA': 'winner'}]
break_point_pattern = [{'LOWER': 'break'}, {'LEMMA': 'point'}]
# number + punctuation + a token with lemma "set"
three_setter_pattern = [{'POS': 'NUM'}, {'IS_PUNCT': True}, {'LEMMA': 'set'}]

# type 4. contextual words
agressive_pattern = [{'LEMMA': 'aggressive'}]

# trial
# Raw string literal: `\w` is not a Python string escape, so the non-raw form
# only worked because unknown escapes are passed through (with a warning on
# newer Python versions). The pattern text itself is unchanged.
# NOTE(review): the trailing `?` makes the final "d" optional, so tokens that
# merely contain "han" can satisfy the regex as well — confirm that is intended.
stroke_pattern = [{'TEXT': {"REGEX": r"\w*hand?"}}]

In [21]:
# Create a token Matcher that shares the vocabulary of the current `nlp`.
# validate=True is passed so that patterns are validated when they are added.
matcher = Matcher(nlp.vocab, validate=True)

In [22]:
# Register the match patterns under their entity labels.
# Only the tournament-title group is active for now; the other groups are kept
# below, commented out, until they are tried.
# Label glossary: 网球比赛名称 = "tennis tournament name", 网球术语 = "tennis term",
# 球员名字 = "player name", 打得很凶 = "plays aggressively".
# NOTE(review): per the spaCy docs, adding to an existing key extends its
# patterns — re-create the matcher before re-running this cell.
tournament_patterns = [fedcup_pattern, grand_slam_pattern, tournament_pattern]
matcher.add('网球比赛名称', tournament_patterns)
# matcher.add('网球术语', [double_fault_pattern,
#                         rally_pattern, set_pattern, 
#                         dropshot_pattern, 
#                         serve_pattern, 
#                         timeout_pattern, 
#                         break_pattern, 
#                         round_robin_pattern, 
#                         ace_pattern, winner_pattern, 
#                         break_point_pattern, 
#                         three_setter_pattern])
# matcher.add('球员名字', [first_name_pattern, full_name_pattern])
# matcher.add('打得很凶', [agressive_pattern])

## Generate Training Data

In [23]:
def generate_train_data(text):
    """Turn one raw text into an NER-style training example.

    The text is processed with the module-level ``nlp`` pipeline and scanned
    with the module-level ``matcher``; each retained match becomes one entity.

    Args:
        text: The raw string to annotate.

    Returns:
        A ``(text, {'entities': [(start_char, end_char, label), ...]})`` tuple.
        The offsets are character offsets into the returned text and ``label``
        is the key under which the matching patterns were registered. The
        entity list is empty when nothing matched, and its entries never
        overlap.
    """
    doc = nlp(text)

    # Map each matched token range to its label. If several keys hit the exact
    # same range, the first one reported by the matcher is kept.
    label_by_range = {}
    for match_id, start, end in matcher(doc):
        label_by_range.setdefault((start, end), nlp.vocab.strings[match_id])

    # Different patterns can fire on overlapping tokens, and overlapping
    # offsets are not usable as entity annotations. filter_spans keeps a
    # non-overlapping subset, preferring the longest span and, on ties, the
    # one that starts first.
    candidates = [doc[start:end] for start, end in label_by_range]
    kept = spacy.util.filter_spans(candidates)

    detection = [
        (span.start_char, span.end_char, label_by_range[(span.start, span.end)])
        for span in kept
    ]
    return (doc.text, {'entities': detection})

In [24]:
# Annotate every script exactly once and keep only the examples in which at
# least one entity was detected. Filtering on the generated example avoids
# running the pipeline and the matcher a second time per script.
train = [
    example
    for example in map(generate_train_data, df1['script'])
    if example[1]['entities']
]

In [43]:
# Serialise the `train` list to 'data/train_data' as a binary pickle.
with open('data/train_data', mode='wb') as out_file:
    pickle.dump(train, out_file)