In [81]:
from pathlib import Path

import pandas as pd
import spacy
from spacy.tokens import DocBin

# Loading our Data

In [49]:
# Load the corpus: one sentence per line, with line terminators stripped.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open("docs.txt", "r", encoding="utf-8") as f:
    sentences = f.read().splitlines()

In [99]:
sentences

['Steve and Matt are friends.',
 'They both work at Google.',
 'Steve is American and Matt is Canadian.',
 'Steve lives in New York and Matt lives in Toronto.']

# Creating our Pipeline and Docs

In [50]:
nlp = spacy.load("en_core_web_sm")

In [51]:
# Process all sentences as one batched stream instead of calling nlp() once
# per string; nlp.pipe yields the Doc objects in the same order as the input.
docs = list(nlp.pipe(sentences))

In [52]:
docs

[Steve and Matt are friends.,
 They both work at Google.,
 Steve is American and Matt is Canadian.,
 Steve lives in New York and Matt lives in Toronto.]

# Saving with DocBin

In [86]:
db = DocBin()

In [87]:
# Build the DocBin from the docs in one step. Rebinding `db` here (rather than
# appending to the existing instance) keeps the cell idempotent: re-running it
# no longer adds the same docs a second time.
db = DocBin(docs=docs)

In [88]:
len(db)

4

In [89]:
# DocBin.to_disk opens the target path for writing and does not create missing
# parent directories, so make sure "data/" exists before the first write.
Path("data").mkdir(parents=True, exist_ok=True)
db.to_disk("data/db")

In [90]:
new_db = DocBin()

In [92]:
# from_disk() fills the existing (so far empty) DocBin in place; the length
# check confirms that every saved doc was read back.
new_db.from_disk("data/db")
len(new_db)

4

In [96]:
new_db

<spacy.tokens._serialize.DocBin at 0x16ccbabf5b0>

In [97]:
# Rebuild Doc objects from the DocBin, passing the vocab of the loaded pipeline.
new_docs = list(new_db.get_docs(nlp.vocab))

In [98]:
for doc in new_docs:
    print(doc)

Steve and Matt are friends.
They both work at Google.
Steve is American and Matt is Canadian.
Steve lives in New York and Matt lives in Toronto.


In [53]:
df = pd.DataFrame()

In [54]:
df["text"] = sentences

In [55]:
df

Unnamed: 0,text
0,Steve and Matt are friends.
1,They both work at Google.
2,Steve is American and Matt is Canadian.
3,Steve lives in New York and Matt lives in Toro...


In [63]:
# One list per doc holding the text of every entity labelled PERSON;
# a doc without such entities contributes an empty list.
all_people = [
    [entity.text for entity in parsed.ents if entity.label_ == "PERSON"]
    for parsed in docs
]
all_people

[['Steve', 'Matt'], [], ['Steve', 'Matt'], ['Steve', 'Matt']]

In [64]:
df["people"] = all_people

In [65]:
df

Unnamed: 0,text,people
0,Steve and Matt are friends.,"[Steve, Matt]"
1,They both work at Google.,[]
2,Steve is American and Matt is Canadian.,"[Steve, Matt]"
3,Steve lives in New York and Matt lives in Toro...,"[Steve, Matt]"


In [66]:
# Deliberate counter-example: written as CSV, the list-valued "people" column
# does not survive the round trip (see the type check a few cells below).
df.to_csv("data/bad.csv", index=False)

In [67]:
# Read the CSV back: the "people" values now come back as strings that merely
# look like lists.
bad_df = pd.read_csv("data/bad.csv")
bad_df

Unnamed: 0,text,people
0,Steve and Matt are friends.,"['Steve', 'Matt']"
1,They both work at Google.,[]
2,Steve is American and Matt is Canadian.,"['Steve', 'Matt']"
3,Steve lives in New York and Matt lives in Toro...,"['Steve', 'Matt']"


In [68]:
type(bad_df.people[0])

str

In [69]:
# Feather keeps the "people" column structured: each row reads back as a NumPy
# array of names rather than as a string.
df.to_feather("data/good")

In [70]:
good_df = pd.read_feather("data/good")
good_df

Unnamed: 0,text,people
0,Steve and Matt are friends.,"[Steve, Matt]"
1,They both work at Google.,[]
2,Steve is American and Matt is Canadian.,"[Steve, Matt]"
3,Steve lives in New York and Matt lives in Toro...,"[Steve, Matt]"


In [71]:
type(good_df.people[0])

numpy.ndarray

In [72]:
good_df.people[0]

array(['Steve', 'Matt'], dtype=object)

In [73]:
# Pickle: persisting a DataFrame that holds spaCy Doc objects

In [74]:
# Store the spaCy Doc objects in a new column; note that this modifies
# good_df in place.
good_df["spacy_docs"] = docs

In [75]:
good_df

Unnamed: 0,text,people,spacy_docs
0,Steve and Matt are friends.,"[Steve, Matt]","(Steve, and, Matt, are, friends, .)"
1,They both work at Google.,[],"(They, both, work, at, Google, .)"
2,Steve is American and Matt is Canadian.,"[Steve, Matt]","(Steve, is, American, and, Matt, is, Canadian, .)"
3,Steve lives in New York and Matt lives in Toro...,"[Steve, Matt]","(Steve, lives, in, New, York, and, Matt, lives..."


In [76]:
# Pickle the whole frame, including the Doc objects in "spacy_docs".
good_df.to_pickle("data/pickle")

In [77]:
# Unpickling can execute arbitrary code, so only load pickle files you created
# yourself or otherwise trust.
spacy_df = pd.read_pickle("data/pickle")

In [78]:
spacy_df

Unnamed: 0,text,people,spacy_docs
0,Steve and Matt are friends.,"[Steve, Matt]","(Steve, and, Matt, are, friends, .)"
1,They both work at Google.,[],"(They, both, work, at, Google, .)"
2,Steve is American and Matt is Canadian.,"[Steve, Matt]","(Steve, is, American, and, Matt, is, Canadian, .)"
3,Steve lives in New York and Matt lives in Toro...,"[Steve, Matt]","(Steve, lives, in, New, York, and, Matt, lives..."


In [47]:
type(spacy_df["spacy_docs"][0])

spacy.tokens.doc.Doc