In [1]:
import pickle
import spacy
import pandas as pd
import numpy as np

In [2]:
# load pickles
with open('clean_df.pkl', 'rb') as f:
    clean_poem_df = pickle.load(f)
    
with open('clean_line_df.pkl', 'rb') as g:
    clean_line_df = pickle.load(g)

In [3]:
clean_poem_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15157 entries, 1 to 15651
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   author             15157 non-null  category
 1   title              15157 non-null  object  
 2   content            15157 non-null  object  
 3   line               15157 non-null  object  
 4   length_in_lines    15157 non-null  int64   
 5   lexical_diversity  15157 non-null  float64 
dtypes: category(1), float64(1), int64(1), object(3)
memory usage: 765.9+ KB


In [4]:
clean_line_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 485604 entries, 1 to 15651
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   author             485604 non-null  category
 1   title              485604 non-null  object  
 2   line               485604 non-null  object  
 3   words              485604 non-null  object  
 4   length_in_words    485604 non-null  int64   
 5   lexical_diversity  485604 non-null  float64 
dtypes: category(1), float64(1), int64(1), object(3)
memory usage: 23.2+ MB


In [19]:
# initialize pipeline for tagging
import spacy
tag_nlp = spacy.load("en_core_web_sm", exclude=["parser", "attribute_ruler", "lemmatizer", "ner"])

In [20]:
tag_nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x19516779220>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x1951677dcc0>)]

In [23]:
# define tag pipe function
from tqdm import tqdm
def tag_pipe(texts):
    total = len(texts)
    pipe_out = []
    with tqdm(total=total) as pbar:
        for doc in tqdm(tag_nlp.pipe(texts)):
            pipe_out.append([str(token.tag_) for token in doc])
    return pipe_out

In [15]:
# populate part of speech tag column for lines
clean_line_df['pos_tags'] = tag_pipe(clean_line_df.line)
clean_line_df.head()

Unnamed: 0,author,title,line,words,length_in_words,lexical_diversity,pos_tags
1,Hailey Leithauser,0,philosophic,[philosophic],1,1.0,[JJ]
1,Hailey Leithauser,0,in its complex ovoid emptiness,"[in, its, complex, ovoid, emptiness]",5,1.0,"[IN, PRP$, JJ, JJ, NN]"
1,Hailey Leithauser,0,a skillful pundit coined it as a sort,"[a, skillful, pundit, coined, it, as, a, sort]",8,0.875,"[DT, JJ, NN, VBD, PRP, IN, DT, NN]"
1,Hailey Leithauser,0,of stopgap doorstop for those,"[of, stopgap, doorstop, for, those]",5,1.0,"[IN, NN, VBP, IN, DT]"
1,Hailey Leithauser,0,quaint equations,"[quaint, equations]",2,1.0,"[NN, NNS]"


In [26]:
tagged_lines_df = clean_line_df.drop(['line'], axis=1)
with open('tagged_lines_df.pkl', 'wb') as t:
    pickle.dump(tagged_lines_df, t)

In [24]:
# populate part of speech tag column for lines
clean_poem_df['pos_tags'] = tag_pipe(clean_poem_df.content)

  0%|          | 0/15157 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1it [00:01,  1.95s/it][A
257it [00:03, 81.80it/s][A
513it [00:05, 108.07it/s][A
769it [00:07, 118.20it/s][A
1025it [00:09, 118.33it/s][A
1281it [00:11, 120.31it/s][A
1537it [00:13, 121.62it/s][A
1793it [00:15, 121.30it/s][A
2049it [00:18, 116.95it/s][A
2305it [00:20, 117.06it/s][A
2561it [00:22, 117.58it/s][A
2817it [00:24, 120.95it/s][A
3073it [00:26, 120.86it/s][A
3329it [00:28, 119.68it/s][A
3585it [00:30, 121.22it/s][A
3841it [00:33, 120.39it/s][A
4097it [00:34, 128.25it/s][A
4353it [00:36, 128.00it/s][A
4609it [00:39, 117.34it/s][A
4865it [00:41, 117.89it/s][A
5121it [00:43, 122.53it/s][A
5377it [00:45, 120.07it/s][A
5633it [00:47, 121.76it/s][A
5889it [00:49, 126.09it/s][A
6145it [00:51, 127.10it/s][A
6401it [00:53, 125.29it/s][A
6657it [00:55, 121.62it/s][A
6913it [00:57, 122.85it/s][A
7169it [00:59, 123.70it/s][A
7425it [01:02, 122.87it/s][A
7681it [01:04, 119.19it/s][A
7937it [01:06,

In [25]:
clean_poem_df.head()

Unnamed: 0,author,title,content,line,length_in_lines,lexical_diversity,pos_tags
1,Hailey Leithauser,0,"Philosophic\nin its complex, ovoid emptiness,\...","[philosophic, in its complex ovoid emptiness, ...",15,0.863636,"[JJ, NN, IN, PRP$, JJ, ,, JJ, NN, ,, FW, DT, J..."
2,Jody Gladding,1-800-FEAR,We'd like to talk with you about fear t...,[wed like to talk with you about fear they sai...,11,0.663717,"[PRP, VBD, NNP, IN, NNP, IN, NNP, NN, NNP, IN,..."
3,Joseph Brodsky,1 January 1965,The Wise Men will unlearn your name.\nAbove yo...,"[the wise men will unlearn your name, above yo...",24,0.693333,"[DT, NNP, NNPS, MD, VB, PRP$, NN, ., '', IN, P..."
4,Ted Berrigan,3 Pages,For Jack Collom\n10 Things I do Every Day\n\np...,"[for jack collom, things i do every day, play ...",26,0.841463,"[IN, NNP, NNP, :, CD, NNS, PRP, VBP, DT, NN, N..."
5,Joe Brainard,30 One-Liners,WINTER\nMore time is spent at the window.\n\nS...,"[winter, more time is spent at the window, sum...",65,0.575843,"[NN, NN, JJR, NN, VBZ, VBN, IN, DT, NN, ., ADD..."


In [27]:
tagged_poems_df = clean_poem_df.drop(['line'], axis=1)
with open('tagged_poems_df.pkl', 'wb') as t2:
    pickle.dump(tagged_poems_df, t2)