Upload and first look

In [52]:
import pandas as pd
from pathlib import Path

df = pd.read_csv("data/poetry_dataset.csv")

df.info()        
df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13854 entries, 0 to 13853
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  13854 non-null  int64 
 1   Title       13854 non-null  object
 2   Poem        13854 non-null  object
 3   Poet        13854 non-null  object
 4   Tags        12899 non-null  object
dtypes: int64(1), object(4)
memory usage: 541.3+ KB


Unnamed: 0.1,Unnamed: 0,Title,Poem,Poet,Tags
0,0,\r\n\r\n Objects Used to Pr...,"\r\n\r\nDog bone, stapler,\r\n\r\ncribbage boa...",Michelle Menting,
1,1,\r\n\r\n The New Church\r\n...,\r\n\r\nThe old cupola glinted above the cloud...,Lucia Cherciu,


In [53]:
sample_idx = [0, 20, 18]                    

for i in sample_idx:
    text = df.loc[i, "Poem"]
    print(f"\nPoem {i}\n{repr(text[:300])}")


Poem 0
'\r\n\r\nDog bone, stapler,\r\n\r\ncribbage board, garlic press\r\n\r\n     because this window is loose—lacks\r\n\r\nsuction, lacks grip.\r\n\r\nBungee cord, bootstrap,\r\n\r\ndog leash, leather belt\r\n\r\n     because this window had sash cords.\r\n\r\nThey frayed. They broke.\r\n\r\nFeather duster, thatch of straw, empty\r\n\r\nbottle '

Poem 20
'\r\n\r\n               I got a call from the White House, from the\r\n\r\nPresident himself, asking me if I’d do him a personal\r\n\r\nfavor. I like the President, so I said, “Sure, Mr.\r\n\r\nPresident, anything you like.” He said, “Just act\r\n\r\nlike nothing’s going on. Act normal. That would\r\n\r\nmean the world to m'

Poem 18
'\r\n\r\nWhy are you still seventeen\r\n\r\nand drifting like a dog after dark,\r\n\r\ndragging a shadow you’ve found?\r\n\r\n \r\n\r\nPut it back where it belongs,\r\n\r\nand that bend of river, too. That’s not the road\r\n\r\nyou want, though you have it to yourself.\r\n\r\n \r\n\r\nGone are the cars 

Cleaning and Preprocessing

In [54]:
df['Poem_clean'] = (
    df['Poem']
      .str.replace(                 #replace \r\n into tokens
          r'(?:\r\n|\r|\n){1,}',
          lambda m: '<SB>' if m.group(0).replace('\r\n', '\n')
                                      .replace('\r', '\n').count('\n') >= 4
                      else '<LB>',
          regex=True)
      .str.replace(r'(?:<SB>\s*)+', '<SB>', regex=True)
      .str.replace(r'^<LB>\s*|\s*<LB>$', '', regex=True)
      .str.strip()
)

In [55]:
df = df.drop(columns=["Unnamed: 0"])    #drop redundant column

In [56]:
print(
    df.sample(5)[['Title','Poet','Poem_clean']]
      .to_string(index=False)
)

                                                                                    Title                Poet                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

In [57]:
# check for missing poems
missing = df['Poem_clean'].isnull().sum()
print(missing)

# check for duplicates
dupes = df.duplicated(subset=['Title','Poem_clean','Poet']).sum()
print(dupes)

# check for empty strings 
empty = (df['Poem_clean'].str.strip() == "").sum()
print(empty)

# rows that were formerly empty but now just tokens
token_only = df['Poem_clean'].isin(['<LB>', '<SB>'])
print(token_only.sum())
print(df.loc[token_only, ['Poem_clean']].head())

0
8
1
99
     Poem_clean
919        <SB>
945        <SB>
4091       <SB>
4191       <SB>
4193       <SB>


In [58]:
empty0 = (df['Poem'].str.strip() == "").sum()   
print(empty0)

#those cells were empty before cleaning, so we will drop them as well as duplicates and only-tokens

101


In [59]:
df = df[df["Poem_clean"].str.strip().astype(bool)].reset_index(drop=True)

df = df.drop_duplicates(subset=["Title","Poem_clean","Poet"]).reset_index(drop=True)

df = df[~token_only].reset_index(drop=True)

  df = df[~token_only].reset_index(drop=True)


In [None]:
def stats(df):
    # per-poem metrics
    metrics = df['Poem_clean'].apply(lambda text: pd.Series({
        'n_stanzas':            len([s for s in text.split('<SB>') if s.strip()]),
        'n_lines':              sum(len(st.split('<LB>')) for st in text.split('<SB>')),
        'avg_lines_per_stanza': (
            lambda L,S: L/S if S else 0
        )(
            sum(len(st.split('<LB>')) for st in text.split('<SB>')),
            len([s for s in text.split('<SB>') if s.strip()])
        ),
        'avg_words_per_line': (
            lambda wpl: sum(wpl)/len(wpl) if wpl else 0
        )([
            len(line.split())
            for st in text.split('<SB>')
            for line in st.split('<LB>')
            if line.strip()
        ]),
        'total_words':          sum(
            len(line.split())
            for st in text.split('<SB>')
            for line in st.split('<LB>')
            if line.strip()
        )
    }))

    print("Lines & Stanzas")
    print(metrics[['n_lines','n_stanzas','avg_lines_per_stanza']].describe(), "\n")
    print("Words per Line")
    print(metrics['avg_words_per_line'].describe(), "\n")
    print("Total Words per Poem")
    print(metrics['total_words'].describe(), "\n")


stats(df)

Lines & Stanzas
            n_lines     n_stanzas  avg_lines_per_stanza
count  13751.000000  13751.000000          13751.000000
mean      27.230020      1.729692             23.956281
std       49.276962      7.834730             44.168166
min        1.000000      0.000000              0.000000
25%        1.000000      1.000000              1.000000
50%       16.000000      1.000000             15.000000
75%       33.000000      1.000000             30.000000
max     1344.000000    637.000000           1344.000000 

Words per Line
count    13751.000000
mean        71.079127
std        284.936695
min          0.000000
25%          6.062500
50%          8.000000
75%         69.000000
max       9713.000000
Name: avg_words_per_line, dtype: float64 

Total Words per Poem
count    13751.000000
mean       251.321140
std        442.320771
min          0.000000
25%         99.000000
50%        153.000000
75%        263.000000
max      15713.000000
Name: total_words, dtype: float64 



In [64]:
metrics = df['Poem_clean'].apply(lambda text: pd.Series({
    'n_stanzas': len([s for s in text.split('<SB>') if s.strip()]),
    'n_lines':   sum(len(st.split('<LB>')) for st in text.split('<SB>')),
    'avg_words_per_line': (
        lambda wpl: sum(wpl)/len(wpl) if wpl else 0)(
            [len(l.split())
             for st in text.split('<SB>')
             for l in st.split('<LB>')
             if l.strip()]
    )
}))

# select the “long‐line” outliers
outliers = metrics[metrics['avg_words_per_line'] > 50].head(5)

# show 
for idx in outliers.index:
    print(f"— {df.at[idx,'Title']} by {df.at[idx,'Poet']} "
          f"(avg words/line = {outliers.at[idx,'avg_words_per_line']:.1f})")
    print(df.at[idx,'Poem_clean'][:500].replace('\n', '\\n'))
    print("…\n")

— 

                    Invisible Fish

                 by Joy Harjo (avg words/line = 57.0)
Invisible fish swim this ghost ocean now described by waves of sand, by water-worn rock. Soon the fish will learn to walk. Then humans will come ashore and paint dreams on the dying stone. Then later, much later, the ocean floor will be punctuated by Chevy trucks, carrying the dreamers’ decendants, who are going to the store.
…

— 

                    Don’t Bother the Earth Spirit

                 by Joy Harjo (avg words/line = 107.0)
Don’t bother the earth spirit who lives here. She is working on a story. It is the oldest story in the world and it is delicate, changing. If she sees you watching she will invite you in for coffee, give you warm bread, and you will be obligated to stay and listen. But this is no ordinary story. You will have to endure earthquakes, lightning, the deaths of all those you love, the most blinding beauty. It’s a story so compelling you may never want to leave; this

A very long 1-liners is ok, it is just prose poems.

In [65]:
# the poem with the most lines
idx_max_lines = metrics['n_lines'].idxmax()
print("Poem with max lines/stanzas")
print(df.loc[idx_max_lines, ['Title','Poet']])
print(metrics.loc[idx_max_lines, ['n_lines','n_stanzas']])
print("\nSnippet:")
print(df.at[idx_max_lines, 'Poem_clean'][:500], "…\n")

# the poem with the highest words-per-line
idx_max_wpl = metrics['avg_words_per_line'].idxmax()
print("Poem with max avg words-per-line")
print(df.loc[idx_max_wpl, ['Title','Poet']])
print(metrics.loc[idx_max_wpl, ['avg_words_per_line']])
print("\nSnippet:")
print(df.at[idx_max_wpl, 'Poem_clean'][:500], "…")

Poem with max lines/stanzas
Title    \r\n\r\n                    Song of Myself (18...
Poet                                          Walt Whitman
Name: 12981, dtype: object
n_lines      1344.0
n_stanzas       1.0
Name: 12981, dtype: float64

Snippet:
1<LB>I celebrate myself, and sing myself,<LB>And what I assume you shall assume,<LB>For every atom belonging to me as good belongs to you.<LB>I loafe and invite my soul,<LB>I lean and loafe at my ease observing a spear of summer grass.<LB>My tongue, every atom of my blood, form’d from this soil, this air,<LB>Born here of parents born here from parents the same, and their parents the same,<LB>I, now thirty-seven years old in perfect health begin,<LB>Hoping to cease not till death.<LB>Creeds and s …

Poem with max avg words-per-line
Title    \r\n\r\n                    Venus and Adonis\r...
Poet                                   William Shakespeare
Name: 2390, dtype: object
avg_words_per_line    9713.0
Name: 2390, dtype: float64

Snippet:
<L

1. the poem with most lines is ok, it is indeed a very long poem by Walt Whitmen
2. the poem with most words per line is bugged, spaces are not seen as LB or SB here.

In [66]:
import re

space_mask = df['Poem_clean'].str.contains(r' {3,}')

print(f"{space_mask.sum()} poems with runs of ≥3 spaces")

for idx, row in df[space_mask].head(5).iterrows():
    title, poet = row['Title'], row['Poet']
    snippet = row['Poem_clean'][:200].replace('\n', '\\n')
    print(f"— {title} by {poet}")
    print(snippet)
    print("…\n")

5092 poems with runs of ≥3 spaces
— 

                    Objects Used to Prop Open a Window

                 by Michelle Menting
Dog bone, stapler,<LB>cribbage board, garlic press<LB>     because this window is loose—lacks<LB>suction, lacks grip.<LB>Bungee cord, bootstrap,<LB>dog leash, leather belt<LB>     because this window 
…

— 

                    scars

                 by Truong Tran
my father’s body is a map<LB>a record of his journey<LB> <LB>he carries a bullet<LB>lodged in his left thigh<LB>there is a hollow where it entered<LB>a protruding bump where it sleeps<LB>the doctors s
…

— 

                    what remains two

                 by Truong Tran
it has long been forgotten this practice of the mother<LB>weaning a child she crushes the seeds of a green<LB>chili rubs it to her nipple what the child feels<LB>she too will   share in this act   of 
…

— 

                    Yes

                 by Debora Greger
<SB>Yes, your childhood now a legend of fountains<LB>    

Some spaceruns are just spaces, we can neglect them by excluding those poems that have LBs and SBs

In [67]:
import collections

# select poems with NO tokens
mask_no_tokens = ~df['Poem_clean'].str.contains(r'<LB>|<SB>', regex=True)
subset = df.loc[mask_no_tokens].copy()

# per-poem histogram of space-runs (≥ 2 spaces)
subset['run_hist'] = (
    subset['Poem_clean']
      .str.findall(r' {2,}')
      .apply(lambda lst: collections.Counter(len(r) for r in lst))
)

# keep poems that have at least 2 distinct run-lengths
subset_multi = subset[subset['run_hist'].apply(lambda h: len(h) >= 2)].copy()

# save to CSV for mannual check
subset_multi[['Title','Poet','run_hist','Poem_clean']].to_csv(
    "poems_with_multi_space_runs.csv", index=False
)

print("Saved to poems_with_multi_space_runs.csv")

Saved to poems_with_multi_space_runs.csv


Mannual check revealed that there are 617 poems that have no SB and LB and where spaceruns differ from 2 to 256, any patterns are hardly seen. I think it is easier to drop those 166 poems.

In [68]:
import os

# poems that still have NO tokens
mask_no_tokens = ~df['Poem_clean'].str.contains(r'<LB>|<SB>', regex=True)
subset = df.loc[mask_no_tokens].copy()

# histogram of space-runs (≥2 spaces)
subset['run_hist'] = (
    subset['Poem_clean']
      .str.findall(r' {2,}')
      .apply(lambda lst: collections.Counter(len(s) for s in lst))
)

# rows to drop: poems with ≥2 distinct run-lengths
mask_drop = subset['run_hist'].apply(lambda h: len(h) >= 2)
print("Dropping", mask_drop.sum(), "poems")

df_clean = df.drop(index=subset.loc[mask_drop].index).reset_index(drop=True)
print("Remaining poems:", len(df_clean))

os.makedirs("processed", exist_ok=True)
df_clean.to_csv("data/processed/final_clean.csv", index=False)
print("Saved to data/processed/final_clean.csv")

Dropping 617 poems
Remaining poems: 13134
Saved to data/processed/final_clean.csv


In [69]:
df = stats(df)  

Lines & Stanzas
            n_lines     n_stanzas  avg_lines_per_stanza
count  13751.000000  13751.000000          13751.000000
mean      27.230020      1.729692             23.956281
std       49.276962      7.834730             44.168166
min        1.000000      0.000000              0.000000
25%        1.000000      1.000000              1.000000
50%       16.000000      1.000000             15.000000
75%       33.000000      1.000000             30.000000
max     1344.000000    637.000000           1344.000000 

Words per Line
count    13751.000000
mean        71.079127
std        284.936695
min          0.000000
25%          6.062500
50%          8.000000
75%         69.000000
max       9713.000000
Name: avg_words_per_line, dtype: float64 

Total Words per Poem
count    13751.000000
mean       251.321140
std        442.320771
min          0.000000
25%         99.000000
50%        153.000000
75%        263.000000
max      15713.000000
Name: total_words, dtype: float64 



In [10]:
import pandas as pd

# load
df = pd.read_csv("data/processed/final_clean.csv")

cols = ['Title', 'Poet', 'Tags']

# before cleaning
print("Before cleaning:")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

# cleaning
for col in cols:
    df[col] = (
        df[col]
        # убираем любые переводы строки
        .str.replace(r'[\r\n]+', ' ', regex=True)
        # сводим множественные пробелы/табы в один пробел
        .str.replace(r'[ \t]+',   ' ', regex=True)
        # убираем пробелы/табы по краям
        .str.strip()
    )

print("\nAfter cleaning")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

df.to_csv("data/processed/final_clean.csv", index=False)

Before cleaning:
Title: \n=13134, \r=13134, leading_ws=0
Poet: \n=0, \r=0, leading_ws=1
Tags: \n=0, \r=0, leading_ws=0

After cleaning
Title: \n=0, \r=0, leading_ws=0
Poet: \n=0, \r=0, leading_ws=0
Tags: \n=0, \r=0, leading_ws=0


In [None]:
import pandas as pd

# load
df = pd.read_csv("data/processed/final_clean.csv")

cols = ['Title', 'Poet', 'Tags']

# before cleaning
print("Before cleaning:")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

# cleaning
for col in cols:
    df[col] = (
        df[col]
        # убираем любые переводы строки
        .str.replace(r'[\r\n]+', ' ', regex=True)
        # сводим множественные пробелы/табы в один пробел
        .str.replace(r'[ \t]+',   ' ', regex=True)
        # убираем пробелы/табы по краям
        .str.strip()
    )

print("\nAfter cleaning")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

df.to_csv("data/processed/final_clean.csv", index=False)

Before cleaning:
Title: \n=13134, \r=13134, leading_ws=0
Poet: \n=0, \r=0, leading_ws=1
Tags: \n=0, \r=0, leading_ws=0

After cleaning
Title: \n=0, \r=0, leading_ws=0
Poet: \n=0, \r=0, leading_ws=0
Tags: \n=0, \r=0, leading_ws=0


In [None]:
import pandas as pd

# load
df = pd.read_csv("data/processed/final_clean.csv")

cols = ['Title', 'Poet', 'Tags']

# before cleaning
print("Before cleaning:")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

# cleaning
for col in cols:
    df[col] = (
        df[col]
        # убираем любые переводы строки
        .str.replace(r'[\r\n]+', ' ', regex=True)
        # сводим множественные пробелы/табы в один пробел
        .str.replace(r'[ \t]+',   ' ', regex=True)
        # убираем пробелы/табы по краям
        .str.strip()
    )

print("\nAfter cleaning")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

df.to_csv("data/processed/final_clean.csv", index=False)

Before cleaning:
Title: \n=13134, \r=13134, leading_ws=0
Poet: \n=0, \r=0, leading_ws=1
Tags: \n=0, \r=0, leading_ws=0

After cleaning
Title: \n=0, \r=0, leading_ws=0
Poet: \n=0, \r=0, leading_ws=0
Tags: \n=0, \r=0, leading_ws=0


In [None]:
import pandas as pd

# load
df = pd.read_csv("data/processed/final_clean.csv")

cols = ['Title', 'Poet', 'Tags']

# before cleaning
print("Before cleaning:")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

# cleaning
for col in cols:
    df[col] = (
        df[col]
        # убираем любые переводы строки
        .str.replace(r'[\r\n]+', ' ', regex=True)
        # сводим множественные пробелы/табы в один пробел
        .str.replace(r'[ \t]+',   ' ', regex=True)
        # убираем пробелы/табы по краям
        .str.strip()
    )

print("\nAfter cleaning")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

df.to_csv("data/processed/final_clean.csv", index=False)

Before cleaning:
Title: \n=13134, \r=13134, leading_ws=0
Poet: \n=0, \r=0, leading_ws=1
Tags: \n=0, \r=0, leading_ws=0

After cleaning
Title: \n=0, \r=0, leading_ws=0
Poet: \n=0, \r=0, leading_ws=0
Tags: \n=0, \r=0, leading_ws=0


In [None]:
import pandas as pd

# load
df = pd.read_csv("data/processed/final_clean.csv")

cols = ['Title', 'Poet', 'Tags']

# before cleaning
print("Before cleaning:")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

# cleaning
for col in cols:
    df[col] = (
        df[col]
        # убираем любые переводы строки
        .str.replace(r'[\r\n]+', ' ', regex=True)
        # сводим множественные пробелы/табы в один пробел
        .str.replace(r'[ \t]+',   ' ', regex=True)
        # убираем пробелы/табы по краям
        .str.strip()
    )

print("\nAfter cleaning")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

df.to_csv("data/processed/final_clean.csv", index=False)

Before cleaning:
Title: \n=13134, \r=13134, leading_ws=0
Poet: \n=0, \r=0, leading_ws=1
Tags: \n=0, \r=0, leading_ws=0

After cleaning
Title: \n=0, \r=0, leading_ws=0
Poet: \n=0, \r=0, leading_ws=0
Tags: \n=0, \r=0, leading_ws=0


In [None]:
import pandas as pd

# load
df = pd.read_csv("data/processed/final_clean.csv")

cols = ['Title', 'Poet', 'Tags']

# before cleaning
print("Before cleaning:")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

# cleaning
for col in cols:
    df[col] = (
        df[col]
        # убираем любые переводы строки
        .str.replace(r'[\r\n]+', ' ', regex=True)
        # сводим множественные пробелы/табы в один пробел
        .str.replace(r'[ \t]+',   ' ', regex=True)
        # убираем пробелы/табы по краям
        .str.strip()
    )

print("\nAfter cleaning")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

df.to_csv("data/processed/final_clean.csv", index=False)

Before cleaning:
Title: \n=13134, \r=13134, leading_ws=0
Poet: \n=0, \r=0, leading_ws=1
Tags: \n=0, \r=0, leading_ws=0

After cleaning
Title: \n=0, \r=0, leading_ws=0
Poet: \n=0, \r=0, leading_ws=0
Tags: \n=0, \r=0, leading_ws=0


In [None]:
import pandas as pd

# load
df = pd.read_csv("data/processed/final_clean.csv")

cols = ['Title', 'Poet', 'Tags']

# before cleaning
print("Before cleaning:")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

# cleaning
for col in cols:
    df[col] = (
        df[col]
        # убираем любые переводы строки
        .str.replace(r'[\r\n]+', ' ', regex=True)
        # сводим множественные пробелы/табы в один пробел
        .str.replace(r'[ \t]+',   ' ', regex=True)
        # убираем пробелы/табы по краям
        .str.strip()
    )

print("\nAfter cleaning")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

df.to_csv("data/processed/final_clean.csv", index=False)

Before cleaning:
Title: \n=13134, \r=13134, leading_ws=0
Poet: \n=0, \r=0, leading_ws=1
Tags: \n=0, \r=0, leading_ws=0

After cleaning
Title: \n=0, \r=0, leading_ws=0
Poet: \n=0, \r=0, leading_ws=0
Tags: \n=0, \r=0, leading_ws=0


In [None]:
import pandas as pd

# load
df = pd.read_csv("data/processed/final_clean.csv")

cols = ['Title', 'Poet', 'Tags']

# before cleaning
print("Before cleaning:")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

# cleaning
for col in cols:
    df[col] = (
        df[col]
        # убираем любые переводы строки
        .str.replace(r'[\r\n]+', ' ', regex=True)
        # сводим множественные пробелы/табы в один пробел
        .str.replace(r'[ \t]+',   ' ', regex=True)
        # убираем пробелы/табы по краям
        .str.strip()
    )

print("\nAfter cleaning")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

df.to_csv("data/processed/final_clean.csv", index=False)

Before cleaning:
Title: \n=13134, \r=13134, leading_ws=0
Poet: \n=0, \r=0, leading_ws=1
Tags: \n=0, \r=0, leading_ws=0

After cleaning
Title: \n=0, \r=0, leading_ws=0
Poet: \n=0, \r=0, leading_ws=0
Tags: \n=0, \r=0, leading_ws=0


In [None]:
import pandas as pd

# load
df = pd.read_csv("data/processed/final_clean.csv")

cols = ['Title', 'Poet', 'Tags']

# before cleaning
print("Before cleaning:")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

# cleaning
for col in cols:
    df[col] = (
        df[col]
        # убираем любые переводы строки
        .str.replace(r'[\r\n]+', ' ', regex=True)
        # сводим множественные пробелы/табы в один пробел
        .str.replace(r'[ \t]+',   ' ', regex=True)
        # убираем пробелы/табы по краям
        .str.strip()
    )

print("\nAfter cleaning")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

df.to_csv("data/processed/final_clean.csv", index=False)

Before cleaning:
Title: \n=13134, \r=13134, leading_ws=0
Poet: \n=0, \r=0, leading_ws=1
Tags: \n=0, \r=0, leading_ws=0

After cleaning
Title: \n=0, \r=0, leading_ws=0
Poet: \n=0, \r=0, leading_ws=0
Tags: \n=0, \r=0, leading_ws=0


In [None]:
import pandas as pd

# load
df = pd.read_csv("data/processed/final_clean.csv")

cols = ['Title', 'Poet', 'Tags']

# before cleaning
print("Before cleaning:")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

# cleaning
for col in cols:
    df[col] = (
        df[col]
        # убираем любые переводы строки
        .str.replace(r'[\r\n]+', ' ', regex=True)
        # сводим множественные пробелы/табы в один пробел
        .str.replace(r'[ \t]+',   ' ', regex=True)
        # убираем пробелы/табы по краям
        .str.strip()
    )

print("\nAfter cleaning")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

df.to_csv("data/processed/final_clean.csv", index=False)

Before cleaning:
Title: \n=13134, \r=13134, leading_ws=0
Poet: \n=0, \r=0, leading_ws=1
Tags: \n=0, \r=0, leading_ws=0

After cleaning
Title: \n=0, \r=0, leading_ws=0
Poet: \n=0, \r=0, leading_ws=0
Tags: \n=0, \r=0, leading_ws=0


In [None]:
import pandas as pd

# load
df = pd.read_csv("data/processed/final_clean.csv")

cols = ['Title', 'Poet', 'Tags']

# before cleaning
print("Before cleaning:")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

# cleaning
for col in cols:
    df[col] = (
        df[col]
        # убираем любые переводы строки
        .str.replace(r'[\r\n]+', ' ', regex=True)
        # сводим множественные пробелы/табы в один пробел
        .str.replace(r'[ \t]+',   ' ', regex=True)
        # убираем пробелы/табы по краям
        .str.strip()
    )

print("\nAfter cleaning")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

df.to_csv("data/processed/final_clean.csv", index=False)

Before cleaning:
Title: \n=13134, \r=13134, leading_ws=0
Poet: \n=0, \r=0, leading_ws=1
Tags: \n=0, \r=0, leading_ws=0

After cleaning
Title: \n=0, \r=0, leading_ws=0
Poet: \n=0, \r=0, leading_ws=0
Tags: \n=0, \r=0, leading_ws=0


In [None]:
import pandas as pd

# load
df = pd.read_csv("data/processed/final_clean.csv")

cols = ['Title', 'Poet', 'Tags']

# before cleaning
print("Before cleaning:")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

# cleaning
for col in cols:
    df[col] = (
        df[col]
        # убираем любые переводы строки
        .str.replace(r'[\r\n]+', ' ', regex=True)
        # сводим множественные пробелы/табы в один пробел
        .str.replace(r'[ \t]+',   ' ', regex=True)
        # убираем пробелы/табы по краям
        .str.strip()
    )

print("\nAfter cleaning")
for col in cols:
    has_nl  = df[col].str.contains(r'\n',   regex=True, na=False).sum()
    has_cr  = df[col].str.contains(r'\r',   regex=True, na=False).sum()
    has_ws  = df[col].str.contains(r'^[ \t]+', regex=True, na=False).sum()
    print(f"{col}: \\n={has_nl}, \\r={has_cr}, leading_ws={has_ws}")

df.to_csv("data/processed/final_clean.csv", index=False)

Before cleaning:
Title: \n=13134, \r=13134, leading_ws=0
Poet: \n=0, \r=0, leading_ws=1
Tags: \n=0, \r=0, leading_ws=0

After cleaning
Title: \n=0, \r=0, leading_ws=0
Poet: \n=0, \r=0, leading_ws=0
Tags: \n=0, \r=0, leading_ws=0
