In [6]:

# ! conda install numpy pandas tensorflow transformers nltk 
# ! conda install requests beautifulsoup4 pandas 

# Log of my thought process
- First I had to decide which kind of poem generator I wanted to use - template-based, Markov chain, Dust House, etc.
- Decided that those mentioned were not interesting enough
- Chose neural networks
- It was time to decide what kind of poems to use as inspiration
- Remembered Charles Bukowski because of his strong and recognizable style of writing - struggle of living life
- Scraped Poem Hunter - 151 Bukowski poems - not enough (IMO), so I scraped some more similar authors (Allen Ginsberg, Jack Kerouac, Lawrence Ferlinghetti and Sylvia Plath)
- joined scraped data and cleaned it up
- Created the tokenizer and fine-tuned GPT-2
- Then for poem generation I played around with a few parameters - especially giving the model freedom over how long a poem can be so that it does not end abruptly - as well as seed initialization, temperature, etc.
- The seed text also has a big influence - it can influence the style a lot
- The poems make sense but have no sentence-ending punctuation
- As for evaluation, I fed the poems back to ChatGPT and asked it to guess the author - it usually named at least one of the chosen authors, sometimes 3/5 or even 4/5.
- Sometimes it guessed other authors as well: William S. Burroughs and T.S. Eliot occurred a lot

In [7]:
import numpy as np
import pandas as pd
import tensorflow as tf
import time
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import nltk
nltk.download('punkt')  # Download data for NLTK tokenization


import requests
from bs4 import BeautifulSoup
import re



[nltk_data] Downloading package punkt to /Users/vanbuncha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Scraping poem authors
Charles Bukowski as main focus
and another 4 authors - Allen Ginsberg, Jack Kerouac, Lawrence Ferlinghetti, Sylvia Plath


In [8]:
# Dictionary of poets and their base URLs with the number of pages to scrape.
# Each value is (listing-page URL template with a `{}` page-number slot, page count).
poets = {
    'charles_bukowski': ("https://www.poemhunter.com/charles-bukowski/poems/page-{}/", 16),
    'allen_ginsberg': ("https://www.poemhunter.com/allen-ginsberg/poems/page-{}/", 5),  
    'jack_kerouac': ("https://www.poemhunter.com/jack-kerouac/poems/page-{}/", 3),     
    'lawrence_ferlinghetti': ("https://www.poemhunter.com/lawrence-ferlinghetti/poems/page-{}/", 4),
    'sylvia_plath': ("https://www.poemhunter.com/sylvia-plath/poems/page-{}/", 25)
}

# Seconds to wait for the server before giving up on a request. `requests`
# applies no timeout by default, so without this a stalled connection would
# block the cell indefinitely.
REQUEST_TIMEOUT = 30

# Loop through each poet in the dictionary
for poet, (base_url, num_pages) in poets.items():
    poet_name = poet.replace('_', ' ').title()
    print(f"Scraping poems by {poet_name}...")

    # Parallel lists: all_poem_titles[i] is the title of the poem at all_poem_links[i]
    all_poem_titles = []
    all_poem_links = []

    # Step 1: walk the paginated listing and collect every poem title + URL
    for page_number in range(1, num_pages + 1):
        print(f"Scraping page {page_number} of {num_pages} for {poet_name}...")

        page_url = base_url.format(page_number)
        # A failure here is allowed to propagate: silently skipping a listing
        # page would drop a whole batch of poems without any trace.
        response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, 'html.parser')

        poem_elements = soup.find_all('div', class_='phlText')

        # if no poem elements are found, we've reached the last page
        if not poem_elements:
            print(f"No poems found on page {page_number}. Ending scrape.")
            break

        # extract links and titles for each poem on the current page
        for poem_element in poem_elements:
            link = poem_element.find('a')
            # Skip entries without an anchor or without an href instead of
            # raising KeyError on a malformed listing entry.
            href = link.get('href') if link else None
            if href:
                all_poem_titles.append(link.text.strip())  # Store poem title
                all_poem_links.append("https://www.poemhunter.com" + href)  # Store complete poem URL

        # add a short delay to avoid overloading the server
        time.sleep(1)

    # Step 2: visit each poem page and pull out the poem body.
    # all_poem_texts stays index-aligned with all_poem_titles / all_poem_links.
    all_poem_texts = []

    for idx, poem_link in enumerate(all_poem_links):
        try:
            poem_response = requests.get(poem_link, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as error:
            # One unreachable poem must not throw away everything collected so
            # far for this poet (the CSV is only written after the loop).
            # Record it as an empty poem, same as a page with no poem text.
            print(f"Failed to fetch {poem_link}: {error}")
            poem_div = None
        else:
            poem_soup = BeautifulSoup(poem_response.text, 'html.parser')
            # extract the poem content from the specific div
            poem_div = poem_soup.find('div', class_='phContent phcText')

        if poem_div:
            poem_text = poem_div.get_text(separator="\n", strip=True) 
            all_poem_texts.append(poem_text)
        else:
            all_poem_texts.append("")  # If no poem text is found, append an empty string

        # Print progress
        print(f"Scraped {idx + 1}/{len(all_poem_links)}: {all_poem_titles[idx]}")

        # add a short delay to avoid overwhelming the server
        time.sleep(1)

    # Step 3: save all collected poems for this poet to '<poet>_poems.csv'
    poems_df = pd.DataFrame({'Title': all_poem_titles, 'Content': all_poem_texts})
    csv_filename = f'{poet}_poems.csv'
    poems_df.to_csv(csv_filename, index=False, encoding='utf-8')

    print(f"Scraping completed for {poet_name}! Poems saved to '{csv_filename}'.")

print("All poets have been scraped successfully!")


Scraping poems by Charles Bukowski...
Scraping page 1 of 16 for Charles Bukowski...
Scraping page 2 of 16 for Charles Bukowski...
Scraping page 3 of 16 for Charles Bukowski...
Scraping page 4 of 16 for Charles Bukowski...
Scraping page 5 of 16 for Charles Bukowski...
Scraping page 6 of 16 for Charles Bukowski...
Scraping page 7 of 16 for Charles Bukowski...
Scraping page 8 of 16 for Charles Bukowski...
Scraping page 9 of 16 for Charles Bukowski...
Scraping page 10 of 16 for Charles Bukowski...
Scraping page 11 of 16 for Charles Bukowski...
Scraping page 12 of 16 for Charles Bukowski...
Scraping page 13 of 16 for Charles Bukowski...
Scraping page 14 of 16 for Charles Bukowski...
Scraping page 15 of 16 for Charles Bukowski...
Scraping page 16 of 16 for Charles Bukowski...
Scraped 1/151: A Smile To Remember
Scraped 2/151: Alone With Everybody
Scraped 3/151: An Almost Made Up Poem
Scraped 4/151: And The Moon And The Stars And The World
Scraped 5/151: Bluebird
Scraped 6/151: A Challenge To 

# Joining the CSV files

In [9]:
# Read every per-poet CSV produced by the scraping step.
# The order of this list is the row order of the combined DataFrame.
poet_frames = [
    pd.read_csv('charles_bukowski_poems.csv'),
    pd.read_csv('allen_ginsberg_poems.csv'),
    pd.read_csv('jack_kerouac_poems.csv'),
    pd.read_csv('lawrence_ferlinghetti_poems.csv'),
    pd.read_csv('sylvia_plath_poems.csv'),
]

# Keep one named DataFrame per poet available for inspection
bukowski, ginsberg, kerouac, ferlinghetti, plath = poet_frames

# Stack all poets into one DataFrame with a fresh 0..n-1 index
all_poems_df = pd.concat(poet_frames, ignore_index=True)

# Basic text cleaning
def clean_poem(text):
    """Normalise a poem to lowercase ASCII letters separated by single spaces.

    Everything except the letters a-z and whitespace is dropped (punctuation,
    digits, accented characters), and every run of whitespace - including
    line breaks - is collapsed into one space.

    Args:
        text: The raw poem text. Non-string values (e.g. the NaN that pandas
            produces for an empty CSV cell, or None) are treated as an empty
            poem.

    Returns:
        The cleaned text as a str; '' when there is nothing to clean.
    """
    # Empty 'Content' cells come back from pd.read_csv as NaN (a float), which
    # has no .lower(); return an empty string instead of raising.
    if not isinstance(text, str):
        return ''
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters and punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Replace multiple spaces with a single space
    return text

# Run clean_poem over every raw poem and store the result next to the
# original text, so both versions remain available in the dataset.
cleaned_content = all_poems_df['Content'].apply(clean_poem)
all_poems_df['Cleaned_Content'] = cleaned_content

# Persist the combined dataset (raw + cleaned columns) without the index
all_poems_df.to_csv(
    'all_poets_cleaned_poems.csv',
    index=False,
    encoding='utf-8',
)
print("Combined and cleaned dataset saved as 'all_poets_cleaned_poems.csv'.")


Combined and cleaned dataset saved as 'all_poets_cleaned_poems.csv'.
