In [1]:
import pandas as pd
import re

In [2]:
# location of the scraped text file (raw string keeps the backslashes literal).
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook only runs where this exact file exists.
file_path = r"C:\Users\yaobv\chatgpt\hanson_again_04_18.txt"

In [3]:
# loading the file
# NOTE(review): text_file is assigned but not used by any cell shown in this
# notebook; the read below goes through file_path instead.
text_file = 'hanson_again_04_18.txt'

# read the whole scrape into a single string, decoding it as UTF-8
with open(file_path, 'r', encoding='utf-8') as hanson:
    corpus = hanson.read()

In [4]:
# the scrape contains a marker string that was inserted by hand to
# standardize the splits; cutting on it breaks the corpus into chunks
article_marker = 'XXXTHIS IS A NEW ARTICLEXXX'
split_corpus = corpus.split(article_marker)

In [5]:
# most urls returned content on this scrape.
# chunks that never mention 'Subscribe' are set aside here; per the author's
# inspection they are all eliezer yudkowsky posts and will be cleaned separately

no_subscribe = [chunk for chunk in split_corpus if 'Subscribe' not in chunk]

In [30]:
# keep only the chunks that contain 'Subscribe' and break each of them apart
# on that word, which makes the cleaning steps below tractable
split_subscribe = [chunk.split('Subscribe') for chunk in split_corpus if 'Subscribe' in chunk]

In [31]:
# getting titles and authors from the 'split on subscribe' list.
# the first piece of each chunk (the text before the first 'Subscribe') is
# used as the raw title string; newlines are removed so it is one line.
all_titles = [s[0].replace('\n', '') for s in split_subscribe]

# drop the site name and the '- ' separators, then trim whitespace
cleaned_titles = [x.replace('Overcoming Bias', '').replace('- ', '').strip() for x in all_titles]

# NOTE(review): this filter can make cleaned_titles shorter than
# split_subscribe; anything built per-chunk later (e.g. the bodies list) must
# stay the same length as the frame built from these titles -- confirm.
cleaned_titles = [x for x in cleaned_titles if 'Epilogue:' not in x]

titles = []
authors = []

for title in cleaned_titles:
    # cut on the LAST 'by ' so a title that itself contains 'by '
    # (e.g. "Ruled by Fear by Some Author") is not truncated; for strings
    # with a single 'by ' this gives the same result as a plain split.
    head, marker, tail = title.rpartition('by ')

    if marker:
        titles.append(head.strip())
        authors.append(tail.strip())
    else:
        # no 'by ' at all: keep the string as the title, author unknown
        titles.append(title)
        authors.append('N/A')

In [32]:
# initial dataframe: one row per parsed title, alongside its author
ta_df = pd.DataFrame(dict(title=titles, author=authors))

In [33]:
def find_lower_case(url):
    """
    locate the first lower case ascii letter that is immediately followed
    by an upper case ascii letter (no space in between).

    returns the index of that lower case letter. when no such pair exists,
    prints 'error' and returns the string 'error'.
    """
    boundary = re.search(r'[a-z][A-Z]', url)

    # guard clause: nothing matched, so report it and hand back the sentinel
    if boundary is None:
        print('error')
        return 'error'

    return boundary.start()

In [34]:
def get_url(url):
    """
    return the leading url part of a combined 'url + title' string.

    if the string contains 'html', everything up to and including the last
    'html' is returned. otherwise the cut is made right after the first lower
    case char that is directly followed by an upper case char (located with
    find_lower_case); 'error' is returned when no such position exists.
    """
    html_at = url.rfind('html')

    # rfind gives -1 when 'html' is absent
    if html_at != -1:
        return url[:html_at + 4]

    cut = find_lower_case(url)
    return 'error' if cut == 'error' else url[:cut + 1]

In [35]:
def get_title(url):
    """
    return the trailing title part of a combined 'url + title' string.

    if the string contains 'html', everything after the last 'html' is
    returned. otherwise the title is taken to start right after the first
    lower case char that is directly followed by an upper case char (located
    with find_lower_case); 'error' is returned when no such position exists.
    """
    html_at = url.rfind('html')

    # rfind gives -1 when 'html' is absent
    if html_at != -1:
        return url[html_at + 4:]

    cut = find_lower_case(url)
    return 'error' if cut == 'error' else url[cut + 1:]

In [36]:
# extracting links and titles from the combined 'title' column.
# both results are computed from the original column before either one
# is written back to the frame.
combined = ta_df['title']
urls = combined.apply(get_url)
titles = combined.apply(get_title)

ta_df['url'], ta_df['title'] = urls, titles

error
error


In [71]:
# getting the body from each post using a string indicator of which
# element in the split-on-subscribe list has the main content.
# the recorded run reported one chunk that could not be handled (index 1880).

bodies = []

for i, s in enumerate(split_subscribe):

    try:
        third_piece = s[2]
    except IndexError:
        # the chunk was split into fewer than three pieces; keep a
        # placeholder so bodies still has one entry per chunk
        print("ERROR", i)
        bodies.append('ERROR')
        continue

    # 'readingSign in' in the third piece means the content is in that piece.
    # every other case uses the second piece -- that includes the
    # 'to Overcoming BiasBy' marker, which was previously tested in its own
    # branch but led to the same result.
    bodies.append(third_piece if 'readingSign in' in third_piece else s[1])

ERROR 1880


In [73]:
# attach the extracted bodies as a new column.
# NOTE(review): bodies has one entry per element of split_subscribe, while the
# frame's rows come from cleaned_titles after the 'Epilogue:' filter; if that
# filter removed anything the lengths differ -- confirm they match.
ta_df['body'] = bodies

In [74]:
def fix_body(text):
    """
    remove scraped page strings (share / sign-in text, the site domain,
    non-breaking spaces) from a post body and trim surrounding whitespace.

    the substitutions run in a fixed order: the long share string is removed
    first because it contains several of the shorter strings handled later.
    """
    # literal (old, new) pairs applied before the domain is removed
    before_domain = (
        ('ShareShare this postStatus Honestywww.overcomingbias.comCopy linkTwitterFacebookEmail', ''),
        ('ShareShare this post', ' '),
        ('Continue readingSign in', ''),
    )
    # literal (old, new) pairs applied after the domain is removed
    after_domain = (
        ('\xa0', ' '),
        ('Copy linkTwitterFacebookEmail', ' '),
        ('Sign inShare this post', ' '),
    )

    cleaned = text
    for old, new in before_domain:
        cleaned = cleaned.replace(old, new)

    # using regex to replace any remaining occurrence of the site url
    cleaned = re.sub(r"www\.overcomingbias\.com", " ", cleaned)

    for old, new in after_domain:
        cleaned = cleaned.replace(old, new)

    return cleaned.strip()

In [75]:
# clean every body with fix_body, element by element; this overwrites the
# 'body' column with the cleaned text
ta_df['body'] = ta_df['body'].apply(fix_body)

In [80]:
def get_date(text):
    """
    a function to get the date from the body.

    returns the first substring shaped like 'Apr 18, 2023' (three-letter
    month abbreviation, day, comma, four-digit year), or the string 'none'
    when the body contains no such date.

    only real month abbreviations are accepted. accepting any three letters
    would also match things like 'ber 12, 2020' (the tail of
    'September 12, 2020') or 'the 12, 2020', which cannot be parsed with the
    '%b %d, %Y' format afterwards. no word boundary is required before the
    month, so a date glued to the preceding word still matches.
    """
    months = 'Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec'
    pattern = r'(?:' + months + r') \d{1,2}, \d{4}'

    # case is ignored so differently-cased month abbreviations are still found
    found = re.search(pattern, text, flags=re.IGNORECASE)

    return found.group(0) if found else 'none'

In [84]:
# extracting the date from the body
ta_df['date_string'] = ta_df['body'].apply(get_date)

# dropping rows without a date ('none' is what get_date returns on no match)
ta_df = ta_df[ta_df['date_string'] != 'none'].copy()

# converting the 'Apr 18, 2023' style strings to pandas datetime in a single
# vectorized call instead of parsing row by row
ta_df['date'] = pd.to_datetime(ta_df['date_string'], format='%b %d, %Y')

# sorting the new dataframe by date and resetting the index; reassigning
# rather than mutating in place
ta_df = ta_df.sort_values(by='date').reset_index(drop=True)

In [85]:
# splitting on the date to get rid of it (and whatever precedes it) and clean
# up the body further.
# the split is limited to the FIRST occurrence of the date string: without the
# limit, a body that repeats the same date later on would be cut off at the
# second occurrence. date_string was extracted from this same body, so the
# split always yields a second element.
new_bodies_date = [
    body.split(date, 1)[1]
    for date, body in zip(ta_df['date_string'], ta_df['body'])
]

ta_df['body'] = new_bodies_date

In [86]:
# reducing to the wanted cols and ordering them; 'date_string' is not in the
# list, so it is dropped here. .copy() makes the result an independent frame.
ta_df = ta_df[['date', 'author', 'title', 'body', 'url']].copy()

In [90]:
# saving the dataframe to csv, without the index column; the relative file
# name means it is written to the current working directory.
# TODO: come back to this to get those entries w/o 'Subscribe'
# (they were collected in no_subscribe).
ta_df.to_csv('hanson_df.csv', index=False)