In [4]:
import json
import pandas as pd
import numpy as np
import os
import folium
import googlemaps
import re

In [29]:
# Text that terminates every entry in the source file, and the prefix that opens
# an entry's free-text section.
ENTRY_DELIMITER = '-----------------END OF ENTRY---------------------------'
CONTENT_MARKER = 'Content:'


def _parse_entry(entry):
    """
    Convert one stripped entry into a record dict.

    Lines before the first line starting with 'Content:' are treated as header
    lines; each header line containing ': ' is split once into key and value.
    Everything from the 'Content:' line onwards (including any text on that same
    line) is joined with newlines and stored under 'Content'.
    """
    lines = entry.splitlines()

    # Position of the line that opens the content section (None if there is none).
    content_start = next(
        (pos for pos, line in enumerate(lines) if line.startswith(CONTENT_MARKER)),
        None,
    )
    header_lines = lines if content_start is None else lines[:content_start]

    meta = {}
    for header in header_lines:
        if ': ' in header:
            key, val = header.split(': ', 1)
            meta[key] = val

    content_lines = []
    if content_start is not None:
        # Text that follows the marker on the same line belongs to the content too.
        same_line_text = lines[content_start][len(CONTENT_MARKER):].strip()
        if same_line_text:
            content_lines.append(same_line_text)
        content_lines.extend(lines[content_start + 1:])

    meta['Content'] = "\n".join(content_lines).strip()
    return meta


# 1. Load the whole file into memory
with open('CelebrityData.txt', 'r', encoding='utf-8') as source_file:
    text = source_file.read()

# 2. Cut the text into entries at the delimiter
raw_entries = text.split(ENTRY_DELIMITER)

# 3./4. Parse every non-blank entry into a record
stripped_entries = (raw.strip() for raw in raw_entries)
records = [_parse_entry(chunk) for chunk in stripped_entries if chunk]

# 5. Assemble the parsed records into a DataFrame (one row per entry)
df = pd.DataFrame(records)

# 6a. Convert the 'Published Date' strings to datetimes using an explicit format
published_date_format = '%b %d, %Y, %I:%M %p'
df['Published Date'] = pd.to_datetime(df['Published Date'], format=published_date_format)

# 6b. Normalise 'Sale Price (int)' to a numeric column.
#     Step 1: cast to text and strip every character that is not a digit or a decimal point.
price_digits = df['Sale Price (int)'].astype(str).str.replace(r'[^0-9.]', '', regex=True)

#     Step 2: convert to numbers; anything still unparseable becomes NaN.
df['Sale Price (int)'] = pd.to_numeric(price_digits, errors='coerce')

#  Optional: drop rows where price failed to parse
# df = df.dropna(subset=['Sale Price (int)'])

# Now df is ready:
# Columns: ['URL', 'Title', 'Subhead', 'Authors',
#           'Published Date', 'Market', 'Geographic Market',
#           'Subject Address', 'Sale Price (text)',
#           'Sale Price (int)', 'Content']

# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() emits an extra "None" line.
df.info()
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   URL                133 non-null    object        
 1   Title              133 non-null    object        
 2   Subhead            133 non-null    object        
 3   Authors            133 non-null    object        
 4   Published Date     133 non-null    datetime64[ns]
 5   Market             133 non-null    object        
 6   Geographic Market  133 non-null    object        
 7   Subject Address    133 non-null    object        
 8   Sale Price (text)  133 non-null    object        
 9   Sale Price (int)   118 non-null    float64       
 10  Content            134 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(9)
memory usage: 11.6+ KB
None
                                                 URL  \
0  https://therealdeal.com/miami/2025/01/28/sea

In [30]:
# Keep only rows that have at least this many non-null values; rows with fewer
# (e.g. a record where only 'Content' was parsed) are discarded.
min_populated_fields = 3
df = df.dropna(thresh=min_populated_fields)

In [32]:
# Display the column labels of df at this point in the notebook.
df.columns

Index(['URL', 'Title', 'Subhead', 'Authors', 'Published Date', 'Market',
       'Geographic Market', 'Subject Address', 'Sale Price (text)',
       'Sale Price (int)', 'Content'],
      dtype='object')

In [43]:
# Drop rows in which a field holds a "not found" placeholder sentence instead of data.
# Each pattern is a regular expression (the str.contains default), so the trailing
# '...' matches any three characters rather than a literal ellipsis.
placeholder_patterns = [
    ('Subject Address', 'The article does not...'),
    ('Subject Address', 'The address is not...'),
    ('Sale Price (text)', 'The article does not...'),
    ('Content', 'The article mentions several properties but does not provide specific addresses'),
]

# Build one combined mask instead of re-filtering the frame once per pattern.
# na=False counts a missing value as "no match"; without it a NaN in any of these
# columns leaks into the mask and the `~` negation below fails.
has_placeholder = pd.Series(False, index=df.index)
for column, pattern in placeholder_patterns:
    has_placeholder |= df[column].str.contains(pattern, na=False)

df = df[~has_placeholder]

In [34]:
# Display the column labels of df at this point in the notebook.
df.columns

Index(['URL', 'Title', 'Subhead', 'Authors', 'Published Date', 'Market',
       'Geographic Market', 'Subject Address', 'Sale Price (text)',
       'Sale Price (int)', 'Content'],
      dtype='object')

In [44]:
# Write the cleaned (not yet geocoded) frame to CSV in the working directory.
cleaned_csv_path = "Celebrity_Data_Without_Coords_cleaned.csv"
df.to_csv(cleaned_csv_path)

In [45]:
# Restore `google_maps_API_Key` from IPython's %store persistence (it must have been
# saved with `%store google_maps_API_Key` beforehand), so the key is not written here.
%store -r google_maps_API_Key
# NOTE: despite its name, `gmaps_key` holds the googlemaps.Client built from the key,
# not the key itself.
gmaps_key = googlemaps.Client(key=google_maps_API_Key)


In [46]:
# Define the geocode function
def geocode(add):
    """
    Look up the coordinates of an address through the module-level `gmaps_key` client.

    Parameters:
        add: Address text to geocode.

    Returns:
        tuple | None: (lat, lng) taken from the first result, or None when the
        address is missing/blank or the lookup returns no results.
    """
    # A missing (None/NaN) or blank address cannot be geocoded, so skip the API call.
    if not isinstance(add, str) or not add.strip():
        return None

    results = gmaps_key.geocode(add)
    if not results:
        return None

    location = results[0]["geometry"]["location"]
    return (location["lat"], location["lng"])

# Geocode every 'Subject Address' and store the result of geocode()
# (a (lat, lng) tuple or None) in the 'geocoded' column.
subject_addresses = df['Subject Address']
df['geocoded'] = subject_addresses.apply(geocode)

In [47]:
# Tally how many rows have (True) or lack (False) a missing geocoding result.
geocode_is_missing = df['geocoded'].isna()
geocode_is_missing.value_counts()

geocoded
False    132
Name: count, dtype: int64

In [48]:
def _split_lat_lon(coord_text):
    """
    Split the text form of a geocoding result into its two parts.

    'None' yields (None, None); anything else has its surrounding parentheses
    stripped and is split once on ', ' into [lat_text, lon_text].
    """
    if coord_text == 'None':
        return (None, None)
    return coord_text.strip('()').split(', ', 1)


# Work on the string representation of each result, e.g. "(25.7, -80.1)" or "None".
df['geocoded'] = df['geocoded'].astype(str)

# Expand the two parts into separate columns, then convert them to floats.
coord_parts = df['geocoded'].apply(_split_lat_lon).apply(pd.Series)
df[['lat', 'lon']] = coord_parts
for coord_column in ('lat', 'lon'):
    df[coord_column] = df[coord_column].astype(float)

In [49]:
# Keep only the rows that ended up with a latitude.
has_latitude = df['lat'].notna()
df = df[has_latitude]

In [50]:
# Display the column labels of df, which now include 'geocoded', 'lat' and 'lon'.
df.columns

Index(['URL', 'Title', 'Subhead', 'Authors', 'Published Date', 'Market',
       'Geographic Market', 'Subject Address', 'Sale Price (text)',
       'Sale Price (int)', 'Content', 'geocoded', 'lat', 'lon'],
      dtype='object')

In [51]:
import pandas as pd
import folium
from folium.plugins import MarkerCluster, Fullscreen, MiniMap

def create_popup_html(row: pd.Series) -> str:
    """
    Generates an HTML popup for a given DataFrame row.

    Every value is HTML-escaped before it is placed in the markup, so characters
    such as '<', '&' or quotes in the article data cannot break the popup or
    inject markup into the page.

    Parameters:
        row (pd.Series): A row from the DataFrame containing article information.
            Only its dict-style ``.get(key, default)`` lookup is used.

    Returns:
        str: HTML string for the popup.
    """
    # Function-scope import: keeps this definition self-contained.
    from html import escape

    def field(key: str, default: str) -> str:
        """Return the row's value for `key` (or `default` if the key is absent), HTML-escaped."""
        return escape(str(row.get(key, default)), quote=True)

    title = field('Title', 'No Title')
    subhead = field('Subhead', 'N/A')
    authors = field('Authors', 'Unknown')
    pub_date = field('Published Date', 'N/A')
    subject_address = field('Subject Address', 'N/A')
    sale_price_text = field('Sale Price (text)', 'N/A')
    # quote=True in field() also escapes quotes, so the URL cannot close the href attribute.
    url = field('URL', '#')

    popup_html = f"""
    <div class="popup-content">
        <h4 style="margin-bottom:5px;">{title}</h4>
        <p><strong>{subhead}</strong></p>
        <p><em>{authors}</em></p>
        <p><strong>Published:</strong> {pub_date}</p>
        <p>{subject_address}</p>
        <p><strong>Sale Price:</strong> {sale_price_text}</p>
        <p><a href="{url}" target="_blank">Read more</a></p>
    </div>
    """
    return popup_html

# Create the base map, centered on the geographic center of the contiguous US.
# The map does not depend on any row of df, so it is built without reading the
# first row (which would raise IndexError on an empty frame).
m = folium.Map(location=[39.8283, -98.5795], zoom_start=4, scrollWheelZoom=False)

# Add a custom Mapbox tile layer.
# The access token is taken from the MAPBOX_ACCESS_TOKEN environment variable when it is
# set, so it can be rotated without editing the notebook; otherwise the token that was
# already embedded here is used, which keeps existing runs working unchanged.
mapbox_token = os.environ.get(
    'MAPBOX_ACCESS_TOKEN',
    'pk.eyJ1IjoidHJkZGF0YSIsImEiOiJjamc2bTc2YmUxY2F3MnZxZGh2amR2MTY5In0.QlOWqB-yQNrNlXD0KQ9IvQ',
)
# Plain concatenation (not an f-string) so the {z}/{x}/{y} placeholders stay literal.
mapbox_tile_url = (
    'https://api.mapbox.com/styles/v1/mapbox/streets-v11/tiles/256/{z}/{x}/{y}@2x?access_token='
    + mapbox_token
)

# NOTE(review): overlay=True with show=False and control=False looks like the layer
# starts hidden with no control to enable it - confirm this is intended.
folium.TileLayer(
    tiles=mapbox_tile_url,
    attr='Mapbox',
    name='Streets',
    overlay=True,
    control=False,
    show=False,
    min_zoom=1,
    max_zoom=20
).add_to(m)

# Stylesheet for the popups: sizing and typography for .popup-content plus the
# background, border and shadow of Leaflet's own popup containers.
custom_css = """
<style>
    .popup-content {
        min-width: 300px;
        font-size: 14px;
        line-height: 1.4;
        color: #333;
        white-space: normal;
        word-wrap: break-word;
    }
    .leaflet-popup, .leaflet-popup-content-wrapper {
        background-color: #f9f9f9;
        border: 1px solid #bbb;
        border-radius: 5px;
        padding: 8px;
        box-shadow: 0 2px 6px rgba(0,0,0,0.1);
    }
    .leaflet-popup-tip {
        background: #f9f9f9;
    }
</style>
"""

# Attach the stylesheet to the HTML of the map's root document.
popup_style_element = folium.Element(custom_css)
m.get_root().html.add_child(popup_style_element)

# Centered heading for the map, attached to the HTML of the map's root document.
title_html = '''
    <h3 style="text-align:center; font-family:Arial, sans-serif; font-size:18px; color:#333; margin-top:10px;">
        <b>Celebrity Map</b>
    </h3>
'''
title_element = folium.Element(title_html)
m.get_root().html.add_child(title_element)

# Extra map controls: the Fullscreen plugin and a MiniMap with its display toggle enabled.
for map_control in (Fullscreen(), MiniMap(toggle_display=True)):
    map_control.add_to(m)

# Cluster layer that the individual markers are added to.
marker_cluster = MarkerCluster()
marker_cluster.add_to(m)

# One red 'info-sign' marker per row, carrying that row's popup, placed in the cluster layer.
for _, article in df.iterrows():
    article_marker = folium.Marker(
        location=[article['lat'], article['lon']],
        popup=folium.Popup(create_popup_html(article), max_width=300),
        icon=folium.Icon(color='red', icon='info-sign'),
    )
    article_marker.add_to(marker_cluster)

# Render the finished map as the cell's output
m


In [35]:
# Write the map to a standalone HTML file in the working directory.
output_html_path = 'index.html'
m.save(output_html_path)

In [36]:
# Build the published URL of this map: the GitHub Pages base plus the name of the
# current working directory.
base_name = 'https://trd-digital.github.io/trd-news-interactive-maps/'

# os.path.basename handles the platform's path separator; splitting on '/' by hand
# returns the whole path on systems that use '\\'.
project_dir_name = os.path.basename(os.getcwd())

final_name = base_name + project_dir_name
print(final_name)

https://trd-digital.github.io/trd-news-interactive-maps/Celebrity_Data_Map_03_20_24
