In [1]:
import html
import json
import os
import re

import folium
import googlemaps
import numpy as np
import pandas as pd

In [2]:
# 1. Load the raw export into memory
with open('CelebrityData.txt', 'r', encoding='utf-8') as f:
    text = f.read()


def _parse_entry(entry_text):
    """
    Turn one raw entry into a dict of its fields.

    Lines before the first line that starts with 'Content:' are read as
    'key: value' metadata, split on the first ': ' (lines without ': ' are
    ignored). Whatever follows 'Content:' on that line, plus every later
    line, is joined with newlines and stored under 'Content'.
    """
    fields = {}
    body = []
    body_started = False

    for raw_line in entry_text.splitlines():
        if body_started:
            body.append(raw_line)
            continue
        if raw_line.startswith('Content:'):
            body_started = True
            # text sharing the line with the 'Content:' label
            remainder = raw_line[len('Content:'):].strip()
            if remainder:
                body.append(remainder)
            continue
        if ': ' in raw_line:
            key, _, val = raw_line.partition(': ')
            fields[key] = val

    fields['Content'] = "\n".join(body).strip()
    return fields


# 2. Split on the end-of-entry marker and trim each chunk
raw_entries = text.split('-----------------END OF ENTRY---------------------------')
stripped_entries = (chunk.strip() for chunk in raw_entries)

# 3. Parse every non-empty chunk into a record
records = [_parse_entry(chunk) for chunk in stripped_entries if chunk]

# 4. Build the DataFrame
df = pd.DataFrame(records)

# 6a. Published Date: parse the text with one fixed, explicit format
published_raw = df['Published Date']
df['Published Date'] = pd.to_datetime(published_raw, format='%b %d, %Y, %I:%M %p')

# 6b. Sale Price (int): keep only digits and decimal points, then convert.
#     Values that still cannot be parsed become NaN.
price_text = df['Sale Price (int)'].astype(str)
price_digits = price_text.str.replace(r'[^0-9.]', '', regex=True)
df['Sale Price (int)'] = pd.to_numeric(price_digits, errors='coerce')

# Authors clean-up, applied step by step to a working copy of the column.
authors = df['Authors']

# 1. Exactly one space between a "By"/"by" label and the capitalised word that
#    follows it ("ByKate" / "By   Kate" -> "By Kate"). The look-ahead for a
#    capital letter leaves names that merely start with those letters alone;
#    the previous patterns (r'\bBy\s*', r'\bby\s*') turned "Byron" into
#    "By ron".
authors = authors.str.replace(r'\b([Bb]y)\s*(?=[A-Z])', r'\1 ', regex=True)

# 2. Make sure "Research" is preceded by whitespace
authors = authors.str.replace(r'(?<!\s)(Research\b)', r' \1', regex=True)

# 3. Insert a space wherever a lowercase letter is directly followed by an
#    uppercase one (words glued together).
#    NOTE(review): this also splits mixed-case surnames such as "McDonald"
#    ("Mc Donald"); left as-is pending confirmation of the intended scope.
authors = authors.str.replace(r'(?<=[a-z])(?=[A-Z])', ' ', regex=True)

# 4. Collapse whitespace runs into a single space and trim both ends
authors = authors.str.replace(r'\s+', ' ', regex=True).str.strip()

df['Authors'] = authors


# Remove bracketed annotations ("[...]") from the address, then trim the ends
address_no_brackets = df['Subject Address'].str.replace(r'\[.*?\]', '', regex=True)
df['Subject Address'] = address_no_brackets.str.strip()


# Quick structural check. DataFrame.info() writes its report itself and
# returns None, so it is called directly: wrapping it in print() emitted a
# stray "None" line after the report.
df.info()
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   URL                133 non-null    object        
 1   Title              133 non-null    object        
 2   Subhead            133 non-null    object        
 3   Authors            133 non-null    object        
 4   Published Date     133 non-null    datetime64[ns]
 5   Market             133 non-null    object        
 6   Geographic Market  133 non-null    object        
 7   Subject Address    133 non-null    object        
 8   Sale Price (text)  133 non-null    object        
 9   Sale Price (int)   118 non-null    float64       
 10  Content            134 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(9)
memory usage: 11.6+ KB
None
                                                 URL  \
0  https://therealdeal.com/miami/2025/01/28/sea

In [3]:
# Keep only rows that have at least 3 non-null values; rows with fewer
# (e.g. an entry that parsed to nothing but 'Content') are dropped.
df = df.dropna(thresh=3)

In [4]:
# Column check after dropping sparse rows.
df.columns

Index(['URL', 'Title', 'Subhead', 'Authors', 'Published Date', 'Market',
       'Geographic Market', 'Subject Address', 'Sale Price (text)',
       'Sale Price (int)', 'Content'],
      dtype='object')

In [5]:
# Rows whose address or price field holds explanatory text instead of a real
# value are dropped. Markers are matched literally (regex=False): the
# trailing "..." in the original patterns was being interpreted as three
# regex wildcards, so the plain prefix is used instead.
# NOTE(review): the long "The sale price of the main property ..." marker is
# checked against 'Subject Address', as in the original; confirm that is the
# intended column.
ADDRESS_REJECT_MARKERS = [
    'The article does not',
    'The address is not',
    'The article mentions two properties related',
    "The article doesn't provide a specific address,",
    "Malibu trophy estate",
    'The sale price of the main property discussed in the article is close to the last asking price of',
    'The article mentions several properties but does not provide specific addresses',
]
PRICE_REJECT_MARKERS = [
    'The article does not',
    'The sale price is not explicitly stated in the',
]


def _contains_any(column, markers):
    """Return a boolean Series, True where `column` contains any marker literally.

    Missing values count as "no match" (na=False) so the mask is always
    boolean and can be negated safely.
    """
    hits = pd.Series(False, index=column.index)
    for marker in markers:
        hits |= column.str.contains(marker, regex=False, na=False)
    return hits


reject = (
    _contains_any(df['Subject Address'], ADDRESS_REJECT_MARKERS)
    | _contains_any(df['Sale Price (text)'], PRICE_REJECT_MARKERS)
)
# .copy() so later column assignments act on an independent frame
df = df[~reject].copy()

In [6]:
# Column check after filtering out rows with unusable address/price text.
df.columns

Index(['URL', 'Title', 'Subhead', 'Authors', 'Published Date', 'Market',
       'Geographic Market', 'Subject Address', 'Sale Price (text)',
       'Sale Price (int)', 'Content'],
      dtype='object')

In [7]:
# Save the cleaned table before geocoding. to_csv is called with its
# defaults, so the DataFrame index is written as the first CSV column.
df.to_csv("Celebrity_Data_Without_Coords_cleaned.csv")

In [8]:
# Restore the previously stored Google Maps API key with IPython's %store
# magic, then build the client used for geocoding. Despite its name,
# `gmaps_key` holds the googlemaps.Client object, not the key string.
%store -r google_maps_API_Key
gmaps_key = googlemaps.Client(key=google_maps_API_Key)


In [9]:
# Define the geocode function
def geocode(add):
    """
    Look up an address with the module-level Google Maps client.

    Parameters
    ----------
    add : str
        Address text to geocode. Anything that is not a non-blank string
        (None, NaN, '', whitespace) is treated as "no address".

    Returns
    -------
    tuple or None
        ``(lat, lng)`` taken from the first result returned by the client,
        or ``None`` when there is no usable address or no result.
    """
    # Missing or blank cells cannot be geocoded; skip the API call entirely
    if not isinstance(add, str) or not add.strip():
        return None

    results = gmaps_key.geocode(add)
    if not results:
        return None

    location = results[0]["geometry"]["location"]
    return (location["lat"], location["lng"])

# Geocode every 'Subject Address' value and store the result in a new
# 'geocoded' column.
df['geocoded'] = df['Subject Address'].apply(geocode)

In [10]:
# Count rows with a missing geocode result (True) versus a present one (False).
df['geocoded'].isna().value_counts()

geocoded
False    127
True       1
Name: count, dtype: int64

In [11]:
def _split_lat_lon(text):
    """Split the text form of a geocode result into its two parts.

    'None' yields (None, None); otherwise the surrounding parentheses are
    stripped and the text is split once on ', '.
    """
    if text == 'None':
        return (None, None)
    return text.strip('()').split(', ', 1)


# Work from the string form of each geocode result
df['geocoded'] = df['geocoded'].astype(str)

# One column per part, assigned to 'lat' / 'lon' in that order
coord_parts = df['geocoded'].apply(_split_lat_lon).apply(pd.Series)
df[['lat', 'lon']] = coord_parts

# Both parts are still text (or None) at this point; make them floats
for coord_col in ('lat', 'lon'):
    df[coord_col] = df[coord_col].astype(float)

In [12]:
# Drop rows that have no latitude.
df = df.dropna(subset='lat')

In [13]:
# Column check: 'geocoded', 'lat' and 'lon' should now be present.
df.columns

Index(['URL', 'Title', 'Subhead', 'Authors', 'Published Date', 'Market',
       'Geographic Market', 'Subject Address', 'Sale Price (text)',
       'Sale Price (int)', 'Content', 'geocoded', 'lat', 'lon'],
      dtype='object')

In [14]:
# Display the table with coordinates attached.
df

Unnamed: 0,URL,Title,Subhead,Authors,Published Date,Market,Geographic Market,Subject Address,Sale Price (text),Sale Price (int),Content,geocoded,lat,lon
0,https://therealdeal.com/miami/2025/01/28/sean-...,Shopping spree? Sean Hannity drops $15M on oce...,It’s also adjacent to unit Fox News host bough...,By Kate Hinsche,2025-01-28 17:00:00,Celebrity Real Estate,miami,"10 Sloans Curve Drive, Palm Beach, FL",$14.9 million,14900000.0,Fox News star Sean Hannity bought an oceanfron...,"(26.6459222, -80.0379056)",26.645922,-80.037906
1,https://therealdeal.com/new-york/tristate/2024...,Billy Joel bags $11M East Hampton equestrian e...,Crooner purchased Hamptons home for $11M,By TRD Staff,2024-05-20 13:45:00,Celebrity Real Estate,new-york,"143 Town Lane, East Hampton., NY",$10.7 million,10700000.0,Billy Joel rounded up another equestrian estat...,"(40.9769341, -72.16425690000001)",40.976934,-72.164257
2,https://therealdeal.com/la/2025/02/07/keeping-...,Kris Jenner asks $14M for Kardashian estate,Family used the Hidden Hills home to film “Kee...,By Kari Hamanaka,2025-02-07 15:11:00,Celebrity Real Estate,la,"25115 Eldorado Meadow Road, Calabasas, CA",$13.5 million,13500000.0,Kris Jenner has stuck the for-sale sign up on ...,"(34.1750198, -118.6646366)",34.175020,-118.664637
3,https://therealdeal.com/new-york/2024/05/10/bu...,Glossier founder flips Greenwich Village townh...,Beauty mogul asked $19M for 118 W 12th Street ...,By Sheridan Wall,2024-05-10 12:04:00,Celebrity Real Estate,new-york,"118 West 12th Street, NY",The sale price of the main property discussed ...,18000000.0,A beauty mogul found a buyer for her Greenwich...,"(40.7361296, -73.99868730000001)",40.736130,-73.998687
4,https://therealdeal.com/miami/2024/10/15/david...,Price revealed: David and Victoria Beckham pay...,Deal marks highest price for North Bay Road,By Katherine Kallergis,2024-10-15 14:30:00,Celebrity Real Estate,miami,"4736 North Bay Road, Miami Beach, FL",$72.3 million,72300000.0,David and Victoria Beckham paid $72.3 million ...,"(25.8245989, -80.1359051)",25.824599,-80.135905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,https://therealdeal.com/miami/2024/03/12/nba-s...,NBA star Dion Waiters sells Pinecrest estate,"He listed the 13,200 sf mansion for $18M last ...",By Kate Hinsche,2024-03-12 16:49:00,Celebrity Real Estate,miami,"5745 Southwest 94th Street, Pinecrest, FL",$11.1 million,11100000.0,Basketball star Dion Waiters sold his Pinecres...,"(25.6838897, -80.2857564)",25.683890,-80.285756
129,https://therealdeal.com/new-york/2025/02/10/go...,Former Google CEO’s Noho penthouse snags signe...,PHW at 25 Bond Street last asked $40M,By Sheridan Wall,2025-02-10 12:47:00,Celebrity Real Estate,new-york,"25 Bond Street, NY",The sale price of the main property discussed ...,40000000.0,Former Google CEO Eric Schmidt found a buyer f...,"(40.7261542, -73.99381520000001)",40.726154,-73.993815
130,https://therealdeal.com/new-york/tristate/2024...,Alec Baldwin stars in sales pitch for Amaganse...,"Ten-acre estate back on market, asking $19M af...",By TRD Staff,2024-01-16 13:00:00,Celebrity Real Estate,new-york,"335 Town Lane, Amagansett, Hamptons, NY",$19 million,19000000.0,After more than a year on the market and a\nse...,"(40.9865449, -72.145163)",40.986545,-72.145163
131,https://therealdeal.com/new-york/2024/07/08/ow...,Jardim penthouse seen in “Owning Manhattan” so...,West Chelsea condo formerly rented by rapper B...,By Sheridan Wall,2024-07-08 13:09:00,Celebrity Real Estate,new-york,"527 West 27th Street, NY",$15 million,15000000.0,A Chelsea penthouse that made headlines for a ...,"(40.7510547, -74.00350929999999)",40.751055,-74.003509


In [21]:
# Number of rows in the table at this point.
len(df)

127

In [22]:
# Save the table with coordinates. to_csv is called with its defaults, so the
# DataFrame index is written as the first CSV column.
df.to_csv("celebrity_data_cleaned.csv")

In [16]:
import pandas as pd
import folium
from folium.plugins import MarkerCluster, Fullscreen, MiniMap

import re

def create_popup_html(row: pd.Series) -> str:
    """
    Build the HTML popup for one DataFrame row, including the first sentence
    of the article.

    Every value taken from the row is HTML-escaped before it is placed in the
    markup, so characters such as ``&``, ``<`` or ``"`` in the source text
    cannot break the popup or inject markup.

    Parameters
    ----------
    row : pd.Series
        Row (or any object with a dict-style ``.get``) providing 'Title',
        'Subhead', 'Authors', 'Published Date', 'Subject Address',
        'Sale Price (text)', 'URL' and 'Content'. Keys that are absent, or
        whose value is None/NaN/NaT, fall back to a placeholder.

    Returns
    -------
    str
        HTML fragment wrapped in ``<div class="popup-content">``.
    """
    def _is_missing(value) -> bool:
        # None and pandas/NumPy missing scalars count as missing; strings never do
        if value is None:
            return True
        if isinstance(value, str):
            return False
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            # non-scalar values (lists, arrays) are not treated as missing
            return False

    def _field(key: str, default: str, quote: bool = False) -> str:
        # Fetch a field, substitute the placeholder when missing, and escape it
        value = row.get(key, default)
        if _is_missing(value):
            value = default
        return html.escape(str(value), quote=quote)

    title            = _field('Title', 'No Title')
    subhead          = _field('Subhead', 'N/A')
    authors          = _field('Authors', 'Unknown')
    pub_date         = _field('Published Date', 'N/A')
    subject_address  = _field('Subject Address', 'N/A')
    sale_price_text  = _field('Sale Price (text)', 'N/A')
    # The URL lands inside an href="..." attribute, so quotes are escaped too
    url              = _field('URL', '#', quote=True)

    # --- extract first sentence from Content ---
    full_content = row.get('Content', '')
    full_content = '' if _is_missing(full_content) else str(full_content)
    # Collapse line breaks and runs of whitespace first: '.' does not match a
    # newline, so a sentence that wraps onto a second line would otherwise
    # never match and be cut off at the line break.
    flattened = ' '.join(full_content.split())
    # grab everything up through the first terminal punctuation mark
    sentence_match = re.match(r'(.+?[.!?])(\s|$)', flattened)
    if sentence_match:
        first_sentence = sentence_match.group(1)
    else:
        # no sentence terminator at all: fall back to the first line
        first_sentence = full_content.split('\n', 1)[0]
    first_sentence = html.escape(first_sentence, quote=False)
    # -------------------------------------------

    popup_markup = f"""
    <div class="popup-content">
        <h4 style="margin-bottom:5px;">{title}</h4>
        <p><strong>{subhead}</strong></p>
        <p><em>{authors}</em> | <strong>Published:</strong> {pub_date}</p>
        <p>{subject_address}</p>
        <p><strong>Sale Price:</strong> {sale_price_text}</p>
        <p>{first_sentence}...</p>
        <p><a href="{url}" target="_blank">Read more →</a></p>
    </div>
    """
    return popup_markup


# Assuming df is already defined and contains your data
# NOTE(review): first_row / map_center are computed here, but the Map below is
# created with a fixed location and map_center is not referenced again in
# this cell.
first_row = df.iloc[0]
map_center = [first_row['lat'], first_row['lon']]
# Center map on the geographic center of the contiguous US
m = folium.Map(location=[39.8283, -98.5795], zoom_start=4, scrollWheelZoom=False)

# Add a custom Mapbox tile layer (replace with your actual Mapbox access token)
# NOTE(review): the access token is hard-coded in the tile URL; consider
# loading it from configuration instead of keeping it in the notebook.
# NOTE(review): the layer is added with overlay=True, show=False and
# control=False, and no layer control is added in this cell — presumably it
# is never displayed; confirm this is intended.
folium.TileLayer(
    tiles='https://api.mapbox.com/styles/v1/mapbox/streets-v11/tiles/256/{z}/{x}/{y}@2x?access_token=pk.eyJ1IjoidHJkZGF0YSIsImEiOiJjamc2bTc2YmUxY2F3MnZxZGh2amR2MTY5In0.QlOWqB-yQNrNlXD0KQ9IvQ',
    attr='Mapbox',
    name='Streets',
    overlay=True,
    control=False,
    show=False,
    min_zoom=1,
    max_zoom=20
).add_to(m)

# Add custom CSS to style the popups
# (.popup-content is the class of the wrapper <div> emitted by create_popup_html)
custom_css = """
<style>
    .popup-content {
        min-width: 300px;
        font-size: 14px;
        line-height: 1.4;
        color: #333;
        white-space: normal;
        word-wrap: break-word;
    }
    .leaflet-popup, .leaflet-popup-content-wrapper {
        background-color: #f9f9f9;
        border: 1px solid #bbb;
        border-radius: 5px;
        padding: 8px;
        box-shadow: 0 2px 6px rgba(0,0,0,0.1);
    }
    .leaflet-popup-tip {
        background: #f9f9f9;
    }
</style>
"""
m.get_root().html.add_child(folium.Element(custom_css))

# Add a title to the map
title_html = '''
    <h3 style="text-align:center; font-family:Arial, sans-serif; font-size:18px; color:#333; margin-top:10px;">
        <b>Celebrity Map</b>
    </h3>
'''
m.get_root().html.add_child(folium.Element(title_html))

# Add additional map controls
Fullscreen().add_to(m)
MiniMap(toggle_display=True).add_to(m)

# Create a marker cluster
marker_cluster = MarkerCluster().add_to(m)

# Loop through the DataFrame to add markers
# One red marker per row, placed at the row's lat/lon and added to the
# cluster rather than directly to the map.
for idx, row in df.iterrows():
    lat = row['lat']
    lon = row['lon']
    popup_html = create_popup_html(row)
    
    folium.Marker(
        location=[lat, lon],
        popup=folium.Popup(popup_html, max_width=300),
        icon=folium.Icon(color='red', icon='info-sign')
    ).add_to(marker_cluster)

# Display the map
m


In [19]:
# Write the map to index.html in the current working directory.
m.save('index.html')

In [20]:
# Build the map's URL: the base URL plus the name of the folder the notebook
# is running in.
base_name = 'https://trd-digital.github.io/trd-news-interactive-maps/'

# os.path.basename follows the platform's path separator; splitting on '/'
# only works for POSIX-style paths. `cwd` also stays a plain string instead
# of being rebound to a list of path parts.
cwd = os.getcwd()

final_name = base_name + os.path.basename(cwd)
print(final_name)

https://trd-digital.github.io/trd-news-interactive-maps/Celebrity_Data_Map_05_5_25


## Create GeoJSON

In [24]:
# Remove the 'geocoded' column; 'lat' and 'lon' hold the coordinates.
# Re-running this cell raises a KeyError once the column is gone.
df = df.drop(columns='geocoded')

In [35]:
# Lowest numeric sale price (NaN values are skipped by default).
df['Sale Price (int)'].min()

1700000.0

Min: 1_700_000.0
Max: 100_000_000.0

In [36]:
# Median numeric sale price (NaN values are skipped by default).
df['Sale Price (int)'].median()

14500000.0

In [37]:
# Mean numeric sale price (NaN values are skipped by default).
df['Sale Price (int)'].mean()

20632086.95652174

In [30]:
import geopandas as gpd
from shapely.geometry import Point

# 1. Build the point geometries straight from the coordinate columns
#    (x = longitude, y = latitude) instead of looping over rows
geometry = gpd.points_from_xy(df['lon'], df['lat'])

# 2. Build a GeoDataFrame on a copy of df, so the date-to-text conversion
#    below is confined to gdf
gdf = gpd.GeoDataFrame(df.copy(), geometry=geometry, crs="EPSG:4326")

# Convert the datetime to ISO8601 text
gdf['Published Date'] = gdf['Published Date'].dt.strftime('%Y-%m-%dT%H:%M:%S')

# 3. (Optional) drop the raw lat/lon columns if you don’t need them anymore
#    gdf = gdf.drop(columns=['lat', 'lon'])

# 4. Write to a GeoJSON file
gdf.to_file("celebrity_sales.geojson", driver="GeoJSON")

# 5. Or, if you just want the GeoJSON as a Python string:
geojson_str = gdf.to_json()
