# String Operations

## Introduction

## Learning Objectives

## Creating and Manipulating Strings

### Creating Strings

In [None]:
# Different ways to create strings
location_name = "Mount Everest"  # Using double quotes
country = "Nepal"  # Using single quotes
description = """Mount Everest is the highest peak
in the world, located in the Himalayas."""  # Multi-line string

print(f"Location: {location_name}")
print(f"Country: {country}")
print(f"Description: {description}")

Location: Mount Everest
Country: Nepal
Description: Mount Everest is the highest peak
in the world, located in the Himalayas.


### String Concatenation

In [None]:
# Basic concatenation using the + operator
location_full = location_name + ", " + country
print(f"Full location: {location_full}")

# Building a file path
data_folder = "geographic_data"
filename = "mountain_peaks.csv"
file_path = data_folder + "/" + filename
print(f"File path: {file_path}")

Full location: Mount Everest, Nepal
File path: geographic_data/mountain_peaks.csv


### String Repetition

In [None]:
# Create visual separators
separator = "-" * 30
print(separator)
print("Geographic Data Report")
print(separator)

# Create formatted spacing
tab_space = " " * 4
print(f"Location:{tab_space}{location_name}")
print(f"Elevation:{tab_space}8,848 meters")

------------------------------
Geographic Data Report
------------------------------
Location:    Mount Everest
Elevation:    8,848 meters


### String Length and Basic Properties

In [None]:
# Get the length of a string
location_length = len(location_name)
print(f"The location name '{location_name}' has {location_length} characters")

# Check if a string contains only letters
city_name = "SanFrancisco"
print(f"Is '{city_name}' alphabetic? {city_name.isalpha()}")

# Check if a string contains only digits (useful for coordinate validation)
zip_code = "94102"
print(f"Is '{zip_code}' numeric? {zip_code.isdigit()}")

The location name 'Mount Everest' has 13 characters
Is 'SanFrancisco' alphabetic? True
Is '94102' numeric? True


### Building Dynamic Content

In [None]:
# Building dynamic location descriptions
latitude = 27.9881
longitude = 86.9250
elevation = 8848

location_info = (
    location_name
    + " is located at coordinates "
    + str(latitude)
    + ", "
    + str(longitude)
)
print(location_info)

# A more complex example - building a geographic summary
cities = ["Kathmandu", "Pokhara", "Lalitpur"]
summary = "Major cities in " + country + " include: " + ", ".join(cities)
print(summary)

Mount Everest is located at coordinates 27.9881, 86.925
Major cities in Nepal include: Kathmandu, Pokhara, Lalitpur


## String Methods for Geospatial Data

### Case Conversion Methods

In [None]:
# Case conversion examples
mountain_name = "Mount Everest"

# Convert to different cases
print(f"Original: {mountain_name}")
print(f"Uppercase: {mountain_name.upper()}")
print(f"Lowercase: {mountain_name.lower()}")
print(f"Title case: {mountain_name.title()}")
print(f"Capitalize: {mountain_name.capitalize()}")

Original: Mount Everest
Uppercase: MOUNT EVEREST
Lowercase: mount everest
Title case: Mount Everest
Capitalize: Mount everest


### Whitespace Removal Methods

In [None]:
# Whitespace removal examples
messy_location = "   San Francisco   "
messy_left = "   Los Angeles"
messy_right = "Chicago   "

print(f"Original: '{messy_location}'")
print(f"strip(): '{messy_location.strip()}'")  # Remove both sides
print(f"lstrip(): '{messy_left.lstrip()}'")  # Remove left side
print(f"rstrip(): '{messy_right.rstrip()}'")  # Remove right side

Original: '   San Francisco   '
strip(): 'San Francisco'
lstrip(): 'Los Angeles'
rstrip(): 'Chicago'


### String Replacement

In [None]:
# Basic replacement
location = "Mount Everest, Nepal"
updated_location = location.replace("Everest", "Kilimanjaro")
print(f"Original: {location}")
print(f"Updated: {updated_location}")

# Replace multiple occurrences
path_string = "data/raw_data/geographic_data/raw_data/points.csv"
clean_path = path_string.replace("raw_data/", "")
print(f"Original path: {path_string}")
print(f"Clean path: {clean_path}")

Original: Mount Everest, Nepal
Updated: Mount Kilimanjaro, Nepal
Original path: data/raw_data/geographic_data/raw_data/points.csv
Clean path: data/geographic_data/points.csv


### String Splitting

In [None]:
# Basic splitting
location_full = "Mount Everest, Nepal, Asia"
location_parts = location_full.split(", ")
print(f"Original: {location_full}")
print(f"Split into parts: {location_parts}")

# Extract individual components
mountain, country, continent = location_parts
print(f"Mountain: {mountain}")
print(f"Country: {country}")
print(f"Continent: {continent}")

Original: Mount Everest, Nepal, Asia
Split into parts: ['Mount Everest', 'Nepal', 'Asia']
Mountain: Mount Everest
Country: Nepal
Continent: Asia


In [None]:
# Splitting coordinate strings
coordinate_string = "40.7128,-74.0060"
lat_str, lon_str = coordinate_string.split(",")
latitude = float(lat_str)
longitude = float(lon_str)
print(f"Parsed coordinates: Lat={latitude}, Lon={longitude}")
latitude


Parsed coordinates: Lat=40.7128, Lon=-74.006


40.7128

In [None]:
# Splitting file paths
file_path = "data/geographic/cities/world_cities.csv"
path_components = file_path.split("/")
print(f"Path components: {path_components}")
print(f"Filename: {path_components[-1]}")  # Last component is the filename

Path components: ['data', 'geographic', 'cities', 'world_cities.csv']
Filename: world_cities.csv


### String Joining

In [None]:
# Basic joining
city_names = ["San Francisco", "New York", "Tokyo"]
city_name = ", ".join(city_names)
print(f"Joined city name: {city_name}")

Joined city name: San Francisco, New York, Tokyo


In [None]:
# Creating file paths
path_parts = ["data", "geographic", "elevation", "dem.tif"]
full_path = "/".join(path_parts)
print(f"Full path: {full_path}")

Full path: data/geographic/elevation/dem.tif


In [None]:
# Practical example: creating coordinate strings
coordinates = ["40.7128", "-74.0060"]
coordinate_string = ",".join(coordinates)
print(f"Coordinate string: {coordinate_string}")
coordinates

Coordinate string: 40.7128,-74.0060


['40.7128', '-74.0060']

## String Formatting

### F-String Formatting (Recommended)

In [None]:
# Basic f-string formatting with geographic data
location = "Mount Everest"
latitude = 27.9881
longitude = 86.9250
elevation = 8848

# Simple variable insertion
location_info = f"Location: {location}"
print(location_info)

# Multiple variables
coordinates = f"Coordinates: ({latitude}, {longitude})"
print(coordinates)

# Complete geographic summary
summary = f"{location} is located at {latitude}°N, {longitude}°E with an elevation of {elevation} meters"
print(summary)

Location: Mount Everest
Coordinates: (27.9881, 86.925)
Mount Everest is located at 27.9881°N, 86.925°E with an elevation of 8848 meters


### Formatting Numbers in Strings

In [None]:
# Controlling decimal places for coordinates
precise_lat = 40.712776
precise_lon = -74.005974

# Round to different decimal places
coords_2_places = f"Coordinates: ({precise_lat:.2f}, {precise_lon:.2f})"
coords_4_places = f"Coordinates: ({precise_lat:.4f}, {precise_lon:.4f})"

print(coords_2_places)
print(coords_4_places)

# Adding thousands separators for large numbers
population = 8336817
area_sqkm = 783.8

formatted_stats = f"NYC Population: {population:,} people, Area: {area_sqkm:.1f} km²"
print(formatted_stats)

Coordinates: (40.71, -74.01)
Coordinates: (40.7128, -74.0060)
NYC Population: 8,336,817 people, Area: 783.8 km²


### Legacy Formatting Methods

In [None]:
# Using .format() method
location = "San Francisco"
lat = 37.7749
lon = -122.4194

# Basic format method
formatted_1 = "Location: {} at coordinates ({}, {})".format(location, lat, lon)
print(formatted_1)

# With positional arguments
formatted_2 = "Location: {0} at coordinates ({1}, {2})".format(location, lat, lon)
print(formatted_2)

# With named arguments
formatted_3 = "Location: {name} at coordinates ({latitude}, {longitude})".format(
    name=location, latitude=lat, longitude=lon
)
print(formatted_3)

Location: San Francisco at coordinates (37.7749, -122.4194)
Location: San Francisco at coordinates (37.7749, -122.4194)
Location: San Francisco at coordinates (37.7749, -122.4194)


### Practical Formatting Examples

In [None]:
# Creating file names with timestamps and coordinates
import datetime

current_time = datetime.datetime.now()
survey_lat = 45.3311
survey_lon = -121.7113

filename = f"survey_{current_time.strftime('%Y%m%d')}_{survey_lat:.4f}N_{abs(survey_lon):.4f}W.csv"
print(f"Generated filename: {filename}")

# Creating Well-Known Text (WKT) representations
wkt_point = f"POINT({survey_lon} {survey_lat})"
print(f"WKT Point: {wkt_point}")

Generated filename: survey_20260202_45.3311N_121.7113W.csv
WKT Point: POINT(-121.7113 45.3311)


In [None]:
# Building SQL queries with formatting
table_name = "cities"
min_population = 1000000
region = "North America"

sql_query = f"""SELECT name, latitude, longitude
FROM {table_name}
WHERE population > {min_population:,}
AND region = '{region}'"""

print("Generated SQL Query:")
print(sql_query)

Generated SQL Query:
SELECT name, latitude, longitude
FROM cities
WHERE population > 1,000,000
AND region = 'North America'


## String Operation Decision Guide

### When to Use Each Operation

## Key Takeaways

## Exercises

### Exercise 1: Manipulating Geographic Location Strings

In [None]:
feature = "Amazon River"
country_name = "Brazil"

# Convert to lowercase and uppercase
feature_lower = feature.lower()
feature_upper = feature.upper()
print(f"Original: {feature}")
print(f"Lowercase: {feature_lower}")
print(f"Uppercase: {feature_upper}")

# Concatenate with country name
full_location = feature + ", " + country_name
print(f"Full location: {full_location}")

# Repeat the string three times with dashes
repeated_feature = (feature + " - ") * 2 + feature
print(f"Repeated feature: {repeated_feature}")

Original: Amazon River
Lowercase: amazon river
Uppercase: AMAZON RIVER
Full location: Amazon River, Brazil
Repeated feature: Amazon River - Amazon River - Amazon River


### Exercise 2: Extracting and Formatting Coordinates

In [None]:
#Given a string with the format "latitude, longitude" (e.g., "40.7128N, 74.0060W" ), extract the
#numeric values of latitude and longitude.
#Convert these values to floats and remove the directional indicators ( N , S , E , W ).
#Format the coordinates into a POINT WKT string (e.g., "POINT(-74.0060 40.7128)" ).

In [None]:
# Given string with directional indicators
coordinate_string_directional = "40.7128N, 74.0060W"

# Split into latitude and longitude parts
lat_str_directional, lon_str_directional = coordinate_string_directional.split(", ")

# Remove directional indicators and convert to float
# For latitude, remove 'N' or 'S'
latitude_numeric = float(lat_str_directional.replace('N', '').replace('S', ''))

# For longitude, remove 'E' or 'W'. If it's 'W', it should be negative.
# Assuming 'W' indicates negative longitude, and 'E' indicates positive.
longitude_numeric = float(lon_str_directional.replace('W', '').replace('E', ''))
# Apply negative sign if original had 'W'
if 'W' in lon_str_directional: # Check if 'W' was in the original string
    longitude_numeric = -abs(longitude_numeric)


print(f"Original string: {coordinate_string_directional}")
print(f"Extracted Latitude (float): {latitude_numeric}")
print(f"Extracted Longitude (float): {longitude_numeric}")

# Format into a POINT WKT string: "POINT(longitude latitude)"
wkt_point = f"POINT({longitude_numeric} {latitude_numeric})"
print(f"WKT Point String: {wkt_point}")

Original string: 40.7128N, 74.0060W
Extracted Latitude (float): 40.7128
Extracted Longitude (float): -74.006
WKT Point String: POINT(-74.006 40.7128)


### Exercise 3: Building Dynamic SQL Queries

In [None]:
#Given a table name and a condition, dynamically build an SQL query string.
#Example: If table_name = "cities" and condition = "population > 1000000" , the query
#should be "SELECT * FROM cities WHERE population > 1000000;" .
#Add additional conditions dynamically, like AND clauses.

In [None]:
table_name = "cities"
condition_1 = "population > 1000000"

# Build the base SQL query
sql_query_base = f"SELECT * FROM {table_name} WHERE {condition_1};"
print("Base SQL Query:")
print(sql_query_base)

# Add additional conditions dynamically
condition_2 = "region = 'Europe'"
condition_3 = "is_capital = TRUE"

# Using f-strings for dynamic query building with AND clauses
sql_query_multi_condition = f"SELECT * FROM {table_name} WHERE {condition_1} AND {condition_2} AND {condition_3};"
print("\nSQL Query with multiple conditions:")
print(sql_query_multi_condition)

# A more flexible way to build queries with an arbitrary number of conditions
def build_sql_query(table, conditions):
    if not conditions:
        return f"SELECT * FROM {table};"
    conditions_str = " AND ".join(conditions)
    return f"SELECT * FROM {table} WHERE {conditions_str};"

conditions_list = ["population > 500000", "country = 'USA'", "elevation_m < 500"]
sql_query_from_list = build_sql_query("landmarks", conditions_list)
print("\nSQL Query built from a list of conditions:")
print(sql_query_from_list)

sql_query_no_conditions = build_sql_query("countries", [])
print("\nSQL Query with no conditions:")
print(sql_query_no_conditions)

Base SQL Query:
SELECT * FROM cities WHERE population > 1000000;

SQL Query with multiple conditions:
SELECT * FROM cities WHERE population > 1000000 AND region = 'Europe' AND is_capital = TRUE;

SQL Query built from a list of conditions:
SELECT * FROM landmarks WHERE population > 500000 AND country = 'USA' AND elevation_m < 500;

SQL Query with no conditions:
SELECT * FROM countries;


### Exercise 4: String Normalization and Cleaning

In [None]:
#Given a list of city names with inconsistent formatting (e.g.,
#[" new york ", "Los ANGELES", " CHICAGO"] ), normalize the names by:
#‣ Stripping any leading or trailing whitespace.
#‣ Converting them to title case (e.g., "New York" , "Los Angeles" , "Chicago" ).
#• Ensure that the output is a clean list of city names.

In [4]:
city_names_raw = [" new york ", "Los ANGELES", " CHICAGO ", "london ", "san francisco"]

# Normalize the city names
def normalize_city_names(names_list):
    normalized_names = []
    for name in names_list:
        # Strip leading/trailing whitespace and convert to title case
        cleaned_name = name.strip().title()
        normalized_names.append(cleaned_name)
    return normalized_names

normalized_cities = normalize_city_names(city_names_raw)

print(f"Original city names: {city_names_raw}")
print(f"Normalized city names: {normalized_cities}")

Original city names: [' new york ', 'Los ANGELES', ' CHICAGO ', 'london ', 'san francisco']
Normalized city names: ['New York', 'Los Angeles', 'Chicago', 'London', 'San Francisco']


### Exercise 5: Parsing and Extracting Address Information

In [5]:
#Given a string in the format "Street, City, Country" (e.g.,"123 Main St, Springfield, USA" ), write a function that parses the string into a dictionary with
#keys street , city , and country .
#The function should return a dictionary like
#{"street": "123 Main St", "city": "Springfield", "country": "USA"} .

In [7]:
def parse_address_string(address_string):
    """
    Parses an address string in the format 'Street, City, Country'
    into a dictionary with keys 'street', 'city', and 'country'.
    """
    parts = [part.strip() for part in address_string.split(',')]
    if len(parts) == 3:
        return {"street": parts[0], "city": parts[1], "country": parts[2]}
    else:
        return None # Or raise an error for invalid format

# Example usage:
address = "123 Main St, Springfield, USA"
parsed_address = parse_address_string(address)
print(f"Original address string: '{address}'")
print(f"Parsed address dictionary: {parsed_address}")

address_2 = "456 Oak Ave, Metropolis, Canada"
parsed_address_2 = parse_address_string(address_2)
print(f"\nOriginal address string: '{address_2}'")
print(f"Parsed address dictionary: {parsed_address_2}")

# Example of an invalid format
address_invalid = "789 Pine Ln, Gotham"
parsed_address_invalid = parse_address_string(address_invalid)
print(f"\nOriginal address string (invalid): '{address_invalid}'")
print(f"Parsed address dictionary: {parsed_address_invalid}")

Original address string: '123 Main St, Springfield, USA'
Parsed address dictionary: {'street': '123 Main St', 'city': 'Springfield', 'country': 'USA'}

Original address string: '456 Oak Ave, Metropolis, Canada'
Parsed address dictionary: {'street': '456 Oak Ave', 'city': 'Metropolis', 'country': 'Canada'}

Original address string (invalid): '789 Pine Ln, Gotham'
Parsed address dictionary: None
