## Importing Necessary Libraries and Dependencies
----

In [1]:
# Importing essential libraries
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns             ## New library

# Filter warnings
import warnings 
warnings.filterwarnings('ignore')



## Loading the Data
---

In [2]:
# Load the water dataset from CSV file
water_info_df = pd.read_csv('data/Cities1.csv')
water_info_df.head()

Unnamed: 0,City,Region,Country,AirQuality,WaterPollution
0,New York,New York,United States of America,46.816038,49.50495
1,Washington,District of Columbia,United States of America,66.129032,49.107143
2,San Francisco,California,United States of America,60.514019,43.0
3,Berlin,,Germany,62.36413,28.612717
4,Los Angeles,California,United States of America,36.621622,61.299435


In [3]:
# Load the cencus dataset from CSV file

cencus_info = pd.read_csv('data/us2021census.csv')
cencus_info.head()

Unnamed: 0,City,State,Type,Counties,Population,Latitude,Longitude
0,New York,NY,City,Bronx;Richmond;New York;Kings;Queens,8804190,40.714,-74.007
1,Los Angeles,CA,City,Los Angeles,3898747,34.052,-118.243
2,Chicago,IL,City,Cook;DuPage,2746388,41.882,-87.628
3,Houston,TX,City,Harris;Fort Bend;Montgomery,2304580,29.76,-95.363
4,Phoenix,AZ,City,Maricopa,1608139,33.448,-112.074


## Data Processing:
---

#### Before merging the data, the following adjustments were necessary::
1. Select Only the United States: Filter the data to include only entries from the United States.
2. Create a Dictionary of State Abbreviations: Develop a dictionary containing the descriptive names of states and their abbreviations.
3. Add a New Column for State Abbreviations: Create a new column in the `water dataframe` to insert state abbreviations. This step is crucial for accurately comparing cities and states across datasets and performing a more precise merge.
4. Create the normalize_city_name Function: Implement a function to normalize city names so that all cities are formatted consistently.
5. Analyze Unmatched Data: Investigate any data that did not match and check for potential KeyError issues.
6. Remove Null Values: Eliminate any null values from the datasets to ensure data integrity.
7. Remove Columns with Duplicate or Unused Data: Eliminate columns that contain duplicate information or data that will not be used in the project.

In [4]:
# Filter the data to include only entries from the United States
water_usa_df = water_info_df[(water_info_df['Country'] == 'United States of America')]
water_usa_df.head()

Unnamed: 0,City,Region,Country,AirQuality,WaterPollution
0,New York,New York,United States of America,46.816038,49.50495
1,Washington,District of Columbia,United States of America,66.129032,49.107143
2,San Francisco,California,United States of America,60.514019,43.0
4,Los Angeles,California,United States of America,36.621622,61.299435
12,Alexandria,Virginia,United States of America,89.0625,46.153846


In [5]:
# Create a Dictionary of State Abbreviation

state_abbreviations = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE',
    'District of Columbia': 'DC',	
    'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID',
    'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS',
    'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
    'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS',
    'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV',
    'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY',
    'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK',
    'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
    'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT',
    'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV',
    'Wisconsin': 'WI', 'Wyoming': 'WY'
}

water_usa_df['State'] = water_usa_df['Region'].map(state_abbreviations)

water_usa_df.head()

Unnamed: 0,City,Region,Country,AirQuality,WaterPollution,State
0,New York,New York,United States of America,46.816038,49.50495,NY
1,Washington,District of Columbia,United States of America,66.129032,49.107143,DC
2,San Francisco,California,United States of America,60.514019,43.0,CA
4,Los Angeles,California,United States of America,36.621622,61.299435,CA
12,Alexandria,Virginia,United States of America,89.0625,46.153846,VA


In [6]:
# Count rowns
print(water_usa_df['City'].count())


842


In [7]:
# Function to Standardize Data for Merging

def normalize_city_name(city):
    # Remove Unwanted Punctuation
    city = re.sub(r'[.,\']', '', city) 
    city = re.sub(r'[-]', ' ', city) 
    
# Remove Multiple White Spaces
   # city = re.sub(r'\s+', ' ', city).strip()
    
    return city

# Applying function
water_usa_df['NormalizedCity'] = water_usa_df['City'].apply(normalize_city_name)
cencus_info['NormalizedCity'] = cencus_info['City'].apply(normalize_city_name)

In [8]:
# Merging the data
merge_result = pd.merge(water_usa_df, cencus_info, on=['NormalizedCity', 'State'], how='left')
merge_result.head()

Unnamed: 0,City_x,Region,Country,AirQuality,WaterPollution,State,NormalizedCity,City_y,Type,Counties,Population,Latitude,Longitude
0,New York,New York,United States of America,46.816038,49.50495,NY,New York,New York,City,Bronx;Richmond;New York;Kings;Queens,8804190.0,40.714,-74.007
1,Washington,District of Columbia,United States of America,66.129032,49.107143,DC,Washington,Washington,City,District of Columbia,689545.0,38.895,-77.036
2,San Francisco,California,United States of America,60.514019,43.0,CA,San Francisco,San Francisco,City,San Francisco,873965.0,37.78,-122.414
3,Los Angeles,California,United States of America,36.621622,61.299435,CA,Los Angeles,Los Angeles,City,Los Angeles,3898747.0,34.052,-118.243
4,Alexandria,Virginia,United States of America,89.0625,46.153846,VA,Alexandria,Alexandria,City,Alexandria,159467.0,38.805,-77.047


In [9]:
# Analyzing Unmatched Data
missing_data_df = merge_result[merge_result['Population'].isna()]
missing_data_df.head(50)

Unnamed: 0,City_x,Region,Country,AirQuality,WaterPollution,State,NormalizedCity,City_y,Type,Counties,Population,Latitude,Longitude
38,Brooklyn,New York,United States of America,52.083333,33.653846,NY,Brooklyn,,,,,,
61,Saint Paul,Minnesota,United States of America,88.541667,30.0,MN,Saint Paul,,,,,,
233,Oyster Bay,New York,United States of America,100.0,25.0,NY,Oyster Bay,,,,,,
342,Columbia,Maryland,United States of America,100.0,25.0,MD,Columbia,,,,,,
347,Amargosa Valley,Nevada,United States of America,25.0,100.0,NV,Amargosa Valley,,,,,,
739,Smithtown,New York,United States of America,100.0,50.0,NY,Smithtown,,,,,,
742,Islip,New York,United States of America,75.0,25.0,NY,Islip,,,,,,
833,Fredericksburg,Indiana,United States of America,100.0,25.0,IN,Fredericksburg,,,,,,
841,Carlisle,New York,United States of America,100.0,50.0,NY,Carlisle,,,,,,
842,New Windsor,New York,United States of America,100.0,50.0,NY,New Windsor,,,,,,


In [10]:
cities_to_search = ['Brooklyn', 'Saint Paul	', 'Oyster Bay', 'Columbia', 'Amargosa Valley', 'Smithtown', 'Islip', 'Fredericksburg', 'Carlisle', 
                    'New Windsor', 'Taylor', 'Germantown', 'Dickinson', 'Lafayette', 'Arlington']
for city in cities_to_search:
    cencus_find = cencus_info[cencus_info['City'].str.contains(city)]
    print(cencus_find)


                   City State     Type    Counties  Population  Latitude  \
408       Brooklyn Park    MN     City    Hennepin       86478    45.094   
1265    Brooklyn Center    MN     City    Hennepin       33782    45.076   
3293           Brooklyn    OH     City    Cuyahoga       11359    41.435   
4045           Brooklyn    CT     Town     Windham        8450    41.788   
7889           Brooklyn    IN     Town      Morgan        2511    39.542   
9923           Brooklyn    WI  Village  Dane;Green        1524    42.853   
9943   Brooklyn Heights    OH  Village    Cuyahoga        1519    41.420   
9991           Brooklyn    IA     City   Poweshiek        1502    41.731   
10582          Brooklyn    MI  Village     Jackson        1313    42.106   
13726          Brooklyn    IL  Village   St. Clair         649    38.654   
19452     West Brooklyn    IL  Village         Lee         131    41.693   
19963  Brooklyn Heights    MO     Town      Jasper         101    37.169   
20360     Ea

In [11]:
# Drop rows with null data
merge_result_df = merge_result.dropna(subset=['City_y'])
merge_result_df.head()

Unnamed: 0,City_x,Region,Country,AirQuality,WaterPollution,State,NormalizedCity,City_y,Type,Counties,Population,Latitude,Longitude
0,New York,New York,United States of America,46.816038,49.50495,NY,New York,New York,City,Bronx;Richmond;New York;Kings;Queens,8804190.0,40.714,-74.007
1,Washington,District of Columbia,United States of America,66.129032,49.107143,DC,Washington,Washington,City,District of Columbia,689545.0,38.895,-77.036
2,San Francisco,California,United States of America,60.514019,43.0,CA,San Francisco,San Francisco,City,San Francisco,873965.0,37.78,-122.414
3,Los Angeles,California,United States of America,36.621622,61.299435,CA,Los Angeles,Los Angeles,City,Los Angeles,3898747.0,34.052,-118.243
4,Alexandria,Virginia,United States of America,89.0625,46.153846,VA,Alexandria,Alexandria,City,Alexandria,159467.0,38.805,-77.047


In [12]:
## Remove Unnecessary Columns, Reorganize Column Order, and Create the Final DataFrame

# Drop columns
merge_final_df = merge_result_df.drop(['City_x', 'City_y', 'Type', 'Counties'], axis=1)

# Rename column
merge_final_df = merge_final_df.rename(columns={'NormalizedCity': 'City'})

# Reorganize DataFrame
water_pollution_df = merge_final_df[['City', 'Region', 'State', 'Country', 'Latitude', 'Longitude', 'Population', 'WaterPollution', 'AirQuality']]

water_pollution_df.head()


Unnamed: 0,City,Region,State,Country,Latitude,Longitude,Population,WaterPollution,AirQuality
0,New York,New York,NY,United States of America,40.714,-74.007,8804190.0,49.50495,46.816038
1,Washington,District of Columbia,DC,United States of America,38.895,-77.036,689545.0,49.107143,66.129032
2,San Francisco,California,CA,United States of America,37.78,-122.414,873965.0,43.0,60.514019
3,Los Angeles,California,CA,United States of America,34.052,-118.243,3898747.0,61.299435,36.621622
4,Alexandria,Virginia,VA,United States of America,38.805,-77.047,159467.0,46.153846,89.0625


In [13]:
# Export the DataFrame as a CSV file. 
water_pollution_df.to_csv("data/waterPollution.csv", encoding='utf8', index=False)

In [14]:
# Export the DataFrame as a json file. 
water_pollution_df.to_json(
    path_or_buf= "data/waterPollution.json", orient='records', lines=True, index=False             
)