# Cleaning Raw Earthquake Data

In [231]:
import os
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine

# Connection settings are read from the environment so that no credentials are
# stored in the notebook. Indexing os.environ (rather than .get) raises a KeyError
# naming the missing variable instead of silently building a URL that contains
# the literal text "None".
POSTGRES_USER = os.environ["POSTGRES_USER"]
POSTGRES_PASSWORD = os.environ["POSTGRES_PASSWORD"]
POSTGRES_DB = os.environ["POSTGRES_DB"]
POSTGRES_HOST = os.environ["POSTGRES_HOST"]
POSTGRES_PORT = os.environ["POSTGRES_PORT"]

# Percent-encode the user name and password: characters such as "@", ":" or "/"
# would otherwise be read as URL delimiters and corrupt the connection string.
# `conn` stays a plain string because later cells may pass it around as one.
conn = (
    f"postgresql://{quote(POSTGRES_USER, safe='')}:{quote(POSTGRES_PASSWORD, safe='')}"
    f"@{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}"
)
engine = create_engine(conn)

In [232]:
# Pull the complete raw table into memory.
query = "select * from raw_data;"

# Read through the shared engine instead of the URL string: when given a string,
# pandas creates its own separate engine, so the engine built in the setup cell
# (and disposed in the last cell) would not be the one performing the read.
df = pd.read_sql_query(query, engine)

# Summary statistics of the numeric/datetime columns as a quick sanity check.
df.describe()

Unnamed: 0,id,magnitude,latitude,longitude,depth,utc_time
count,179623.0,179623.0,179623.0,179623.0,179623.0,179623
mean,89812.0,2.154777,39.229089,-107.433612,34.049668,2023-06-11 01:20:42.432082688
min,1.0,1.0,-65.8497,-179.9987,-3.74,2022-08-07 01:56:38
25%,44906.5,1.3,31.618324,-153.2038,5.51,2023-01-02 06:06:59.500000
50%,89812.0,1.8,39.156333,-137.634,10.0,2023-06-03 14:35:37
75%,134717.5,2.5,59.7907,-110.293167,35.0,2023-11-14 22:18:08
max,179623.0,7.8,86.5939,179.9994,681.24,2024-05-02 19:18:10
std,51852.838039,1.138359,23.113004,84.495356,65.746675,


## Remove the "region" text from country names

In [233]:
# Remove the literal text "region" from every country name and trim the
# whitespace left behind. regex=False makes the literal match explicit, since
# the default value of `regex` differs between pandas versions.
df["country"] = df["country"].str.replace("region", "", regex=False).str.strip()

# Sanity check: how many names still contain "region" (the cell output shows 0).
df["country"].str.contains("region").sum()


0

In [234]:
# Lower-case compass words; the match is case-sensitive, so capitalised words
# such as "South" or "Northern" at the start of a name do not match.
pattern = r'north|south|east|west'

# Select names that contain a lower-case compass word AND the word "Islands".
# na=False keeps the mask strictly boolean even when a country value is missing,
# instead of relying on `&` to coerce NaN away.
mask = df["country"].str.contains(pattern, na=False)
mask &= df["country"].str.contains("Islands", na=False)

# Keep only the last two whitespace-separated words of each selected name
# (presumably the island group's own name - verify against the data).
df.loc[mask, "country"] = df.loc[mask, "country"].str.split().str[-2:].str.join(" ")


In [235]:
# Rows without a country whose place text mentions "Fiji" are assigned to Fiji.
mentions_fiji = df["place"].str.contains("Fiji")
country_missing = df["country"].isnull()
df.loc[mentions_fiji & country_missing, "country"] = "Fiji"

In [236]:
# Rows whose place mentions "Fiji" and whose country contains "Islands"
# are collapsed to the single label "Fiji".
place_mentions_fiji = df["place"].str.contains("Fiji")
country_has_islands = df["country"].str.contains("Islands")
fiji_rows = place_mentions_fiji & country_has_islands

df.loc[fiji_rows, "country"] = "Fiji"

# Number of rows selected by the mask.
fiji_rows.sum()

608

In [237]:
# Country values that contain both "Tonga" and the substring "of"
# are replaced by the plain label "Tonga".
country_col = df["country"]
tonga_rows = country_col.str.contains("Tonga") & country_col.str.contains("of")

df.loc[tonga_rows, "country"] = "Tonga"

# Number of rows selected by the mask.
tonga_rows.sum()

51

In [238]:
# Any country value containing one of these US state names is relabelled "USA".
# NOTE(review): this is a substring match, so any value that merely contains
# "California" (e.g. a "Baja California" or "Gulf of California" entry, if the
# data has one) would also become "USA" - confirm against the raw values.
country_col = df["country"]
us_state_rows = (
    country_col.str.contains("Alaska")
    | country_col.str.contains("Texas")
    | country_col.str.contains("California")
)

df.loc[us_state_rows, "country"] = "USA"

# Number of rows selected by the mask.
us_state_rows.sum()

3894

In [239]:
# Rows whose city mentions "Macquarie" and whose country contains the
# substring "of" are labelled "Macquarie Island".
city_mentions_macquarie = df["city"].str.contains("Macquarie")
country_has_of = df["country"].str.contains("of")
macquarie_rows = city_mentions_macquarie & country_has_of

df.loc[macquarie_rows, "country"] = "Macquarie Island"

# Number of rows selected by the mask.
macquarie_rows.sum()

51

In [240]:
# Rows whose city mentions "Ascension" and whose country contains the
# substring "of" are labelled "Ascension Island".
city_mentions_ascension = df["city"].str.contains("Ascension")
country_has_of = df["country"].str.contains("of")
ascension_rows = city_mentions_ascension & country_has_of

df.loc[ascension_rows, "country"] = "Ascension Island"

# Number of rows selected by the mask.
ascension_rows.sum()

35

In [241]:
# Rows whose city mentions "America" and whose country contains the
# substring "of" are labelled "Central America".
city_mentions_america = df["city"].str.contains("America")
country_has_of = df["country"].str.contains("of")
central_america_rows = city_mentions_america & country_has_of

df.loc[central_america_rows, "country"] = "Central America"

# Number of rows selected by the mask.
central_america_rows.sum()

40

In [242]:
# Names to collapse to: any country value that contains one of these names
# together with the substring "of" is replaced by the bare name.
to_change = [
    "Taiwan",
    "Venezuela",
    "Honduras",
    "Severnaya Zemlya",
    "Greenland",
    "Azerbaijan",
    "Guatemala",
    "Panama",
    "Svalbard",
    "Ecuador",
    "New Zealand",
    "Australia",
    "Chile",
    "Turkey",
    "Colombia",
    "Myanmar",
    "Oman",
    "Peru",
    "Japan",
    "Africa",
    "Nicaragua",
    "Franz Josef Land",
    "Syria",
    "Libya",
    "Easter Island",
]

for country in to_change:
    # Both conditions are re-evaluated on every pass because earlier
    # iterations have already rewritten some country values.
    names_country = df["country"].str.contains(country)
    contains_of = df["country"].str.contains("of")
    mask = names_country & contains_of

    df.loc[mask, "country"] = country
    # Report how many rows were rewritten for this name.
    print(f"{country} - {mask.sum()}")

Taiwan - 3
Venezuela - 3
Honduras - 1
Severnaya Zemlya - 19
Greenland - 3
Azerbaijan - 1
Guatemala - 7
Panama - 18
Svalbard - 18
Ecuador - 13
New Zealand - 28
Australia - 6
Chile - 1
Turkey - 1
Colombia - 2
Myanmar - 1
Oman - 1
Peru - 12
Japan - 2
Africa - 34
Nicaragua - 8
Franz Josef Land - 2
Syria - 1
Libya - 2
Easter Island - 38


In [243]:
# Count the rows that still have no country after the clean-up steps.
df["country"].isna().sum()



0

In [244]:
# Frequency of each cleaned country label; show the 25 most common.
country_counts = df["country"].value_counts()
country_counts.head(25)

country
USA                         144510
Puerto Rico                   5921
Indonesia                     2767
Japan                         2044
Philippines                   1784
Fiji                          1469
Papua New Guinea              1181
Mexico                        1093
Tonga                         1070
Chile                          972
Canada                         808
Vanuatu                        716
Russia                         696
Northern Mariana Islands       642
Turkey                         638
South Sandwich Islands         607
Kermadec Islands               567
China                          517
U.S. Virgin Islands            495
Reykjanes Ridge                476
Argentina                      450
Taiwan                         448
New Zealand                    412
Solomon Islands                407
Peru                           403
Name: count, dtype: int64

In [245]:
# Destination table for the cleaned data; an existing table of the same name
# is dropped and recreated (if_exists="replace").
new_table = "updated_data"

# NOTE(review): with the default index=True, to_sql also writes the DataFrame
# index as an extra column. The frame already carries an `id` column, so
# consider index=False if nothing downstream reads that index column.
df.to_sql(name=new_table, con=engine, if_exists="replace")

623

In [246]:
# Release the engine's pooled database connections now that the write is done.
engine.dispose()