# Cleaning Raw Earthquake Data

In [1]:
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

# Connection settings are read from the environment so that no credentials
# are stored in the notebook itself.
POSTGRES_USER = os.environ.get("POSTGRES_USER")
POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD")
POSTGRES_DB = os.environ.get("POSTGRES_DB")
POSTGRES_HOST = os.environ.get("POSTGRES_HOST")
POSTGRES_PORT = os.environ.get("POSTGRES_PORT")

# Fail fast with a readable message: an unset variable would otherwise be
# interpolated into the URL as the literal text "None".
_required_settings = {
    "POSTGRES_USER": POSTGRES_USER,
    "POSTGRES_PASSWORD": POSTGRES_PASSWORD,
    "POSTGRES_DB": POSTGRES_DB,
    "POSTGRES_HOST": POSTGRES_HOST,
    "POSTGRES_PORT": POSTGRES_PORT,
}
_missing_settings = [name for name, value in _required_settings.items() if value is None]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

# Percent-encode the credentials so that characters such as "@", ":" or "/"
# in the user name or password cannot corrupt the URL.
conn = (
    f"postgresql://{quote_plus(POSTGRES_USER)}:{quote_plus(POSTGRES_PASSWORD)}"
    f"@{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}"
)
engine = create_engine(conn)

In [2]:
# Load the whole raw table into memory. The query runs through the shared
# `engine` (the same object used for the final write) instead of the URL
# string, so pandas does not have to build a second connectable of its own.
query = "select * from raw_data;"

df = pd.read_sql_query(query, engine)


In [3]:
# Drop the identifier and link columns, which none of the cleaning steps below use.
df = df.drop(columns=["code", "event_id", "url", "details"])


In [4]:
# Use the `id` column as the row index.
df = df.set_index("id")

In [5]:
# Preview the frame after dropping the unused columns and indexing by `id`.
df

Unnamed: 0_level_0,place,city,country,magnitude,latitude,longitude,depth,utc_time
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,"14 km NNE of Virginia City, Nevada","Virginia City, Nevada",USA,1.0,39.421900,-119.557600,10.80,2024-01-27 11:27:58
2,"45 km NW of Toyah, Texas","Toyah, Texas",USA,1.9,31.619000,-104.117000,6.08,2024-01-27 11:39:20
3,"54 km NNE of Kobuk, Alaska","Kobuk, Alaska",USA,1.5,67.322500,-156.238600,4.40,2024-01-27 11:44:12
4,"279 km WNW of Houma, Tonga",Houma,Tonga,4.2,-20.158800,-177.761800,486.33,2024-01-27 11:50:11
5,"4 km SSW of Salcha, Alaska","Salcha, Alaska",USA,2.4,64.482800,-146.941300,8.50,2024-01-27 11:50:33
...,...,...,...,...,...,...,...,...
198135,"8 km E of Alum Rock, California","Alum Rock, California",USA,1.0,37.369500,-121.737667,4.50,2024-07-25 21:22:16
198136,"12 km W of Chilliwack, Canada",Chilliwack,Canada,1.4,49.147500,-122.117500,-0.69,2024-07-25 19:00:02
198137,"10 km of Pittsburg, California","Pittsburg, California",USA,1.8,38.116500,-121.915833,26.10,2024-07-25 11:02:24
198138,"10 km SE of Sunol, California","Sunol, California",USA,1.1,37.567500,-121.784333,6.22,2024-07-25 06:57:13


## Remove "region" text from country names

In [6]:
# Remove the literal word "region" from the country names and trim the
# whitespace it leaves behind. `regex=False` pins plain-substring matching,
# because the default of `Series.str.replace` differs between pandas versions.
df["country"] = df["country"].str.replace("region", "", regex=False).str.strip()

# Sanity check: no country name should still contain "region".
df["country"].str.contains("region").sum()


0

In [7]:
# Country values that contain a lowercase compass word and the word "Islands"
# are reduced to their last two whitespace-separated words.
pattern = r'north|south|east|west'

# `na=False` keeps the mask strictly boolean even where the country is missing,
# instead of relying on how `&` coerces NaN.
mask = (
    df["country"].str.contains(pattern, na=False)
    & df["country"].str.contains("Islands", na=False)
)

# NOTE(review): keeping only the last two words would shorten any island name
# that has more than two words - confirm that no such value matches the mask.
df.loc[mask, "country"] = df.loc[mask, "country"].str.split().str[-2:].str.join(" ")


In [8]:
# Fill in "Fiji" where the place text mentions Fiji but the country is missing.
place_mentions_fiji = df["place"].str.contains("Fiji")
country_is_missing = df["country"].isna()
mask = place_mentions_fiji & country_is_missing

df.loc[mask, "country"] = "Fiji"

In [9]:
# Relabel as "Fiji" every row whose place mentions Fiji and whose country
# value contains "Islands".
place_mentions_fiji = df["place"].str.contains("Fiji")
country_has_islands = df["country"].str.contains("Islands")
mask = place_mentions_fiji & country_has_islands

df.loc[mask, "country"] = "Fiji"

# Number of rows relabelled by this cell.
mask.sum()

632

In [10]:
# Country values that contain both "Tonga" and the substring "of" become "Tonga".
country_text = df["country"]
mask = country_text.str.contains("Tonga") & country_text.str.contains("of")

df.loc[mask, "country"] = "Tonga"

# Number of rows relabelled by this cell.
mask.sum()

51

In [11]:
# Country values that mention one of these US states are relabelled "USA".
# The names are joined into one alternation so a single pass tests all three;
# `na=False` yields the same all-boolean mask as OR-ing three separate tests.
us_states = ("Alaska", "Texas", "California")
mask = df["country"].str.contains("|".join(us_states), na=False)

df.loc[mask, "country"] = "USA"

# Number of rows relabelled by this cell.
mask.sum()

3896

In [12]:
# Rows whose city mentions "Macquarie" and whose country text contains "of"
# are labelled "Macquarie Island".
city_matches = df["city"].str.contains("Macquarie")
country_has_of = df["country"].str.contains("of")
mask = city_matches & country_has_of

df.loc[mask, "country"] = "Macquarie Island"

# Number of rows relabelled by this cell.
mask.sum()

51

In [13]:
# Rows whose city mentions "Ascension" and whose country text contains "of"
# are labelled "Ascension Island".
city_matches = df["city"].str.contains("Ascension")
country_has_of = df["country"].str.contains("of")
mask = city_matches & country_has_of

df.loc[mask, "country"] = "Ascension Island"

# Number of rows relabelled by this cell.
mask.sum()

35

In [14]:
# Rows whose city mentions "America" and whose country text contains "of"
# are labelled "Central America".
city_matches = df["city"].str.contains("America")
country_has_of = df["country"].str.contains("of")
mask = city_matches & country_has_of

df.loc[mask, "country"] = "Central America"

# Number of rows relabelled by this cell.
mask.sum()

40

In [15]:
# Names to restore wherever the stored country value contains the name
# together with the substring "of".
to_change = [
    "Taiwan", "Venezuela", "Honduras", "Severnaya Zemlya", "Greenland",
    "Azerbaijan", "Guatemala", "Panama", "Svalbard", "Ecuador",
    "New Zealand", "Australia", "Chile", "Turkey", "Colombia",
    "Myanmar", "Oman", "Peru", "Japan", "Africa",
    "Nicaragua", "Franz Josef Land", "Syria", "Libya", "Easter Island",
]

for country in to_change:
    # Both tests are re-evaluated on every pass because earlier passes
    # rewrite the column.
    mentions_country = df["country"].str.contains(country)
    has_of = df["country"].str.contains("of")
    mask = mentions_country & has_of

    df.loc[mask, "country"] = country
    # Report how many rows were relabelled for this name.
    print(f"{country} - {mask.sum()}")

Taiwan - 3
Venezuela - 3
Honduras - 1
Severnaya Zemlya - 19
Greenland - 3
Azerbaijan - 1
Guatemala - 7
Panama - 18
Svalbard - 18
Ecuador - 13
New Zealand - 28
Australia - 6
Chile - 1
Turkey - 1
Colombia - 2
Myanmar - 1
Oman - 1
Peru - 12
Japan - 2
Africa - 34
Nicaragua - 8
Franz Josef Land - 2
Syria - 1
Libya - 2
Easter Island - 38


In [16]:
# Count the rows that still have no country after the relabelling above.
df["country"].isna().sum()



0

In [17]:
# Show the 25 most frequent country labels.
df["country"].value_counts().iloc[:25]

country
USA                         160555
Puerto Rico                   6379
Indonesia                     2933
Japan                         2176
Philippines                   1851
Fiji                          1533
Papua New Guinea              1259
Mexico                        1142
Tonga                         1126
Chile                         1084
Canada                         886
Vanuatu                        764
Russia                         753
Northern Mariana Islands       654
Turkey                         653
South Sandwich Islands         632
Kermadec Islands               582
China                          553
U.S. Virgin Islands            526
Argentina                      494
Taiwan                         492
Reykjanes Ridge                477
Peru                           463
New Zealand                    435
Solomon Islands                420
Name: count, dtype: int64

In [18]:
# Write the cleaned frame to its own table, replacing any existing table of
# that name.
new_table = "updated_data"

df.to_sql(name=new_table, con=engine, if_exists="replace")

139

In [19]:
# Dispose of the engine's connection pool now that the cleaned table has been written.
engine.dispose()