In [182]:
import pandas as pd

df = pd.read_csv('earthquake-data.csv')
df

Unnamed: 0,tgl,ot,lat,lon,depth,mag,remark,strike1,dip1,rake1,strike2,dip2,rake2
0,2008/11/01,21:02:43.058,-9.18,119.06,10,4.9,Sumba Region - Indonesia,,,,,,
1,2008/11/01,20:58:50.248,-6.55,129.64,10,4.6,Banda Sea,,,,,,
2,2008/11/01,17:43:12.941,-7.01,106.63,121,3.7,Java - Indonesia,,,,,,
3,2008/11/01,16:24:14.755,-3.30,127.85,10,3.2,Seram - Indonesia,,,,,,
4,2008/11/01,16:20:37.327,-6.41,129.54,70,4.3,Banda Sea,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
92882,2023/01/26,02:25:09.288,3.24,127.18,10,4.0,Talaud Islands - Indonesia,,,,,,
92883,2023/01/26,02:15:03.893,2.70,127.10,10,3.9,Northern Molucca Sea,,,,,,
92884,2023/01/26,01:57:08.885,-7.83,121.07,10,3.8,Flores Sea,,,,,,
92885,2023/01/26,01:46:21.009,3.00,127.16,10,4.1,Northern Molucca Sea,,,,,,


In [183]:
translations = {
    'tgl': 'Date',
    'ot' : 'Origin Time',
    'lat' : 'Latitude',
    'lon' : 'Longitude',
    'depth' : 'Depth',
    'mag' : 'Magnitude',
    'remark' : 'Location',
}
df.rename(columns = translations, inplace = True)
df

Unnamed: 0,Date,Origin Time,Latitude,Longitude,Depth,Magnitude,Location,strike1,dip1,rake1,strike2,dip2,rake2
0,2008/11/01,21:02:43.058,-9.18,119.06,10,4.9,Sumba Region - Indonesia,,,,,,
1,2008/11/01,20:58:50.248,-6.55,129.64,10,4.6,Banda Sea,,,,,,
2,2008/11/01,17:43:12.941,-7.01,106.63,121,3.7,Java - Indonesia,,,,,,
3,2008/11/01,16:24:14.755,-3.30,127.85,10,3.2,Seram - Indonesia,,,,,,
4,2008/11/01,16:20:37.327,-6.41,129.54,70,4.3,Banda Sea,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
92882,2023/01/26,02:25:09.288,3.24,127.18,10,4.0,Talaud Islands - Indonesia,,,,,,
92883,2023/01/26,02:15:03.893,2.70,127.10,10,3.9,Northern Molucca Sea,,,,,,
92884,2023/01/26,01:57:08.885,-7.83,121.07,10,3.8,Flores Sea,,,,,,
92885,2023/01/26,01:46:21.009,3.00,127.16,10,4.1,Northern Molucca Sea,,,,,,


In [184]:
# Dropping columns we don't need
df_drop = df.drop(['Date', 'Origin Time', 'strike1', 'dip1', 'rake1', 'strike2', 'dip2', 'rake2'], axis=1)
df = df_drop

# Dropping NAN values
df.dropna(inplace=True)
df


Unnamed: 0,Latitude,Longitude,Depth,Magnitude,Location
0,-9.18,119.06,10,4.9,Sumba Region - Indonesia
1,-6.55,129.64,10,4.6,Banda Sea
2,-7.01,106.63,121,3.7,Java - Indonesia
3,-3.30,127.85,10,3.2,Seram - Indonesia
4,-6.41,129.54,70,4.3,Banda Sea
...,...,...,...,...,...
92882,3.24,127.18,10,4.0,Talaud Islands - Indonesia
92883,2.70,127.10,10,3.9,Northern Molucca Sea
92884,-7.83,121.07,10,3.8,Flores Sea
92885,3.00,127.16,10,4.1,Northern Molucca Sea


In [185]:
# Handling Outliers
df_numerical = df.iloc[:,0:4]
df_numerical

Unnamed: 0,Latitude,Longitude,Depth,Magnitude
0,-9.18,119.06,10,4.9
1,-6.55,129.64,10,4.6
2,-7.01,106.63,121,3.7
3,-3.30,127.85,10,3.2
4,-6.41,129.54,70,4.3
...,...,...,...,...
92882,3.24,127.18,10,4.0
92883,2.70,127.10,10,3.9
92884,-7.83,121.07,10,3.8
92885,3.00,127.16,10,4.1


In [186]:


def remove_outliers(df):
    df_no_outliers = df.copy()
    for i in df_no_outliers.columns:
        Q1 = df_no_outliers[i].quantile(0.25)
        Q3 = df_no_outliers[i].quantile(0.75)
        IQR = Q3-Q1

        upperBound = Q3 + (1.5*IQR) 
        lowerBound = Q1 - (1.5*IQR) 

        df_no_outliers = df_no_outliers[df_no_outliers[i] <= upperBound]
        df_no_outliers = df_no_outliers[df_no_outliers[i] >= lowerBound]

        df_no_outliers = df_no_outliers.reset_index(drop = True)
    return df_no_outliers

In [187]:
df_clean = remove_outliers(df_numerical)
df_comp = df[df.index.isin(df_clean.index)]
df = df_comp
df

Unnamed: 0,Latitude,Longitude,Depth,Magnitude,Location
0,-9.18,119.06,10,4.9,Sumba Region - Indonesia
1,-6.55,129.64,10,4.6,Banda Sea
2,-7.01,106.63,121,3.7,Java - Indonesia
3,-3.30,127.85,10,3.2,Seram - Indonesia
4,-6.41,129.54,70,4.3,Banda Sea
...,...,...,...,...,...
81218,-8.02,121.81,69,3.4,Flores Region - Indonesia
81219,1.50,127.85,10,3.7,Halmahera - Indonesia
81220,1.50,127.86,10,3.0,Halmahera - Indonesia
81221,1.48,127.86,10,2.7,Halmahera - Indonesia
