In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.cluster as cluster
import sklearn.metrics as metrics
from mpl_toolkits.basemap import Basemap
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler  # For scaling dataset
from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation
from IPython.display import display
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score 
from yellowbrick.cluster import silhouette_visualizer
import plotly.express as px
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings raised by the libraries imported above — consider narrowing it
# to specific categories/modules once the notebook is stable.
warnings.filterwarnings("ignore")

In [2]:
# Read the earthquake CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="earthquakes_2023_global.csv")

# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,...,updated,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource
0,2023-01-01T00:49:25.294Z,52.0999,178.5218,82.77,3.1,ml,14.0,139.0,0.87,0.18,...,2023-03-11T22:51:52.040Z,"Rat Islands, Aleutian Islands, Alaska",earthquake,8.46,21.213,0.097,14.0,reviewed,us,us
1,2023-01-01T01:41:43.755Z,7.1397,126.738,79.194,4.5,mb,32.0,104.0,1.152,0.47,...,2023-03-11T22:51:45.040Z,"23 km ESE of Manay, Philippines",earthquake,5.51,7.445,0.083,43.0,reviewed,us,us
2,2023-01-01T03:29:31.070Z,19.1631,-66.5251,24.0,3.93,md,23.0,246.0,0.8479,0.22,...,2023-03-11T22:51:29.040Z,Puerto Rico region,earthquake,0.91,15.95,0.09,16.0,reviewed,pr,pr
3,2023-01-01T04:09:32.814Z,-4.7803,102.7675,63.787,4.3,mb,17.0,187.0,0.457,0.51,...,2023-03-11T22:51:45.040Z,"99 km SSW of Pagar Alam, Indonesia",earthquake,10.25,6.579,0.238,5.0,reviewed,us,us
4,2023-01-01T04:29:13.793Z,53.3965,-166.9417,10.0,3.0,ml,19.0,190.0,0.4,0.31,...,2023-03-11T22:51:38.040Z,"59 km SSW of Unalaska, Alaska",earthquake,1.41,1.999,0.085,18.0,reviewed,us,us


In [3]:
# Keep only the float64-typed columns of the raw frame; integer, string and
# other dtypes are not selected by this filter.
num_col = df.select_dtypes(include=["float64"])
num_col

Unnamed: 0,latitude,longitude,depth,mag,nst,gap,dmin,rms,horizontalError,depthError,magError,magNst
0,52.0999,178.5218,82.770,3.10,14.0,139.0,0.8700,0.18,8.46,21.213,0.097,14.0
1,7.1397,126.7380,79.194,4.50,32.0,104.0,1.1520,0.47,5.51,7.445,0.083,43.0
2,19.1631,-66.5251,24.000,3.93,23.0,246.0,0.8479,0.22,0.91,15.950,0.090,16.0
3,-4.7803,102.7675,63.787,4.30,17.0,187.0,0.4570,0.51,10.25,6.579,0.238,5.0
4,53.3965,-166.9417,10.000,3.00,19.0,190.0,0.4000,0.31,1.41,1.999,0.085,18.0
...,...,...,...,...,...,...,...,...,...,...,...,...
26637,-6.9527,154.9829,10.000,5.20,72.0,60.0,3.9240,0.93,10.07,1.765,0.048,141.0
26638,32.3262,141.7386,10.000,5.10,74.0,121.0,1.8030,0.70,9.17,1.870,0.042,187.0
26639,-7.2411,68.0663,10.000,5.10,60.0,54.0,12.7760,0.57,8.02,1.792,0.090,40.0
26640,-19.1602,169.0428,153.264,4.70,40.0,61.0,3.7460,0.82,8.52,7.433,0.081,46.0


In [7]:
# Count missing values per float column.
# The previous `num_col.dropna()` call was removed: dropna() returns a new
# frame and its result was discarded, so it never changed `num_col`.
# The gaps are filled by imputation in a later cell rather than by dropping rows.
num_col.isnull().sum()

latitude              0
longitude             0
depth                 0
mag                   0
nst                1415
gap                1417
dmin               1866
rms                   0
horizontalError    1549
depthError            0
magError           1672
magNst             1577
dtype: int64

In [8]:
# Side-effect import: it must run before IterativeImputer can be imported.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

# Fill the missing values in the float columns with scikit-learn's iterative
# imputer; random_state is fixed so repeated runs give the same result.
num_iterimpute = IterativeImputer(random_state=0)

# fit_transform returns a plain array here, so the labels are re-attached
# explicitly. Passing the original index (not only the columns) keeps the
# imputed frame row-aligned with `num_col` even if its index is not the
# default 0..n-1 range.
clean_num_col = pd.DataFrame(
    num_iterimpute.fit_transform(num_col),
    columns=num_col.columns,
    index=num_col.index,
)

# Missing-value counts before and after imputation, for a visual sanity check.
display(num_col.isna().sum())
display(clean_num_col.isna().sum())

latitude              0
longitude             0
depth                 0
mag                   0
nst                1415
gap                1417
dmin               1866
rms                   0
horizontalError    1549
depthError            0
magError           1672
magNst             1577
dtype: int64

latitude           0
longitude          0
depth              0
mag                0
nst                0
gap                0
dmin               0
rms                0
horizontalError    0
depthError         0
magError           0
magNst             0
dtype: int64

In [11]:
from sklearn.preprocessing import StandardScaler

# Coordinates are left in their original units; every other float feature
# is rescaled.
columns_to_exclude = ['latitude', 'longitude']
columns_to_normalize = [col for col in clean_num_col.columns if col not in columns_to_exclude]

# Use StandardScaler, the scaler this cell actually imports. The previous
# code instantiated MinMaxScaler, which is never imported (NameError on a
# fresh kernel) and could not yield the negative / >1 values shown in this
# cell's output.
scaler = StandardScaler().fit(clean_num_col[columns_to_normalize])

# Work on a copy so `clean_num_col` stays unscaled. The copy already carries
# the excluded columns unchanged, so no separate re-assignment is needed.
newdf_norm = clean_num_col.copy()
newdf_norm[columns_to_normalize] = scaler.transform(clean_num_col[columns_to_normalize])

newdf_norm

Unnamed: 0,latitude,longitude,depth,mag,nst,gap,dmin,rms,horizontalError,depthError,magError,magNst
0,52.0999,178.5218,0.130856,-1.142229,-0.760630,0.185908,-0.410369,-1.566996,0.408531,3.760013,-0.257739,-0.414573
1,7.1397,126.7380,0.100229,0.620091,-0.273747,-0.342774,-0.339414,-0.435381,-0.319755,0.667168,-0.398997,0.206473
2,19.1631,-66.5251,-0.372483,-0.097425,-0.517188,1.802166,-0.415929,-1.410911,-1.455388,2.577732,-0.328368,-0.371743
3,-4.7803,102.7675,-0.031725,0.368331,-0.679483,0.910959,-0.514285,-0.279296,0.850440,0.472630,1.164926,-0.607312
4,53.3965,-166.9417,-0.492387,-1.268109,-0.625385,0.956274,-0.528627,-1.059720,-1.331949,-0.556221,-0.378817,-0.328912
...,...,...,...,...,...,...,...,...,...,...,...,...
26637,-6.9527,154.9829,-0.492387,1.501251,0.808216,-1.007403,0.358057,1.359596,0.806002,-0.608787,-0.752141,2.305181
26638,32.3262,141.7386,-0.492387,1.375371,0.862314,-0.085985,-0.175614,0.462108,0.583813,-0.585200,-0.812680,3.290288
26639,-7.2411,68.0663,-0.492387,1.375371,0.483627,-1.098035,2.585335,-0.045168,0.299905,-0.602722,-0.328368,0.142227
26640,-19.1602,169.0428,0.734606,0.871851,-0.057354,-0.992298,0.313270,0.930362,0.423343,0.664473,-0.419176,0.270719
