## 1. Read the raw data into a pandas dataframe. 


In [10]:
# Import the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Let pandas display up to 23 columns so the whole dataframe width is visible
pd.options.display.max_columns = 23

In [11]:
# Settings for the raw data: CSV file name and dataset folder
file_name = 'quakes.csv'
dataset_location = './Earthquakes'

In [12]:
# Load the raw data into a dataframe, taking the file name from the `file_name`
# setting rather than repeating the string literal.
# NOTE(review): `dataset_location` is still not part of the path, so the file is
# read from the working directory exactly as before -- confirm where quakes.csv lives.
df = pd.read_csv(file_name)

In [13]:
# Preview the first five rows of the dataframe
df.head(n=5)

Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,net,id,updated,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource
0,2023-11-15T09:05:07.304Z,61.5808,-149.847,32.8,1.7,ml,,,,0.22,ak,ak023enptp4a,2023-11-15T09:06:28.283Z,"5 km SSW of Houston, Alaska",earthquake,,0.2,,,automatic,ak,ak
1,2023-11-15T08:53:06.688Z,61.0794,-147.883,14.8,1.0,ml,,,,0.8,ak,ak023enpiju4,2023-11-15T08:54:38.102Z,"55 km NE of Whittier, Alaska",earthquake,,0.3,,,automatic,ak,ak
2,2023-11-15T08:41:52.480Z,19.380667,-155.285339,0.32,1.73,md,15.0,153.0,,0.2,hv,hv73648602,2023-11-15T08:56:22.252Z,"8 km SW of Volcano, Hawaii",earthquake,0.33,0.38,0.59,15.0,automatic,hv,hv
3,2023-11-15T07:44:53.035Z,61.6382,-149.7828,32.9,1.9,ml,,,,0.31,ak,ak023enovad3,2023-11-15T07:46:10.981Z,,earthquake,,0.2,,,automatic,ak,ak
4,2023-11-15T07:19:44.540Z,18.972166,-155.45166,34.759998,1.87,md,37.0,236.0,,0.12,hv,hv73648502,2023-11-15T07:22:58.830Z,"17 km SE of Naalehu, Hawaii",earthquake,0.71,0.89,0.88,5.0,automatic,hv,hv


## 2. Perform necessary cleanups, ensuring appropriate data types for each column. 


### 2.1 Appropriate Data Type

In [14]:
# Show the dtype pandas assigned to each column when the CSV was read,
# before any conversion is applied
df.dtypes

time                object
latitude           float64
longitude          float64
depth              float64
mag                float64
magType             object
nst                float64
gap                float64
dmin               float64
rms                float64
net                 object
id                  object
updated             object
place               object
type                object
horizontalError    float64
depthError         float64
magError           float64
magNst             float64
status              object
locationSource      object
magSource           object
dtype: object

In [15]:
# For every object-dtype column, print its distinct values and how many there are
object_columns = [name for name in df.columns if df[name].dtype == 'O']
for name in object_columns:
    distinct_values = df[name].unique()  # computed once and reused for the count
    print(f'{name} Unique Values ->')
    print(distinct_values)
    print(f'Number of Unique Values: {len(distinct_values)}')
    print('\n')

time Unique Values ->
['2023-11-15T09:05:07.304Z' '2023-11-15T08:53:06.688Z'
 '2023-11-15T08:41:52.480Z' ... '2023-09-01T00:45:43.100Z'
 '2023-09-01T00:42:08.428Z' '2023-09-01T00:04:57.310Z']
Number of Unique Values: 19242


magType Unique Values ->
['ml' 'md' 'mb' 'mlv' 'ml(texnet)' 'mww' 'mb_lg' 'mwr' 'mw' 'mwb' 'Mi'
 'mh' 'Mb' 'ms_vx']
Number of Unique Values: 14


net Unique Values ->
['ak' 'hv' 'nc' 'us' 'ok' 'pr' 'tx' 'ci' 'mb' 'nn' 'av' 'uu' 'uw' 'nm'
 'se' 'pt' 'at']
Number of Unique Values: 17


id Unique Values ->
['ak023enptp4a' 'ak023enpiju4' 'hv73648602' ... 'pr71422843'
 'ak023b7jdp67' 'hv73555557']
Number of Unique Values: 19244


updated Unique Values ->
['2023-11-15T09:06:28.283Z' '2023-11-15T08:54:38.102Z'
 '2023-11-15T08:56:22.252Z' ... '2023-09-01T01:02:07.540Z'
 '2023-09-14T00:54:41.772Z' '2023-09-01T00:08:17.330Z']
Number of Unique Values: 19147


place Unique Values ->
['5 km SSW of Houston, Alaska' '55 km NE of Whittier, Alaska'
 '8 km SW of Volcano, Hawaii' ...

- `time`: Converted to a datetime type to represent the timestamp. Because the raw strings end in `Z`, pandas stores it as timezone-aware `datetime64[ns, UTC]`.
- `latitude`, `longitude`, `depth`, `mag`, `nst`, `gap`, `dmin`, `rms`, `horizontalError`, `depthError`, `magError`, `magNst`: These columns seem to represent numerical values, so they are kept as float64.
- `id`, `place`: These columns seem to represent string values, so they are kept as object.
- `magType`, `net`, `type`, `status`, `locationSource`, `magSource`: These columns seem to represent categorical values, so they are converted to the `category` type.
- `updated`: Converted to a datetime type (`datetime64[ns, UTC]`) to represent the timestamp.

In [16]:
# Parse the two timestamp columns into pandas datetimes
for timestamp_column in ('time', 'updated'):
    df[timestamp_column] = pd.to_datetime(df[timestamp_column])

# Cast the low-cardinality text columns to the category dtype
categorical_columns = ['magType', 'net', 'type', 'status', 'locationSource', 'magSource']
df = df.astype({column: 'category' for column in categorical_columns})

# Show the dtypes after conversion
df.dtypes

time               datetime64[ns, UTC]
latitude                       float64
longitude                      float64
depth                          float64
mag                            float64
magType                       category
nst                            float64
gap                            float64
dmin                           float64
rms                            float64
net                           category
id                              object
updated            datetime64[ns, UTC]
place                           object
type                          category
horizontalError                float64
depthError                     float64
magError                       float64
magNst                         float64
status                        category
locationSource                category
magSource                     category
dtype: object

## 3. Remove unwanted data and handle missing values. 

In [17]:
# Count the missing entries in each column
df.isnull().sum()

time                  0
latitude              0
longitude             0
depth                 0
mag                   0
magType               0
nst                6616
gap                6618
dmin               8402
rms                   1
net                   0
id                    0
updated               0
place               972
type                  0
horizontalError    7553
depthError            1
magError           6688
magNst             6659
status                0
locationSource        0
magSource             0
dtype: int64

So there are missing values in the following columns: `nst`, `gap`, `dmin`, `rms`, `place`, `horizontalError`, `depthError`, `magError`, and `magNst`.

In [18]:
# Impute missing values in the numerical columns with each column's own mean.
# DataFrame.fillna aligns the Series of means with the columns by label.
numerical_columns = ['nst', 'gap', 'dmin', 'rms', 'horizontalError', 'depthError', 'magError', 'magNst']
df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())

# Impute missing 'place' values (an object/text column) with the most frequent value.
# The result is assigned back instead of calling fillna(inplace=True) on df['place']:
# that chained in-place form raises a FutureWarning on pandas 2.x and leaves df
# unchanged once Copy-on-Write is active.
df['place'] = df['place'].fillna(df['place'].mode()[0])

# Confirm that no missing values remain in any column
df.isnull().sum()

time               0
latitude           0
longitude          0
depth              0
mag                0
magType            0
nst                0
gap                0
dmin               0
rms                0
net                0
id                 0
updated            0
place              0
type               0
horizontalError    0
depthError         0
magError           0
magNst             0
status             0
locationSource     0
magSource          0
dtype: int64

In [19]:
# The 'id' column carries no significant information, so it is removed here
df = df.drop(columns=['id'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19244 entries, 0 to 19243
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   time             19244 non-null  datetime64[ns, UTC]
 1   latitude         19244 non-null  float64            
 2   longitude        19244 non-null  float64            
 3   depth            19244 non-null  float64            
 4   mag              19244 non-null  float64            
 5   magType          19244 non-null  category           
 6   nst              19244 non-null  float64            
 7   gap              19244 non-null  float64            
 8   dmin             19244 non-null  float64            
 9   rms              19244 non-null  float64            
 10  net              19244 non-null  category           
 11  updated          19244 non-null  datetime64[ns, UTC]
 12  place            19244 non-null  object             
 13  type            

## 4. Save the cleaned data as `quakes-cleaned.csv`.

In [27]:
# Write the cleaned dataframe to CSV, leaving out the row index
df.to_csv('quakes-cleaned.csv', index=False)