### Infos du csv

In [1]:
import pandas as pd

# Lift the column display limit first so the preview below shows every column
# instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# Load the raw earthquake catalogue and report its dimensions (rows, columns).
df = pd.read_csv('csv/earthquakes.csv')
print('Shape:', df.shape)

# Last expression of the cell: rendered as a table of the first five rows.
df.head()

Shape: (3272774, 22)


Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,net,id,updated,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource
0,1970-01-01T00:00:00.0Z,37.003502,-117.996834,0.0,0.0,mh,0.0,,,,ci,ci37038459,2016-04-02T20:22:05.312Z,"29km NE of Independence, CA",sonic boom,,,,0.0,reviewed,ci,ci
1,1970-01-01T00:00:00.0Z,35.642788,-120.933601,5.0,1.99,mh,2.0,,,,ci,ci11092098,2016-01-29T01:43:14.870Z,"11km SSW of Lake Nacimiento, CA",earthquake,,,,0.0,reviewed,ci,ci
2,1970-01-01T00:00:00.0Z,34.16452,-118.185036,0.0,0.0,mh,,,,,ci,ci15086796,2016-04-02T17:20:31.235Z,"4km S of La Canada Flintridge, CA",earthquake,,,,0.0,reviewed,ci,ci
3,1970-01-01T00:00:00.0Z,33.836494,-116.781868,0.0,0.0,mh,,,,,ci,ci14891508,2016-04-02T14:10:48.389Z,"9km S of Cabazon, CA",sonic boom,,,,0.0,reviewed,ci,ci
4,1970-01-01T00:00:00.0Z,33.208477,-115.476997,5.0,0.0,mh,,,,,ci,ci10925125,2016-04-02T04:32:22.103Z,"5km SE of Niland, CA",sonic boom,,,,0.0,reviewed,ci,ci


### Doublons ?

In [2]:
# Check for strictly identical rows.
# duplicated() compares all columns and, with its default keep='first', flags
# every repeat of a row except its first occurrence.
# The boolean mask is computed once and reused below: building it means
# scanning the whole frame, so calling duplicated() a second time would
# double the cost of this cell.
duplicate_mask = df.duplicated()
num_duplicates = duplicate_mask.sum()
print(f'Number of strictly identical rows: {num_duplicates}')

if num_duplicates > 0:
    print('Duplicate rows:')
    # Only the flagged repeats are shown, not the first occurrence of each row.
    print(df[duplicate_mask])
else:
    print('No duplicate rows found.')

Number of strictly identical rows: 15819
Duplicate rows:
                             time   latitude   longitude   depth   mag  \
188      1970-03-01T00:49:06.230Z  33.122000 -117.654500   6.000  2.98   
190      1970-03-01T04:14:14.310Z  35.406500 -117.954500   6.000  2.54   
192      1970-03-01T05:44:23.110Z  33.987833 -118.430833   6.000  3.44   
194      1970-03-01T10:51:11.580Z  46.881000 -119.424167  -0.261  2.30   
196      1970-03-02T12:33:40.480Z  46.876500 -119.422500   2.729  1.40   
...                           ...        ...         ...     ...   ...   
3264767  2019-03-02T23:31:11.801Z  60.514900 -143.030400   3.900  1.10   
3264769  2019-03-02T23:35:32.280Z  33.747500 -116.709167  10.860  0.83   
3264771  2019-03-02T23:38:13.802Z  61.348700 -150.078600  40.700  0.80   
3264773  2019-03-02T23:55:03.450Z  37.625667 -119.019333   1.510  0.31   
3264775  2019-03-02T23:57:59.740Z  46.611500 -119.867000   6.190  0.06   

        magType   nst    gap      dmin   rms net      

### Types de variables avant conversion

In [3]:
# Fresh read of the raw CSV so the dtypes listed are the ones pandas infers
# on its own, before any explicit conversion is applied.
df = pd.read_csv(filepath_or_buffer='csv/earthquakes.csv')

# Last expression of the cell: rendered as the per-column dtype listing.
df.dtypes

time                object
latitude           float64
longitude          float64
depth              float64
mag                float64
magType             object
nst                float64
gap                float64
dmin               float64
rms                float64
net                 object
id                  object
updated             object
place               object
type                object
horizontalError    float64
depthError         float64
magError           float64
magNst             float64
status              object
locationSource      object
magSource           object
dtype: object

### Conversion des variables


In [6]:
# Read the raw CSV and let pandas pick the best nullable dtype for every column.
df = pd.read_csv('csv/earthquakes.csv').convert_dtypes()

# The timestamp columns need explicit parsing.
# utc=True produces timezone-aware UTC values; errors='coerce' turns any
# unparseable entry into NaT instead of raising.
date_columns = ['time', 'updated']
for col in date_columns:
    df[col] = pd.to_datetime(df[col], errors='coerce', utc=True)

# Persist the typed frame so later cells can reload it without re-parsing the CSV.
df.to_parquet('csv/earthquakes.parquet')

In [7]:
# Reload the converted dataset from the parquet file written by the conversion cell.
df = pd.read_parquet(path='csv/earthquakes.parquet')

# Last expression of the cell: rendered as the per-column dtype listing.
df.dtypes

time               datetime64[ns, UTC]
latitude                       Float64
longitude                      Float64
depth                          Float64
mag                            Float64
magType                 string[python]
nst                              Int64
gap                            Float64
dmin                           Float64
rms                            Float64
net                     string[python]
id                      string[python]
updated            datetime64[ns, UTC]
place                   string[python]
type                    string[python]
horizontalError                Float64
depthError                     Float64
magError                       Float64
magNst                           Int64
status                  string[python]
locationSource          string[python]
magSource               string[python]
dtype: object

### Vérification conversion des dates

In [11]:
df = pd.read_csv('csv/earthquakes.csv')

# Count the missing values in 'time' and 'updated' as they appear in the raw CSV.
time_nulls, updated_nulls = (
    df[name].isna().sum() for name in ('time', 'updated')
)

print(
    f"Nombre de valeurs vides dans 'time': {time_nulls}",
    f"Nombre de valeurs vides dans 'updated': {updated_nulls}",
    sep='\n',
)

Nombre de valeurs vides dans 'time': 0
Nombre de valeurs vides dans 'updated': 0


In [12]:
df = pd.read_parquet('csv/earthquakes.parquet')

# Count the missing values in 'time' and 'updated' in the converted parquet data.
time_nulls = df['time'].isna().sum()
print(f"Nombre de valeurs vides dans 'time': {time_nulls}")

updated_nulls = df['updated'].isna().sum()
print(f"Nombre de valeurs vides dans 'updated': {updated_nulls}")

Nombre de valeurs vides dans 'time': 0
Nombre de valeurs vides dans 'updated': 0
