In [1]:
import pandas as pd
import numpy as np
import re
from datetime import datetime

# Load the base table; the first CSV column becomes the row index.
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
df = pd.read_csv(
    "./data/230901_basic_table_for_analysis.csv",
    index_col=0,
    low_memory=False,
)

df.head()

df.shape

(2509598, 63)

In [2]:
# Percentage of missing values in each column
null_counts = df.isnull().sum()
percent_missing = null_counts * 100 / len(df)

# Same numbers as a two-column table (column name, percent missing)
missing_value_df = pd.DataFrame(
    {
        'column_name': df.columns,
        'percent_missing': percent_missing,
    }
)
percent_missing

NameError: name 'df' is not defined

In [1]:
# Cleaning of the dataset regarding missing values:
# the columns listed here are removed from df (in place).
columns_to_drop = [
    'id_vehicule', 'com', 'gps', 'voie', 'v1', 'v2',
    'pr', 'pr1', 'lartpc', 'larrout', 'vma', 'env1', 'motor',
    'secu', 'secu1', 'secu2', 'secu3',
]

df.drop(columns=columns_to_drop, inplace=True)




NameError: name 'df' is not defined

In [4]:
# Columns in which NaN is replaced with -1 (coded as "no information").
# Note that the fill value is the string '-1', not the integer.
no_info_columns = [
    'place', 'an_nais', 'trajet', 'locp', 'actp',
    'etatp', 'atm', 'col', 'circ',
    'nbv', 'vosp', 'prof', 'plan',
    'surf', 'infra', 'situ', 'senc', 'occutc',
    'obs', 'obsm', 'choc', 'manv',
]

# One {column: '-1'} entry per listed column, applied in place
df.fillna(dict.fromkeys(no_info_columns, '-1'), inplace=True)

In [5]:
# Drop every row whose `catr` value is NaN (in place)
df.dropna(axis="index", how="any", subset=["catr"], inplace=True)


In [6]:
# Parse the 'YYYY-MM-DD' strings and keep only the calendar date.
# `.dt.date` yields plain datetime.date objects, so the column has object dtype.
df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d").dt.date

df['date'].describe()

count        2509596
unique          6209
top       2005-12-16
freq             832
Name: date, dtype: object

In [7]:
# Percentage of missing values per column after the cleaning steps
percent_missing_clean = df.isna().sum().mul(100).div(len(df))
percent_missing_clean

num_veh          0.000000
place            0.000000
catu             0.000000
grav             0.000000
sexe             0.000000
an_nais          0.000000
trajet           0.000000
locp             0.000000
actp             0.000000
etatp            0.000000
an               0.000000
mois             0.000000
jour             0.000000
hrmn             0.000000
lum              0.000000
agg              0.000000
int              0.000000
atm              0.000000
col              0.000000
adr             13.695511
lat             42.471816
long            42.760428
dep              0.000000
metropolitan     0.000000
catr             0.000000
circ             0.000000
nbv              0.000000
vosp             0.000000
prof             0.000000
plan             0.000000
surf             0.000000
infra            0.000000
situ             0.000000
senc             0.000000
catv             0.000000
occutc           0.000000
obs              0.000000
obsm             0.000000
choc        

## Transform data types
### Date and time variables
The date and time variables may be important as grouping variables or as constraints when analysing the time-dependent severity of accidents.

For the grouping, a timestamp is required for the unbiased identification of time-related accidents.

_Problem_:

While transforming the `hrmn` variable, I noticed that some of the strings are bare integers such as `1`, `801`, or `1300`. At some point during data conversion, the colon was lost and the leading zeros were dropped. Hence `1` should be `00:01`, `801` should be `08:01`, and so on. Therefore, we need an additional transformation of the truncated data to the `hh:mm` format.

**Steps:**
1. Fixing the truncated values in `hrmn`
2. Creation of a `datetime` variable in format y-m-d hh:mm
3. Transformation of the `datetime` variable to a `timestamp` variable


In addition, we need to convert `an_nais` to an integer type and `date` to a datetime type.

In [8]:
# Fixing the hrmn issue:
# Some values lost their colon and leading zeros (e.g. '1' stands for 00:01,
# '801' for 08:01). Remove any colon that is still present and left-pad to
# four characters so that every value reads as HHMM.
# All steps below are column-wise; row-wise df.apply(axis=1) is far slower on
# a table of this size and gives the same result.
# astype(str) keeps a stray non-string entry from silently becoming NaN
# inside the .str accessor.
hrmn_digits = (
    df['hrmn']
    .astype(str)
    .str.replace(':', '', regex=False)
    .str.zfill(4)
)

# Parse once; to_datetime raises if a value is not a valid HHMM time
hrmn_parsed = pd.to_datetime(hrmn_digits, format='%H%M')

# Store the time of day in 'hh:mm' format
df = df.assign(hrmn=hrmn_parsed.dt.strftime('%H:%M'))

# Create the datetime variable from year/month/day plus the parsed hour/minute.
# to_datetime assembles a datetime from columns named year/month/day/hour/minute.
date_parts = (
    df[['an', 'mois', 'jour']]
    .rename(columns={'an': 'year', 'mois': 'month', 'jour': 'day'})
    .assign(hour=hrmn_parsed.dt.hour, minute=hrmn_parsed.dt.minute)
)
df['datetime'] = pd.to_datetime(date_parts)

# Create the timestamp (seconds since the epoch, as float).
# NOTE(review): datetime.timestamp treats naive values as local time, so the
# numbers depend on the timezone of the machine running this cell - confirm
# that this is intended.
df['timestamp'] = df['datetime'].map(datetime.timestamp)

# Transform `an_nais`
df['an_nais'] = df['an_nais'].astype('int64')

# Transform `date`
df = df.assign(date=pd.to_datetime(df['date'], format='mixed'))

In [9]:
# Write the cleaned table to a new CSV file; the index is kept and
# missing values are written as 'n/a'.
df.to_csv(
    "./data/230921_basic_table_for_analysis_cleaned.csv",
    sep=',',
    header=True,
    na_rep='n/a',
    index=True,
)