In [1]:
import pandas as pd

In [21]:
from datetime import datetime

# Build a four-row sample employee table and write it to 'dtype_sample.csv'.

# Creation timestamps without a UTC offset, one per row.
created_naive = [
    "2023-04-05T14:30",
    "2023-02-28T08:15",
    "2023-03-15T12:45",
    "2023-01-13T09:00",
]

data = dict(
    Name=["John Doe", "Jane Smith", "Emily Davis", "Mark Evans"],
    # One value is deliberately missing (None).
    Experience=[10, None, 2, 5],
    Age=[35, 29, 27, 30],
    Department=["Engineering", "Marketing", "Product", "Sales"],
    # Zero-padded identifiers stored as strings.
    DepartmentID=["001", "002", "003", "004"],
    CreatedDatetime=list(created_naive),
    # The same timestamps with a "+08:00" offset appended.
    CreatedDatetimeTZ=[f"{stamp}+08:00" for stamp in created_naive],
    Remote=[True, False, True, False],
    # "V" / "X" marker strings rather than booleans.
    OnBoard=["V", "X", "V", "V"],
    # 0 / 1 integer flags.
    Intern=[0, 0, 0, 1],
)

df = pd.DataFrame(data)

# index=False keeps the row index out of the written file.
df.to_csv('dtype_sample.csv', index=False)


In [18]:
# Location of the CSV used by the loading examples.
# NOTE(review): this is an absolute, environment-specific path — confirm it
# exists wherever the notebook is run.
file_path = '/workspaces/demo-devcontainer-main/src/test/dtype_sample_wrong_dt_fmt.csv'

# Load without any dtype hints, so pandas infers every column's type.
df = pd.read_csv(filepath_or_buffer=file_path)

df

Unnamed: 0,Name,Experience,Age,Department,DepartmentID,CreatedDatetime,CreatedDatetimeTZ,Remote,OnBoard,Intern
0,John Doe,10.0,35,Engineering,1,2023-04-05T14:30:00,2023-04-05T14:30+08:00,True,V,0
1,Jane Smith,,29,Marketing,2,2023-02-28T08:15:00,2023-02-28T08:15+08:00,False,X,0
2,Emily Davis,2.0,27,Product,3,2023-03-15T12:45:00,2023-03-15T12:45+08:00,True,V,0
3,Mark Evans,5.0,30,Sales,4,2023-01-13T09:00:00,2023-01-13T09:00+08:00,False,V,1


In [19]:
# Show the dtype of each column of df.
df.dtypes

Name                  object
Experience           float64
Age                    int64
Department            object
DepartmentID           int64
CreatedDatetime       object
CreatedDatetimeTZ     object
Remote                  bool
OnBoard               object
Intern                 int64
dtype: object

In [23]:
# Per-column dtypes handed to read_csv; columns not listed here are still
# inferred by pandas. The two datetime columns are loaded as plain strings and
# converted explicitly below.
columns_dtype = dict(
    DepartmentID=str,
    Remote=bool,
    Intern=bool,
    CreatedDatetime=str,
    CreatedDatetimeTZ=str,
)

# read_csv can also parse dates while loading (parse_dates=[...]); here the
# conversion is done afterwards with pd.to_datetime, which allows a different
# `errors=` setting for each column.
df = pd.read_csv(file_path, dtype=columns_dtype).assign(
    # errors='coerce': a value that cannot be parsed becomes NaT instead of raising.
    CreatedDatetime=lambda frame: pd.to_datetime(frame['CreatedDatetime'], errors='coerce'),
    # Default error handling: an unparseable value raises.
    CreatedDatetimeTZ=lambda frame: pd.to_datetime(frame['CreatedDatetimeTZ']),
)

df

Unnamed: 0,Name,Experience,Age,Department,DepartmentID,CreatedDatetime,CreatedDatetimeTZ,Remote,OnBoard,Intern
0,John Doe,10.0,35,Engineering,1,2023-04-05 14:30:00,2023-04-05 14:30:00+08:00,True,V,False
1,Jane Smith,,29,Marketing,2,2023-02-28 08:15:00,2023-02-28 08:15:00+08:00,False,X,False
2,Emily Davis,2.0,27,Product,3,2023-03-15 12:45:00,2023-03-15 12:45:00+08:00,True,V,False
3,Mark Evans,5.0,30,Sales,4,NaT,2023-01-13 09:00:00+08:00,False,V,True


In [17]:
# Show the dtype of each column of df.
df.dtypes

Name                                    object
Experience                             float64
Age                                      int64
Department                              object
DepartmentID                            object
CreatedDatetime                 datetime64[ns]
CreatedDatetimeTZ    datetime64[ns, UTC+08:00]
Remote                                    bool
OnBoard                                 object
Intern                                    bool
dtype: object

In [13]:
# Turn the 'V' / other marker strings in OnBoard into booleans
# ('V' -> True, anything else -> False) with a vectorised comparison.
# The dtype guard keeps the cell idempotent: once the column is boolean,
# comparing it with 'V' again would set every row to False.
if not pd.api.types.is_bool_dtype(df['OnBoard']):
    df['OnBoard'] = df['OnBoard'] == 'V'
df

Unnamed: 0,Name,Experience,Age,Department,DepartmentID,CreatedDatetime,CreatedDatetimeTZ,Remote,OnBoard,Intern
0,John Doe,10.0,35,Engineering,1,2023-04-05 14:30:00,2023-04-05T14:30+08:00,True,True,False
1,Jane Smith,,29,Marketing,2,2023-02-28 08:15:00,2023-02-28T08:15+08:00,False,False,False
2,Emily Davis,2.0,27,Product,3,2023-03-15 12:45:00,2023-03-15T12:45+08:00,True,True,False
3,Mark Evans,5.0,30,Sales,4,2023-01-13 09:00:00,2023-01-13T09:00+08:00,False,True,True


In [25]:
import numpy as np

# Replace missing Experience values with 0; rows that already have a value
# are left unchanged. Series.fillna is the built-in equivalent of
# np.where(col.isna(), 0, col).
df['Experience'] = df['Experience'].fillna(0)

df

Unnamed: 0,Name,Experience,Age,Department,DepartmentID,CreatedDatetime,CreatedDatetimeTZ,Remote,OnBoard,Intern
0,John Doe,10.0,35,Engineering,1,2023-04-05 14:30:00,2023-04-05 14:30:00+08:00,True,V,False
1,Jane Smith,0.0,29,Marketing,2,2023-02-28 08:15:00,2023-02-28 08:15:00+08:00,False,X,False
2,Emily Davis,2.0,27,Product,3,2023-03-15 12:45:00,2023-03-15 12:45:00+08:00,True,V,False
3,Mark Evans,5.0,30,Sales,4,NaT,2023-01-13 09:00:00+08:00,False,V,True


In [26]:
# Turn the 'V' / other marker strings in OnBoard into booleans
# ('V' -> True, anything else -> False).
# Only convert while the column still holds the markers: if it has already
# been converted to booleans (for example by re-running this cell),
# comparing it with 'V' would silently set every row to False.
if not pd.api.types.is_bool_dtype(df['OnBoard']):
    df['OnBoard'] = df['OnBoard'] == 'V'

df

Unnamed: 0,Name,Experience,Age,Department,DepartmentID,CreatedDatetime,CreatedDatetimeTZ,Remote,OnBoard,Intern
0,John Doe,10.0,35,Engineering,1,2023-04-05 14:30:00,2023-04-05 14:30:00+08:00,True,True,False
1,Jane Smith,0.0,29,Marketing,2,2023-02-28 08:15:00,2023-02-28 08:15:00+08:00,False,False,False
2,Emily Davis,2.0,27,Product,3,2023-03-15 12:45:00,2023-03-15 12:45:00+08:00,True,True,False
3,Mark Evans,5.0,30,Sales,4,NaT,2023-01-13 09:00:00+08:00,False,True,True
