In [1]:
"""
Company XYZ recently migrated database systems causing some of the date_joined records to be NULL. 
You're told by an analyst in human resources NULL records for the date_joined field indicates the 
employees joined prior to 2010. You also find out there are multiple employees with the same name 
and duplicate records for some employees.

Given this, write code to find the number of employees that joined each month. You can group all 
of the null values as Dec 1, 2009.

"""
import pandas as pd
from datetime import datetime
from dateutil.parser import parse
import numpy as np
raw_data = {'employee_name': ['Andy', 'Beth', 'Cindy', "Dale"],
            'employee_id': [123456,789456,654123,963852],
            'date_joined': ['2015-02-15', np.nan, '2017-05-16', "2018-01-15"],
            'age': [45,36,34,25],
            'yrs_of_experience': [24,14,14,4]}
df = pd.DataFrame(raw_data, columns = ['employee_name', 'employee_id', 'date_joined','age', 'yrs_of_experience'])
df

Unnamed: 0,employee_name,employee_id,date_joined,age,yrs_of_experience
0,Andy,123456,2015-02-15,45,24
1,Beth,789456,,36,14
2,Cindy,654123,2017-05-16,34,14
3,Dale,963852,2018-01-15,25,4


In [2]:
df.fillna('2009-12-01', inplace=True)
df

Unnamed: 0,employee_name,employee_id,date_joined,age,yrs_of_experience
0,Andy,123456,2015-02-15,45,24
1,Beth,789456,2009-12-01,36,14
2,Cindy,654123,2017-05-16,34,14
3,Dale,963852,2018-01-15,25,4


In [3]:
df.date_joined = pd.to_datetime(df.date_joined)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   employee_name      4 non-null      object        
 1   employee_id        4 non-null      int64         
 2   date_joined        4 non-null      datetime64[ns]
 3   age                4 non-null      int64         
 4   yrs_of_experience  4 non-null      int64         
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 288.0+ bytes


In [4]:
df.set_index('date_joined', drop=True, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4 entries, 2015-02-15 to 2018-01-15
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   employee_name      4 non-null      object
 1   employee_id        4 non-null      int64 
 2   age                4 non-null      int64 
 3   yrs_of_experience  4 non-null      int64 
dtypes: int64(3), object(1)
memory usage: 160.0+ bytes


In [5]:
df.resample('M').agg('count')

Unnamed: 0_level_0,employee_name,employee_id,age,yrs_of_experience
date_joined,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-12-31,1,1,1,1
2010-01-31,0,0,0,0
2010-02-28,0,0,0,0
2010-03-31,0,0,0,0
2010-04-30,0,0,0,0
...,...,...,...,...
2017-09-30,0,0,0,0
2017-10-31,0,0,0,0
2017-11-30,0,0,0,0
2017-12-31,0,0,0,0
