# Pandas Introduction By an Example
A dataset provided by the FDA has a column "event_date_initiated".
This is the date when a medical device recall is initiated.
The file event_date_initiated.csv contains just this column.

In [94]:
import pandas as pd

## Read the file into a Pandas dataframe.

In [95]:
df = pd.read_csv("event_date_initiated.csv")

## Find out the data type of the event_date_initiated column
Its dtype is "object", which here means the values are stored as strings (text).

In [96]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43749 entries, 0 to 43748
Data columns (total 1 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   event_date_initiated  43749 non-null  object
dtypes: object(1)
memory usage: 341.9+ KB


## Find out the number of rows

In [97]:
df.shape

(43749, 1)

## Display the first five, the last five, and five random rows

In [98]:
df.head()

Unnamed: 0,event_date_initiated
0,2002-12-26
1,2003-03-25
2,2003-03-25
3,2004-01-27
4,2003-12-10


In [99]:
df.tail()

Unnamed: 0,event_date_initiated
43744,2020-09-09
43745,2020-07-06
43746,2020-01-17
43747,2020-05-28
43748,2020-08-20


In [100]:
# Fix the seed so the same five rows come back on every Restart & Run All.
df.sample(5, random_state=42)

Unnamed: 0,event_date_initiated
41455,2012-02-24
16277,2010-09-03
64,2009-11-12
15988,2004-10-27
39817,2007-09-13


## Extract year from the date

In [101]:
# The dates are "YYYY-MM-DD" text, so the year is the first four characters.
df["year"] = df["event_date_initiated"].str.slice(stop=4)
df.head()

Unnamed: 0,event_date_initiated,year
0,2002-12-26,2002
1,2003-03-25,2003
2,2003-03-25,2003
3,2004-01-27,2004
4,2003-12-10,2003


## Get the unique years using the set() function

In [102]:
# A set keeps one copy of each value, so its size is the number of distinct years.
year_set = set(df["year"].tolist())
len(year_set)

26

In [103]:
year_set

{'0010',
 '0012',
 '0013',
 '1997',
 '1998',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020'}

## Replace the incorrect years 0010, 0012, 0013 with 2010, 2012, 2013

In [104]:
def transform_init_date(init_date):
    """Return ``init_date`` with a leading ``"00"`` swapped for ``"20"``.

    Strings that do not begin with ``"00"`` are returned unchanged, e.g.
    ``"0012-12-06"`` becomes ``"2012-12-06"`` while ``"2002-12-26"`` stays as is.
    """
    # Guard clause: nothing to repair unless the value opens with "00".
    if not init_date.startswith("00"):
        return init_date
    # Only the two leading characters are rewritten; any later "00" is kept.
    return "20" + init_date[2:]
    

# Run the repair on every date and store the result in a new column,
# leaving the original column in place for comparison.
df["event_date_initiated2"] = df["event_date_initiated"].map(transform_init_date)

## How do you know the changes were successful?

### Method one

In [105]:
# Re-derive the year from the repaired dates and count the distinct values.
df["year2"] = df["event_date_initiated2"].str.slice(stop=4)
year_set2 = set(df["year2"].tolist())
len(year_set2)

23

In [106]:
year_set2

{'1997',
 '1998',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020'}

### Method two

In [107]:
# Keep only the rows whose date differs between the original and repaired columns.
df.loc[df["event_date_initiated"].ne(df["event_date_initiated2"])]

Unnamed: 0,event_date_initiated,year,event_date_initiated2,year2
2290,0012-12-06,12,2012-12-06,2012
2344,0013-11-26,13,2013-11-26,2013
2432,0012-11-30,12,2012-11-30,2012
5267,0013-05-16,13,2013-05-16,2013
6045,0013-03-05,13,2013-03-05,2013
6801,0013-05-16,13,2013-05-16,2013
19636,0012-12-13,12,2012-12-13,2012
19910,0013-04-12,13,2013-04-12,2013
25453,0013-03-05,13,2013-03-05,2013
27812,0013-11-25,13,2013-11-25,2013


## Save the corrected data to a file

In [108]:
# Write only the repaired dates; index=False leaves the row numbers out of the file.
df["event_date_initiated2"].to_csv(
    path_or_buf="event_date_initiated_corrected.csv",
    index=False,
)

# The End!