### Example : Pandas Data Type Conversion

In [1]:
import pandas as pd
import numpy as np

In [2]:
sales_data = pd.read_csv("https://github.com/vannads/rd46_ml2018/blob/master/pandasNotes/sales_data.csv?raw=True")

In [3]:
sales_data.sample(5)

Unnamed: 0,Customer Number,Customer Name,2016,2017,Percent Growth,Jan Units,Month,Day,Year,Active
4,651029.0,Harbor Co,"$15,000.00",$12750.00,-15.00%,Closed,2,2,2014,N
2,23477.0,ACME Industrial,"$50,000.00",$62500.00,25.00%,125,3,29,2016,Y
1,552278.0,Smith Plumbing,"$920,000.00","$101,2000.00",10.00%,700,6,15,2014,Y
3,24900.0,Brekke LTD,"$350,000.00",$490000.00,4.00%,75,10,27,2015,Y
0,10002.0,Quest Industries,"$125,000.00",$162500.00,30.00%,500,1,10,2015,Y


In [4]:
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 10 columns):
Customer Number    5 non-null float64
Customer Name      5 non-null object
2016               5 non-null object
2017               5 non-null object
Percent Growth     5 non-null object
Jan Units          5 non-null object
Month              5 non-null int64
Day                5 non-null int64
Year               5 non-null int64
Active             5 non-null object
dtypes: float64(1), int64(3), object(6)
memory usage: 480.0+ bytes


In [5]:
sales_data.dtypes

Customer Number    float64
Customer Name       object
2016                object
2017                object
Percent Growth      object
Jan Units           object
Month                int64
Day                  int64
Year                 int64
Active              object
dtype: object

In [6]:
sales_data['Customer Number'] = sales_data['Customer Number'].astype('int')

In [7]:
sales_data.dtypes

Customer Number     int64
Customer Name      object
2016               object
2017               object
Percent Growth     object
Jan Units          object
Month               int64
Day                 int64
Year                int64
Active             object
dtype: object

In [8]:
def convert_currency(var):
    '''
    * Remove $
    * Remove commas
    * Convert to float
    '''
    return float(var.replace('$', '').replace(',', ''))

In [9]:
def convert_percentage(var):
    '''
    * Remove %
    * Convert to float
    * calculate %
    '''
    return float(var.replace('%', ''))/100

In [10]:
sales_data['2016'] = sales_data['2016'].apply(convert_currency)
sales_data['2017'] = sales_data['2017'].apply(convert_currency)
sales_data.sample(5)

Unnamed: 0,Customer Number,Customer Name,2016,2017,Percent Growth,Jan Units,Month,Day,Year,Active
2,23477,ACME Industrial,50000.0,62500.0,25.00%,125,3,29,2016,Y
0,10002,Quest Industries,125000.0,162500.0,30.00%,500,1,10,2015,Y
1,552278,Smith Plumbing,920000.0,1012000.0,10.00%,700,6,15,2014,Y
4,651029,Harbor Co,15000.0,12750.0,-15.00%,Closed,2,2,2014,N
3,24900,Brekke LTD,350000.0,490000.0,4.00%,75,10,27,2015,Y


In [11]:
# Using lambda function

#sales_data['2016'] = sales_data['2016'].apply(lambda x: x.replace('$', '').replace(',', '')).astype('float')


In [12]:
sales_data.sample(5)

Unnamed: 0,Customer Number,Customer Name,2016,2017,Percent Growth,Jan Units,Month,Day,Year,Active
0,10002,Quest Industries,125000.0,162500.0,30.00%,500,1,10,2015,Y
4,651029,Harbor Co,15000.0,12750.0,-15.00%,Closed,2,2,2014,N
3,24900,Brekke LTD,350000.0,490000.0,4.00%,75,10,27,2015,Y
2,23477,ACME Industrial,50000.0,62500.0,25.00%,125,3,29,2016,Y
1,552278,Smith Plumbing,920000.0,1012000.0,10.00%,700,6,15,2014,Y


In [13]:
sales_data['Percent Growth'] = sales_data['Percent Growth'].apply(convert_percentage)

In [14]:
sales_data.sample(5)

Unnamed: 0,Customer Number,Customer Name,2016,2017,Percent Growth,Jan Units,Month,Day,Year,Active
2,23477,ACME Industrial,50000.0,62500.0,0.25,125,3,29,2016,Y
3,24900,Brekke LTD,350000.0,490000.0,0.04,75,10,27,2015,Y
0,10002,Quest Industries,125000.0,162500.0,0.3,500,1,10,2015,Y
4,651029,Harbor Co,15000.0,12750.0,-0.15,Closed,2,2,2014,N
1,552278,Smith Plumbing,920000.0,1012000.0,0.1,700,6,15,2014,Y


In [15]:
sales_data['Jan Units'] = pd.to_numeric(sales_data['Jan Units'], errors='coerce').fillna(0)

In [16]:
sales_data.sample(5)

Unnamed: 0,Customer Number,Customer Name,2016,2017,Percent Growth,Jan Units,Month,Day,Year,Active
4,651029,Harbor Co,15000.0,12750.0,-0.15,0.0,2,2,2014,N
3,24900,Brekke LTD,350000.0,490000.0,0.04,75.0,10,27,2015,Y
1,552278,Smith Plumbing,920000.0,1012000.0,0.1,700.0,6,15,2014,Y
0,10002,Quest Industries,125000.0,162500.0,0.3,500.0,1,10,2015,Y
2,23477,ACME Industrial,50000.0,62500.0,0.25,125.0,3,29,2016,Y


In [17]:
sales_data['Active'] = sales_data['Active'].astype('bool')

In [18]:
sales_data['Start Date'] = pd.to_datetime(sales_data[['Month', 'Day', 'Year']])

In [19]:
sales_data.sample(5)

Unnamed: 0,Customer Number,Customer Name,2016,2017,Percent Growth,Jan Units,Month,Day,Year,Active,Start Date
2,23477,ACME Industrial,50000.0,62500.0,0.25,125.0,3,29,2016,True,2016-03-29
3,24900,Brekke LTD,350000.0,490000.0,0.04,75.0,10,27,2015,True,2015-10-27
0,10002,Quest Industries,125000.0,162500.0,0.3,500.0,1,10,2015,True,2015-01-10
1,552278,Smith Plumbing,920000.0,1012000.0,0.1,700.0,6,15,2014,True,2014-06-15
4,651029,Harbor Co,15000.0,12750.0,-0.15,0.0,2,2,2014,True,2014-02-02


In [20]:
sales_data.dtypes

Customer Number             int64
Customer Name              object
2016                      float64
2017                      float64
Percent Growth            float64
Jan Units                 float64
Month                       int64
Day                         int64
Year                        int64
Active                       bool
Start Date         datetime64[ns]
dtype: object