In [1]:
import numpy as np
import pandas as pd

In [2]:
# Path to the sample CSV. A raw string already keeps backslashes literal, so each
# separator is written once; doubling them inside r"..." puts two backslashes
# per separator into the actual path.
location = r"E:\MYLEARN\2-ANALYTICS-DataScience\datasets\sales_data_types.csv"

In [3]:
# Load the CSV with no dtype hints, so pandas infers the type of every column,
# then preview the first rows.
df = pd.read_csv(location)
df.head()

Unnamed: 0,Customer Number,Customer Name,2016,2017,Percent Growth,Jan Units,Month,Day,Year,Active
0,10002.0,Quest Industries,"$125,000.00",$162500.00,30.00%,500,1,10,2015,Y
1,552278.0,Smith Plumbing,"$920,000.00","$101,2000.00",10.00%,700,6,15,2014,Y
2,23477.0,ACME Industrial,"$50,000.00",$62500.00,25.00%,125,3,29,2016,Y
3,24900.0,Brekke LTD,"$350,000.00",$490000.00,4.00%,75,10,27,2015,Y
4,651029.0,Harbor Co,"$15,000.00",$12750.00,-15.00%,Closed,2,2,2014,N


In [4]:
# Let’s try adding together the 2016 and 2017 sales.
# Both columns were read in as text, so `+` joins the two strings of each row
# instead of summing the amounts.

df['2016'] + df['2017']

0      $125,000.00$162500.00
1    $920,000.00$101,2000.00
2        $50,000.00$62500.00
3      $350,000.00$490000.00
4        $15,000.00$12750.00
dtype: object

In [5]:
# A clue to the problem is the line that says dtype: object.
# pandas stores text columns with the object dtype, so `+` on such columns performs
# string concatenation instead of a mathematical addition.

df.dtypes

Customer Number    float64
Customer Name       object
2016                object
2017                object
Percent Growth      object
Jan Units           object
Month                int64
Day                  int64
Year                 int64
Active              object
dtype: object

In [6]:
# After looking at the automatically assigned data types, there are several concerns:

# The Customer Number : is a float64 but it should be an int64
# The 2016 and 2017   : columns are stored as objects, not numerical values such as a float64 or 
#                       int64
# Percent Growth and Jan Units : also stored as objects not numerical values
# Month , Day and Year columns : should be converted to datetime64
# The Active column   : should be a boolean

In [7]:
# clean up these data types

In [8]:
# In order to convert data types in pandas, there are three basic options:

# Use astype() to force an appropriate dtype
# Create a custom function to convert the data
# Use pandas functions such as to_numeric() or to_datetime()

In [9]:
# astype() returns a converted copy and leaves df unchanged.
# Only the last expression of a cell is displayed, so the output shown comes from
# the 'int64' call.
df['Customer Number'].astype('int')
df['Customer Number'].astype('int32')
df['Customer Number'].astype('int64')

0     10002
1    552278
2     23477
3     24900
4    651029
Name: Customer Number, dtype: int64

In [10]:
# In order to actually change the customer number in the original dataframe, make sure to
# assign it back, since the astype() function returns a copy.
# NOTE: 'int' resolved to int32 in the run recorded here; pass 'int64' explicitly if the
# column must have the same width on every platform.

df["Customer Number"] = df['Customer Number'].astype('int')
df.dtypes

Customer Number     int32
Customer Name      object
2016               object
2017               object
Percent Growth     object
Jan Units          object
Month               int64
Day                 int64
Year                int64
Active             object
dtype: object

In [11]:
# Let’s try to do the same thing to our 2016 column and convert it to a floating point number

In [12]:
# Re-check the frame: Customer Number is now shown as integers.
df.head()

Unnamed: 0,Customer Number,Customer Name,2016,2017,Percent Growth,Jan Units,Month,Day,Year,Active
0,10002,Quest Industries,"$125,000.00",$162500.00,30.00%,500,1,10,2015,Y
1,552278,Smith Plumbing,"$920,000.00","$101,2000.00",10.00%,700,6,15,2014,Y
2,23477,ACME Industrial,"$50,000.00",$62500.00,25.00%,125,3,29,2016,Y
3,24900,Brekke LTD,"$350,000.00",$490000.00,4.00%,75,10,27,2015,Y
4,651029,Harbor Co,"$15,000.00",$12750.00,-15.00%,Closed,2,2,2014,N


In [14]:
# Does not work: the values still contain '$' and ',' characters, so they cannot be
# parsed as floats and astype() raises ValueError.
df['2016'].astype('float')

ValueError: could not convert string to float: '$15,000.00'

In [15]:
# Does not work because of the value 'Closed' in this column: astype() raises ValueError.
df['Jan Units'].astype('int')

ValueError: invalid literal for int() with base 10: 'Closed'

In [16]:
# In each of these cases, the data included values that could not be interpreted as numbers.
# In the sales columns, every value carries a currency symbol and most also contain a comma.
# In the Jan Units column, the last value is “Closed”, which is not a number,

# so we get the exception.

In [20]:
# We should give it one more try on the Active column.
# Watch the result: astype('bool') goes by truthiness, and every non-empty string
# (including 'N') is truthy.

df['Active'].astype('bool')

0    True
1    True
2    True
3    True
4    True
Name: Active, dtype: bool

In [21]:
# At first glance, this looks ok but upon closer inspection, there is a big problem. 
# All values were interpreted as True but the last customer has an Active flag of N so this does 
# not seem right.

In [22]:
# The takeaway is that astype() will only work if:

    # the data is clean and can be simply interpreted as a number
    # you want to convert a numeric value to a string object
    
# If the data has non-numeric characters or is not homogeneous, then astype() will not work.

# You will need to do additional transforms for the type change to work correctly.

In [23]:
# Custom Conversion Functions


In [24]:
# For currency conversion (of this specific data set)

def convert_currency(val):
    """
    Convert a currency value to a float.

    Strings have the ``$`` symbol and the ``,`` thousands separators removed
    before parsing. Values that are not strings (for example floats produced
    by an earlier run of this conversion) are passed straight to ``float()``,
    so applying the function to an already-converted column returns the same
    numbers instead of failing with ``AttributeError``.

    Parameters
    ----------
    val : str or number
        A value such as ``"$125,000.00"`` or ``125000.0``.

    Returns
    -------
    float
        The numeric amount.

    Raises
    ------
    ValueError
        If the cleaned text is not a valid number (e.g. ``"Closed"``).
    """
    # Only text needs cleaning; numeric input has no symbols to strip.
    if isinstance(val, str):
        val = val.replace(',', '').replace('$', '')
    return float(val)


In [25]:
# Use the pandas apply function to run convert_currency on every value in the 2016 column.
# The result is only displayed here; df is not modified until the result is assigned back.

df['2016'].apply(convert_currency)

0    125000.0
1    920000.0
2     50000.0
3    350000.0
4     15000.0
Name: 2016, dtype: float64

In [26]:
# Write the converted amounts back into the frame, one sales column at a time.
for sales_col in ('2016', '2017'):
    df[sales_col] = df[sales_col].apply(convert_currency)

In [18]:
# Re-check the frame: both sales columns now hold plain floats.
df.head()

Unnamed: 0,Customer Number,Customer Name,2016,2017,Percent Growth,Jan Units,Month,Day,Year,Active
0,10002,Quest Industries,125000.0,162500.0,30.00%,500,1,10,2015,Y
1,552278,Smith Plumbing,920000.0,1012000.0,10.00%,700,6,15,2014,Y
2,23477,ACME Industrial,50000.0,62500.0,25.00%,125,3,29,2016,Y
3,24900,Brekke LTD,350000.0,490000.0,4.00%,75,10,27,2015,Y
4,651029,Harbor Co,15000.0,12750.0,-15.00%,Closed,2,2,2014,N


In [19]:
def convert_percent(val):
    """
    Turn a percentage string such as "30.00%" into its fractional float value.

    The "%" sign is dropped, the remaining text is parsed as a float, and the
    result is divided by 100 (so "30.00%" gives 0.3).
    """
    return float(val.replace('%', '')) / 100

In [20]:
# Preview the converted percentages; the result is not assigned back, so df is unchanged.
df['Percent Growth'].apply(convert_percent)

0    0.30
1    0.10
2    0.25
3    0.04
4   -0.15
Name: Percent Growth, dtype: float64

In [21]:
# Using np.where() to convert the Active column to a boolean.

# The basic idea is to use the np.where() function to convert all “Y” values to True and
# assign False to everything else.
# NOTE: this overwrites the column in place, so re-running the cell compares the new
# booleans against "Y" and sets every row to False.

df["Active"] = np.where(df["Active"] == "Y", True, False)