# Introduction


dtypes are not native to pandas. They are a result of pandas' close architectural coupling to
NumPy.

The dtype of a column does not in any way have to correlate with the Python type of the objects
contained in the column.



In [1]:
import pandas as pd

# Cast a float Series to object dtype; only the reported dtype changes (see output).
pd.Series([1.,2.,3.,4.,5.]).astype(object)

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
dtype: object

Here we have a pd.Series of floats. Its dtype is initially float64.

Then we use astype to "cast" it to object

The dtype is now object, but the elements of the Series are still Python floats. This makes sense
once you recall that in Python everything is an object, so any value can be upcast to object.

In [3]:
# Check the Python type of the first element after the cast to object dtype.
type(pd.Series([1.,2.,3.,4.,5.]).astype(object)[0])

float

Float to strings

In [4]:
# Cast the floats to str; the Series still reports dtype object (see output).
pd.Series([1.,2.,3.,4.,5.]).astype(str)


0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
dtype: object

In [5]:
# Unlike the object cast, astype(str) converts each element to a Python str.
type(pd.Series([1.,2.,3.,4.,5.]).astype(str)[0])

str

#### type of columns

In [6]:
# Three columns, one per kind of value: integers, floats and booleans.
df = pd.DataFrame(
    {
        'A': [1, 2, 3],
        'B': [1.0, 2.0, 3.0],
        'C': [True, False, True],
    }
)
df

Unnamed: 0,A,B,C
0,1,1.0,True
1,2,2.0,False
2,3,3.0,True


In [7]:
# One dtype per column; no dtype was given when the frame was built.
df.dtypes

A      int64
B    float64
C       bool
dtype: object

#### Changing data types of columns

In [9]:
 # Columns C, D and E hold strings, so they are reported as object dtype below.
 df = pd.DataFrame({'A': [1, 2, 3], 'B': [1.0, 2.0, 3.0],
 'C': ['1.1.2010', '2.1.2011', '3.1.2011'],
 'D': ['1 days', '2 days', '3 days'],
 'E': ['1', '2', '3']})

# List the dtype of every column of the new frame.
df.dtypes

A      int64
B    float64
C     object
D     object
E     object
dtype: object

In [10]:
# Convert integer column A to float (float64 in the output).
df['A'].astype('float')

0    1.0
1    2.0
2    3.0
Name: A, dtype: float64

#### Changing the type to numeric


In [11]:
# Column E holds numeric strings; to_numeric parses them (int64 in the output).
pd.to_numeric(df['E'])

0    1
1    2
2    3
Name: E, dtype: int64

##### NOTE: By default, pd.to_numeric raises an error if an input cannot be converted to a number. You can change that behavior by using the errors parameter.

In [12]:
# Return the original input unchanged if it cannot be converted.
# errors='ignore' is deprecated since pandas 2.2, so the conversion failure is
# caught explicitly instead; the displayed result is the same object Series.
raw_values = pd.Series(['1', '2', 'a'])
try:
    converted = pd.to_numeric(raw_values)
except (ValueError, TypeError):
    # At least one value could not be parsed: fall back to the untouched input.
    converted = raw_values
converted


0    1
1    2
2    a
dtype: object

In [13]:
# errors='coerce': a value that cannot be converted becomes NaN (float64 result below)
pd.to_numeric(pd.Series(['1', '2', 'a']), errors='coerce')


0    1.0
1    2.0
2    NaN
dtype: float64

##### To find all rows whose input cannot be converted to a number, use boolean indexing with isnull:

In [14]:
 # Column A mixes an integer with strings that cannot be parsed as numbers.
 # NOTE(review): the leading space on these lines looks accidental; it presumably
 # only runs because IPython strips leading cell indentation — confirm and remove.
 df = pd.DataFrame({'A': [1, 'x', 'z'],
 'B': [1.0, 2.0, 3.0],
 'C': [True, False, True]})
    
# Coerce failures to NaN, then flag them: True marks a row whose A is not numeric.
pd.to_numeric(df.A, errors='coerce').isnull()

0    False
1     True
2     True
Name: A, dtype: bool

In [15]:
# Use the boolean mask to keep only the rows whose A value failed to convert.
df[pd.to_numeric(df.A, errors='coerce').isnull()]


Unnamed: 0,A,B,C
1,x,2.0,False
2,z,3.0,True


#### Changing the type to datetime

In [17]:
 # dayfirst=True reads '2.1.2011' as day 2 of month 1 (January 2 in the output).
 pd.to_datetime('2.1.2011', dayfirst=True)

Timestamp('2011-01-02 00:00:00')

##### Note that with dayfirst=True, 2.1.2011 is parsed as January 2, 2011. Without the dayfirst parameter it would be read month-first, as February 1, 2011.

#### Changing the type to timedelta

`pd.to_timedelta` converts strings such as `'1 days'` into timedelta values.


### Selecting columns based on dtype
The select_dtypes method can be used to select columns based on dtype.

In [21]:
# One column per kind of value: integers, floats, strings and booleans.
df = pd.DataFrame(
    {
        'A': [1, 2, 3],
        'B': [1.0, 2.0, 3.0],
        'C': ['a', 'b', 'c'],
        'D': [True, False, True],
    }
)
df

Unnamed: 0,A,B,C,D
0,1,1.0,a,True
1,2,2.0,b,False
2,3,3.0,c,True


##### With include and exclude parameters you can specify which types you want:

In [22]:
 df.select_dtypes(include=['number']) # a list of dtypes; the pandas docs also accept a single scalar such as 'number'

Unnamed: 0,A,B
0,1,1.0
1,2,2.0
2,3,3.0


In [23]:
# Several dtypes can be combined in one include list: here numbers and booleans
df.select_dtypes(include=['number', 'bool'])

Unnamed: 0,A,B,D
0,1,1.0,True
1,2,2.0,False
2,3,3.0,True


In [24]:
# include and exclude work together: numbers and booleans, minus the int64 column
df.select_dtypes(include=['number', 'bool'], exclude=['int64'])

Unnamed: 0,B,D
0,1.0,True
1,2.0,False
2,3.0,True
