In [1]:
import pandas as pd
# Show every column when a DataFrame is displayed (None removes the limit).
# Use the canonical option name rather than relying on regex matching.
pd.set_option("display.max_columns", None)

In [2]:
# Cap displayed rows at 16; use the canonical option name rather than
# relying on regex matching of "display.max.rows".
pd.set_option("display.max_rows", 16)

In [3]:
# Display floating-point values with two decimal places.
pd.options.display.precision = 2

# Text data types

There are two ways to store text data in pandas:

+ `object`-dtype NumPy array.

+ StringDtype extension type.

We recommend using **StringDtype** to store text data.

Prior to pandas 1.0, object dtype was the only option.

In [4]:
import pandas as pd
# Load the car dataset with pandas' default dtype inference.
cardata_path = "../data/cardata.csv"
df = pd.read_csv(cardata_path)

In [5]:
# Transpose the first three rows so each column reads as a line.
df.head(3).transpose()

Unnamed: 0,0,1,2
Make,BMW,BMW,BMW
Model,1 Series M,1 Series,1 Series
Year,2011,2011,2011
Engine Fuel Type,premium unleaded (required),premium unleaded (required),premium unleaded (required)
Engine HP,335.0,300.0,300.0
Engine Cylinders,6.0,6.0,6.0
Transmission Type,MANUAL,MANUAL,MANUAL
Driven_Wheels,rear wheel drive,rear wheel drive,rear wheel drive
Number of Doors,2.0,2.0,2.0
Market Category,"Factory Tuner,Luxury,High-Performance","Luxury,Performance","Luxury,High-Performance"


In [6]:
# Inspect the dtype pandas assigned to each column on load.
df.dtypes

Make                  object
Model                 object
Year                   int64
Engine Fuel Type      object
Engine HP            float64
Engine Cylinders     float64
Transmission Type     object
Driven_Wheels         object
Number of Doors      float64
Market Category       object
Vehicle Size          object
Vehicle Style         object
highway MPG            int64
city mpg               int64
Popularity             int64
MSRP                   int64
dtype: object

In [7]:
# Convert the Model column to the dedicated string extension dtype.
df = df.astype({'Model': "string"})

In [8]:
# Re-check the dtypes after converting the Model column.
df.dtypes

Make                         object
Model                string[python]
Year                          int64
Engine Fuel Type             object
Engine HP                   float64
Engine Cylinders            float64
Transmission Type            object
Driven_Wheels                object
Number of Doors             float64
Market Category              object
Vehicle Size                 object
Vehicle Style                object
highway MPG                   int64
city mpg                      int64
Popularity                    int64
MSRP                          int64
dtype: object

###### Specify the dtype when importing

In [9]:
import pandas as pd
# Reload the dataset, declaring the text columns as string dtype up front.
text_column_dtypes = {'Make': "string", "Model": "string"}
df = pd.read_csv("../data/cardata.csv", dtype=text_column_dtypes)

In [10]:
# Check the dtypes after reloading with an explicit dtype mapping.
df.dtypes

Make                 string[python]
Model                string[python]
Year                          int64
Engine Fuel Type             object
Engine HP                   float64
Engine Cylinders            float64
Transmission Type            object
Driven_Wheels                object
Number of Doors             float64
Market Category              object
Vehicle Size                 object
Vehicle Style                object
highway MPG                   int64
city mpg                      int64
Popularity                    int64
MSRP                          int64
dtype: object

# String methods

In [11]:
# First five rows, restricted to the first eight columns.
df.head(5).iloc[:, :8]

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive


In [12]:
# Lower-case every transmission type.
transmission_lowered = df['Transmission Type'].str.lower()
df['Transmission Type'] = transmission_lowered

In [13]:
# Title-case every engine fuel type.
fuel_type_titled = df['Engine Fuel Type'].str.title()
df['Engine Fuel Type'] = fuel_type_titled

In [14]:
# Same five-row, eight-column preview, now showing the case changes.
df.head(5).iloc[:, :8]

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels
0,BMW,1 Series M,2011,Premium Unleaded (Required),335.0,6.0,manual,rear wheel drive
1,BMW,1 Series,2011,Premium Unleaded (Required),300.0,6.0,manual,rear wheel drive
2,BMW,1 Series,2011,Premium Unleaded (Required),300.0,6.0,manual,rear wheel drive
3,BMW,1 Series,2011,Premium Unleaded (Required),230.0,6.0,manual,rear wheel drive
4,BMW,1 Series,2011,Premium Unleaded (Required),230.0,6.0,manual,rear wheel drive


In [15]:
# Add leading/trailing whitespace to the first three makes so there is
# something for the strip methods to clean up.
df.loc[0,'Make'] = ' BMW'
df.loc[1,'Make'] = ' BMW '
df.loc[2,'Make'] = ' BMW '

# Display all three modified rows (0, 1 and 2).
df.loc[0,'Make'],df.loc[1,'Make'],df.loc[2,'Make']

(' BMW', ' BMW ', ' BMW ')

In [16]:
# Remove leading whitespace from every make.
make_left_trimmed = df['Make'].str.lstrip()
df['Make'] = make_left_trimmed

In [17]:
# Check rows 0, 1 and 2: leading spaces are gone, trailing spaces remain.
df.loc[0,'Make'],df.loc[1,'Make'],df.loc[2,'Make']

('BMW', 'BMW ', 'BMW ')

In [18]:
# Remove trailing whitespace from every make.
make_right_trimmed = df['Make'].str.rstrip()
df['Make'] = make_right_trimmed

In [19]:
# Check rows 0, 1 and 2: all surrounding whitespace is now removed.
df.loc[0,'Make'],df.loc[1,'Make'],df.loc[2,'Make']

('BMW', 'BMW', 'BMW')

###### Pattern matching

In [20]:
# Rows whose fuel type contains "Regular". na=False makes missing fuel types
# count as non-matches, giving a boolean mask without comparing to True.
is_regular_fuel = df['Engine Fuel Type'].str.contains("Regular", na=False)
df.loc[is_regular_fuel,
       ['Make','Model','Year','Engine Fuel Type']].head(5)

Unnamed: 0,Make,Model,Year,Engine Fuel Type
17,Audi,100,1992,Regular Unleaded
18,Audi,100,1992,Regular Unleaded
19,Audi,100,1992,Regular Unleaded
20,Audi,100,1992,Regular Unleaded
21,Audi,100,1992,Regular Unleaded


In [21]:
# Preview the columns by label instead of by position, so the cell keeps
# showing the same columns even if the column order changes.
df[['Model', 'Year', 'Engine Fuel Type', 'Driven_Wheels']].head(5)

Unnamed: 0,Model,Year,Engine Fuel Type,Driven_Wheels
0,1 Series M,2011,Premium Unleaded (Required),rear wheel drive
1,1 Series,2011,Premium Unleaded (Required),rear wheel drive
2,1 Series,2011,Premium Unleaded (Required),rear wheel drive
3,1 Series,2011,Premium Unleaded (Required),rear wheel drive
4,1 Series,2011,Premium Unleaded (Required),rear wheel drive


In [22]:
# Drop the "wheel drive" suffix, then trim the whitespace it leaves behind.
wheels_without_suffix = df["Driven_Wheels"].str.replace("wheel drive", "")
df["Driven_Wheels"] = wheels_without_suffix.str.strip()

In [23]:
# Preview the same columns by label to confirm the shortened drive values.
df[['Model', 'Year', 'Engine Fuel Type', 'Driven_Wheels']].head(5)

Unnamed: 0,Model,Year,Engine Fuel Type,Driven_Wheels
0,1 Series M,2011,Premium Unleaded (Required),rear
1,1 Series,2011,Premium Unleaded (Required),rear
2,1 Series,2011,Premium Unleaded (Required),rear
3,1 Series,2011,Premium Unleaded (Required),rear
4,1 Series,2011,Premium Unleaded (Required),rear


In [24]:
# List the current column labels (mixed case, some containing spaces).
df.columns

Index(['Make', 'Model', 'Year', 'Engine Fuel Type', 'Engine HP',
       'Engine Cylinders', 'Transmission Type', 'Driven_Wheels',
       'Number of Doors', 'Market Category', 'Vehicle Size', 'Vehicle Style',
       'highway MPG', 'city mpg', 'Popularity', 'MSRP'],
      dtype='object')

In [25]:
# Normalise the column labels step by step: trim, lower-case, then replace
# spaces with underscores.
trimmed_labels = df.columns.str.strip()
lowered_labels = trimmed_labels.str.lower()
df.columns = lowered_labels.str.replace(" ", "_")

In [26]:
# List the column labels again to confirm the normalisation.
df.columns

Index(['make', 'model', 'year', 'engine_fuel_type', 'engine_hp',
       'engine_cylinders', 'transmission_type', 'driven_wheels',
       'number_of_doors', 'market_category', 'vehicle_size', 'vehicle_style',
       'highway_mpg', 'city_mpg', 'popularity', 'msrp'],
      dtype='object')

In [27]:
# Show the first three rows of the cleaned frame.
df.head(3)

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,msrp
0,BMW,1 Series M,2011,Premium Unleaded (Required),335.0,6.0,manual,rear,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,Premium Unleaded (Required),300.0,6.0,manual,rear,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,Premium Unleaded (Required),300.0,6.0,manual,rear,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
