In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

In [2]:
tips = sns.load_dataset('tips') 

In [3]:
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [4]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [5]:
categoricals = ['smoker', 'day', 'time']

In [6]:
X = tips.copy()

In [7]:
for c in categoricals:
    X[c] = X[c].cat.codes

In [8]:
X


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,1,3,1,2
1,10.34,1.66,Male,1,3,1,3
2,21.01,3.50,Male,1,3,1,3
3,23.68,3.31,Male,1,3,1,2
4,24.59,3.61,Female,1,3,1,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,1,2,1,3
240,27.18,2.00,Female,0,2,1,2
241,22.67,2.00,Male,0,2,1,2
242,17.82,1.75,Male,1,2,1,2


In [9]:
# Problems here:
# The values here indicate weights

# Solution
# use a code that does not indicate weight
# one hot encoding

In [10]:
from sklearn.preprocessing import OneHotEncoder

In [11]:
onehot = OneHotEncoder(drop = 'first')
data = [['Male', 1], ['Female', 3], ['Female', 2]]
onehot.fit_transform(data).toarray()
onehot.get_feature_names(['gender', 'group'])



array(['gender_Male', 'group_2', 'group_3'], dtype=object)

In [12]:
# problem:
# need to indicate what are the feature names
# sometimes we might have numeric data, which model will not be able to skip

In [13]:
from sklearn.feature_extraction import DictVectorizer
# input is a dict

In [14]:
measurements = [{'city' : 'Dubai', 'temperature' : 33.},
              {'city' : 'London', 'temperature' : 12.},
              {'city' : 'San Francisco', 'temperature' : 18.},]

In [15]:
dv = DictVectorizer()
dv.fit_transform(measurements).toarray()

array([[ 1.,  0.,  0., 33.],
       [ 0.,  1.,  0., 12.],
       [ 0.,  0.,  1., 18.]])

In [16]:
dv.get_feature_names()

['city=Dubai', 'city=London', 'city=San Francisco', 'temperature']

In [17]:
tips_dict = tips.to_dict(orient = 'records')
tips_dict
tips_vec = dv.fit_transform(tips_dict).toarray()

In [18]:
tips_vec

array([[ 0.  ,  0.  ,  1.  , ...,  0.  ,  1.01, 16.99],
       [ 0.  ,  0.  ,  1.  , ...,  0.  ,  1.66, 10.34],
       [ 0.  ,  0.  ,  1.  , ...,  0.  ,  3.5 , 21.01],
       ...,
       [ 0.  ,  1.  ,  0.  , ...,  0.  ,  2.  , 22.67],
       [ 0.  ,  1.  ,  0.  , ...,  0.  ,  1.75, 17.82],
       [ 0.  ,  0.  ,  0.  , ...,  0.  ,  3.  , 18.78]])

In [19]:
dv.get_feature_names()

['day=Fri',
 'day=Sat',
 'day=Sun',
 'day=Thur',
 'sex=Female',
 'sex=Male',
 'size',
 'smoker=No',
 'smoker=Yes',
 'time=Dinner',
 'time=Lunch',
 'tip',
 'total_bill']

In [20]:
onehot = OneHotEncoder(drop = 'first')
onehot.fit_transform(tips[categoricals]).toarray()

array([[0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       ...,
       [1., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

In [21]:
onehot.get_feature_names(categoricals)



array(['smoker_Yes', 'day_Sat', 'day_Sun', 'day_Thur', 'time_Lunch'],
      dtype=object)

In [22]:
np.concatenate( (
    tips.select_dtypes(exclude='category').to_numpy(),
    onehot.fit_transform(tips[categoricals]).toarray()
), axis =1
)

array([[16.99,  1.01,  2.  , ...,  1.  ,  0.  ,  0.  ],
       [10.34,  1.66,  3.  , ...,  1.  ,  0.  ,  0.  ],
       [21.01,  3.5 ,  3.  , ...,  1.  ,  0.  ,  0.  ],
       ...,
       [22.67,  2.  ,  2.  , ...,  0.  ,  0.  ,  0.  ],
       [17.82,  1.75,  2.  , ...,  0.  ,  0.  ,  0.  ],
       [18.78,  3.  ,  2.  , ...,  0.  ,  1.  ,  0.  ]])

In [23]:
# pandas dummies
x = pd.get_dummies(tips, drop_first=True)
x

Unnamed: 0,total_bill,tip,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,1.01,2,1,1,0,0,1,1
1,10.34,1.66,3,0,1,0,0,1,1
2,21.01,3.50,3,0,1,0,0,1,1
3,23.68,3.31,2,0,1,0,0,1,1
4,24.59,3.61,4,1,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,0,1,0,1,0,1
240,27.18,2.00,2,1,0,0,1,0,1
241,22.67,2.00,2,0,0,0,1,0,1
242,17.82,1.75,2,0,1,0,1,0,1


# Label Encoding


In [24]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [25]:
le.fit_transform(list('ababcabc'))

array([0, 1, 0, 1, 2, 0, 1, 2], dtype=int64)

In [26]:
le.classes_

array(['a', 'b', 'c'], dtype='<U1')

In [27]:
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
lb.fit_transform(list('ababcabcdb'))

array([[1, 0, 0, 0],
       [0, 1, 0, 0],
       [1, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [1, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 1],
       [0, 1, 0, 0]])

# Caveats and Pitfalls

In [28]:
# Ordinals- Anything that is ordered and categorical 
from random import shuffle
severity = 'low medium high'.split()
s = severity*2
shuffle(s)
s= pd.Series(s)
s

0      high
1       low
2    medium
3    medium
4       low
5      high
dtype: object

In [29]:
c = pd.Categorical(s, ordered=True, categories = severity)

In [30]:
c

['high', 'low', 'medium', 'medium', 'low', 'high']
Categories (3, object): ['low' < 'medium' < 'high']

In [31]:
c.codes # ) might cancel the impact of low

array([2, 0, 1, 1, 0, 2], dtype=int8)

In [32]:
c.codes + 1

array([3, 1, 2, 2, 1, 3], dtype=int8)

In [33]:
# adult income dataset
columns = ''' age workclass fnlwgt education education-num marital-status occupation relationship race sex
                capital-gain capital-loss hours-per-week native-country income'''.split()

In [34]:
columns

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country',
 'income']

In [35]:
income = pd.read_csv('adult.data', sep = ', ', engine='python', names = columns, na_values='?')
income.dropna(inplace = True)
income.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [36]:
from IPython.display import display, HTML

with pd.option_context('display.max_colwidth', 0):
    display(income.select_dtypes('O').apply(lambda s: ', '.join(s.unique())).reset_index())

Unnamed: 0,index,0
0,workclass,"State-gov, Self-emp-not-inc, Private, Federal-gov, Local-gov, Self-emp-inc, Without-pay"
1,education,"Bachelors, HS-grad, 11th, Masters, 9th, Some-college, Assoc-acdm, 7th-8th, Doctorate, Assoc-voc, Prof-school, 5th-6th, 10th, Preschool, 12th, 1st-4th"
2,marital-status,"Never-married, Married-civ-spouse, Divorced, Married-spouse-absent, Separated, Married-AF-spouse, Widowed"
3,occupation,"Adm-clerical, Exec-managerial, Handlers-cleaners, Prof-specialty, Other-service, Sales, Transport-moving, Farming-fishing, Machine-op-inspct, Tech-support, Craft-repair, Protective-serv, Armed-Forces, Priv-house-serv"
4,relationship,"Not-in-family, Husband, Wife, Own-child, Unmarried, Other-relative"
5,race,"White, Black, Asian-Pac-Islander, Amer-Indian-Eskimo, Other"
6,sex,"Male, Female"
7,native-country,"United-States, Cuba, Jamaica, India, Mexico, Puerto-Rico, Honduras, England, Canada, Germany, Iran, Philippines, Poland, Columbia, Cambodia, Thailand, Ecuador, Laos, Taiwan, Haiti, Portugal, Dominican-Republic, El-Salvador, France, Guatemala, Italy, China, South, Japan, Yugoslavia, Peru, Outlying-US(Guam-USVI-etc), Scotland, Trinadad&Tobago, Greece, Nicaragua, Vietnam, Hong, Ireland, Hungary, Holand-Netherlands"
8,income,"<=50K, >50K"


In [37]:
def show_categories(df):
    from IPython.display import display, HTML
    with pd.option_context('display.max_colwidth', 0):
        display(df.select_dtypes('O').apply(lambda s: ', '.join(s.unique())).reset_index())

In [38]:
show_categories(income)

Unnamed: 0,index,0
0,workclass,"State-gov, Self-emp-not-inc, Private, Federal-gov, Local-gov, Self-emp-inc, Without-pay"
1,education,"Bachelors, HS-grad, 11th, Masters, 9th, Some-college, Assoc-acdm, 7th-8th, Doctorate, Assoc-voc, Prof-school, 5th-6th, 10th, Preschool, 12th, 1st-4th"
2,marital-status,"Never-married, Married-civ-spouse, Divorced, Married-spouse-absent, Separated, Married-AF-spouse, Widowed"
3,occupation,"Adm-clerical, Exec-managerial, Handlers-cleaners, Prof-specialty, Other-service, Sales, Transport-moving, Farming-fishing, Machine-op-inspct, Tech-support, Craft-repair, Protective-serv, Armed-Forces, Priv-house-serv"
4,relationship,"Not-in-family, Husband, Wife, Own-child, Unmarried, Other-relative"
5,race,"White, Black, Asian-Pac-Islander, Amer-Indian-Eskimo, Other"
6,sex,"Male, Female"
7,native-country,"United-States, Cuba, Jamaica, India, Mexico, Puerto-Rico, Honduras, England, Canada, Germany, Iran, Philippines, Poland, Columbia, Cambodia, Thailand, Ecuador, Laos, Taiwan, Haiti, Portugal, Dominican-Republic, El-Salvador, France, Guatemala, Italy, China, South, Japan, Yugoslavia, Peru, Outlying-US(Guam-USVI-etc), Scotland, Trinadad&Tobago, Greece, Nicaragua, Vietnam, Hong, Ireland, Hungary, Holand-Netherlands"
8,income,"<=50K, >50K"


In [39]:
levels = ['Preschool', '1st-4th', '5th-6th', '7th-8th',
         '9th', '10th', '11th', '12th', 'HS-grad',
         'Some-college', 'Assoc-voc', 'Assoc-acdm',
         'Bachelors', 'Masters',
         'Prof-school',
         'Doctorate']

In [40]:
edu_cat = pd.Categorical(income.education, ordered = True, categories = levels)
edu_cat

['Bachelors', 'Bachelors', 'HS-grad', '11th', 'Bachelors', ..., 'Assoc-acdm', 'HS-grad', 'HS-grad', 'HS-grad', 'HS-grad']
Length: 30162
Categories (16, object): ['Preschool' < '1st-4th' < '5th-6th' < '7th-8th' ... 'Bachelors' < 'Masters' < 'Prof-school' < 'Doctorate']

In [41]:
edu_cat.codes + 1

array([13, 13,  9, ...,  9,  9,  9], dtype=int8)

In [42]:
edu = income[['education', 'education-num']].drop_duplicates()

In [43]:
edu

Unnamed: 0,education,education-num
0,Bachelors,13
2,HS-grad,9
3,11th,7
5,Masters,14
6,9th,5
10,Some-college,10
13,Assoc-acdm,12
15,7th-8th,4
20,Doctorate,16
48,Assoc-voc,11


In [44]:
# Auto-Mpg dataset
mpg = sns.load_dataset('mpg')
mpg.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [45]:
mpg['model_year'].value_counts()

73    40
78    36
76    34
82    31
75    30
70    29
79    29
80    29
81    29
71    28
72    28
77    28
74    27
Name: model_year, dtype: int64

In [46]:
mpg.model_year = mpg.model_year.astype(str)
pd.get_dummies(mpg, drop_first = True)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year_71,model_year_72,model_year_73,model_year_74,...,name_volvo 145e (sw),name_volvo 244dl,name_volvo 245,name_volvo 264gl,name_volvo diesel,name_vw dasher (diesel),name_vw pickup,name_vw rabbit,name_vw rabbit c (diesel),name_vw rabbit custom
0,18.0,8,307.0,130.0,3504,12.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,15.0,8,350.0,165.0,3693,11.5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,18.0,8,318.0,150.0,3436,11.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16.0,8,304.0,150.0,3433,12.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,17.0,8,302.0,140.0,3449,10.5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
394,44.0,4,97.0,52.0,2130,24.6,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
395,32.0,4,135.0,84.0,2295,11.6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
396,28.0,4,120.0,79.0,2625,18.6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
headers = '''symboling normalized_losses make fuel_type aspiration num_doors
          body_style drive_wheels engine_location wheel_base length width
          height curb_weight engine_type num_cylinders engine_size
          fuel_system bore stroke compression_ratio horsepower peak_rpm
          city_mpg highway_mpg price'''.split()
# Read in the CSV file and convert "?" to NaN
autos = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",
                  header=None, names=headers, na_values="?" )
autos.dropna(inplace=True)
autos.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0
6,1,158.0,audi,gas,std,four,sedan,fwd,front,105.8,...,136,mpfi,3.19,3.4,8.5,110.0,5500.0,19,25,17710.0
8,1,158.0,audi,gas,turbo,four,sedan,fwd,front,105.8,...,131,mpfi,3.13,3.4,8.3,140.0,5500.0,17,20,23875.0
10,2,192.0,bmw,gas,std,two,sedan,rwd,front,101.2,...,108,mpfi,3.5,2.8,8.8,101.0,5800.0,23,29,16430.0


In [48]:
autos.select_dtypes('O')

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi
6,audi,gas,std,four,sedan,fwd,front,ohc,five,mpfi
8,audi,gas,turbo,four,sedan,fwd,front,ohc,five,mpfi
10,bmw,gas,std,two,sedan,rwd,front,ohc,four,mpfi
...,...,...,...,...,...,...,...,...,...,...
200,volvo,gas,std,four,sedan,rwd,front,ohc,four,mpfi
201,volvo,gas,turbo,four,sedan,rwd,front,ohc,four,mpfi
202,volvo,gas,std,four,sedan,rwd,front,ohcv,six,mpfi
203,volvo,diesel,turbo,four,sedan,rwd,front,ohc,six,idi


In [49]:
show_categories(autos)

Unnamed: 0,index,0
0,make,"audi, bmw, chevrolet, dodge, honda, jaguar, mazda, mercedes-benz, mitsubishi, nissan, peugot, plymouth, porsche, saab, subaru, toyota, volkswagen, volvo"
1,fuel_type,"gas, diesel"
2,aspiration,"std, turbo"
3,num_doors,"four, two"
4,body_style,"sedan, hatchback, wagon, hardtop, convertible"
5,drive_wheels,"fwd, 4wd, rwd"
6,engine_location,front
7,engine_type,"ohc, l, dohc, ohcv, ohcf"
8,num_cylinders,"four, five, six, three, eight"
9,fuel_system,"mpfi, 2bbl, mfi, 1bbl, idi, spdi"


In [50]:
cat2nums = {"num_doors" : {"four":4, "two" : 2}, 
            "num_cylinders": {"four":4, "six":6, "five":5, "eight":8, "two":2, "twelve":12, "three":3}}

In [51]:
autos.replace(cat2nums, inplace = True)

In [52]:
autos

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
3,2,164.0,audi,gas,std,4,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,4,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115.0,5500.0,18,22,17450.0
6,1,158.0,audi,gas,std,4,sedan,fwd,front,105.8,...,136,mpfi,3.19,3.40,8.5,110.0,5500.0,19,25,17710.0
8,1,158.0,audi,gas,turbo,4,sedan,fwd,front,105.8,...,131,mpfi,3.13,3.40,8.3,140.0,5500.0,17,20,23875.0
10,2,192.0,bmw,gas,std,2,sedan,rwd,front,101.2,...,108,mpfi,3.50,2.80,8.8,101.0,5800.0,23,29,16430.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95.0,volvo,gas,std,4,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114.0,5400.0,23,28,16845.0
201,-1,95.0,volvo,gas,turbo,4,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160.0,5300.0,19,25,19045.0
202,-1,95.0,volvo,gas,std,4,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134.0,5500.0,18,23,21485.0
203,-1,95.0,volvo,diesel,turbo,4,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106.0,4800.0,26,27,22470.0
