In [24]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
from sklearn.preprocessing import OneHotEncoder

In [25]:
# Load the raw dataset; the file carries no header row, so pandas assigns
# integer column labels.
data = pd.read_csv('./car.data', header=None)
print(data)

          0      1      2     3      4     5      6
0     vhigh  vhigh      2     2  small   low  unacc
1     vhigh  vhigh      2     2  small   med  unacc
2     vhigh  vhigh      2     2  small  high  unacc
3     vhigh  vhigh      2     2    med   low  unacc
4     vhigh  vhigh      2     2    med   med  unacc
...     ...    ...    ...   ...    ...   ...    ...
1723    low    low  5more  more    med   med   good
1724    low    low  5more  more    med  high  vgood
1725    low    low  5more  more    big   low  unacc
1726    low    low  5more  more    big   med   good
1727    low    low  5more  more    big  high  vgood

[1728 rows x 7 columns]


In [26]:
# Replace the default integer labels with descriptive column names.
column_names = ['buying', 'maint', 'doors', 'person', 'lugboot', 'safety', 'class']
data.columns = column_names
print(data)

     buying  maint  doors person lugboot safety  class
0     vhigh  vhigh      2      2   small    low  unacc
1     vhigh  vhigh      2      2   small    med  unacc
2     vhigh  vhigh      2      2   small   high  unacc
3     vhigh  vhigh      2      2     med    low  unacc
4     vhigh  vhigh      2      2     med    med  unacc
...     ...    ...    ...    ...     ...    ...    ...
1723    low    low  5more   more     med    med   good
1724    low    low  5more   more     med   high  vgood
1725    low    low  5more   more     big    low  unacc
1726    low    low  5more   more     big    med   good
1727    low    low  5more   more     big   high  vgood

[1728 rows x 7 columns]


# Data Exploration
`person` and `safety` have the strongest correlation with `class`, followed by `buying`, `maint`, and `lugboot`. `doors` has a very weak correlation with `class`.

In [34]:
# Build the profiling report for the labelled dataset and save it as an HTML file.
# Jupyter only renders the *last* expression of a cell, so a bare `profile` placed
# before `to_file(...)` displays nothing. To view the report inline, put `profile`
# alone as the final line of its own cell.
profile = ProfileReport(data)
profile.to_file("data profiling.html")

Summarize dataset: 100%|██████████| 20/20 [00:00<00:00, 29.68it/s, Completed]                    
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

In [28]:
# one hot encode the categories
# Every column name of the dataset, in file order.
columns = [
    'buying', 'maint', 'doors', 'person', 'lugboot', 'safety',
    'class',
]
def encode_and_bind(original_dataframe, feature_to_encode):
    """Append one-hot indicator columns for a single column of a DataFrame.

    Parameters
    ----------
    original_dataframe : pandas.DataFrame
        Frame that contains the column to encode. It is not modified.
    feature_to_encode : str
        Label of the column to one-hot encode.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every original column (the encoded one included)
        followed by the ``<feature>_<value>`` indicator columns produced by
        ``pd.get_dummies`` for that column.

    Raises
    ------
    KeyError
        If ``feature_to_encode`` is not a column of ``original_dataframe``.
    """
    # Pin the indicator dtype: pandas >= 2.0 defaults to bool (True/False), while
    # earlier versions produced uint8 0/1. Fixing it keeps the encoded frame
    # numeric and identical across pandas versions.
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]], dtype="uint8")
    return pd.concat([original_dataframe, dummies], axis=1)

# Every column except 'class' gets one-hot encoded; 'class' is deliberately
# left as-is.
feature_columns = [name for name in columns if name != 'class']

# Append the indicator columns one feature at a time, in column order.
df = data
for feature in feature_columns:
    df = encode_and_bind(df, feature)

# Drop the original (un-encoded) feature columns, keeping 'class'.
df = df.drop(columns=feature_columns)

# Move the 'class' column to the last position in the dataframe.
ordered_columns = [name for name in df.columns if name != 'class'] + ['class']
df = df[ordered_columns]
print(df)
print(df.columns.values)

      buying_high  buying_low  buying_med  buying_vhigh  maint_high  \
0               0           0           0             1           0   
1               0           0           0             1           0   
2               0           0           0             1           0   
3               0           0           0             1           0   
4               0           0           0             1           0   
...           ...         ...         ...           ...         ...   
1723            0           1           0             0           0   
1724            0           1           0             0           0   
1725            0           1           0             0           0   
1726            0           1           0             0           0   
1727            0           1           0             0           0   

      maint_low  maint_med  maint_vhigh  doors_2  doors_3  ...  person_2  \
0             0          0            1        1        0  ...         

In [None]:
# fit the model
# predict