## Import required packages

In [214]:
import pandas as pd
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder


## Read Input data from data directory

In [215]:
# Data to be predicted
maint="high"
doors='2'
persons=None
lug_boot="big"
safety="high"
class_value="good"


inputDf=pd.read_csv("./data/car.data",index_col=False)

inputDf


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class_value
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


## Explore Data

In [216]:
inputDf.shape
inputDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   buying       1728 non-null   object
 1   maint        1728 non-null   object
 2   doors        1728 non-null   object
 3   persons      1728 non-null   object
 4   lug_boot     1728 non-null   object
 5   safety       1728 non-null   object
 6   class_value  1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


As per this, we don't have any null values in the input dataset

In [217]:
inputDf['class_value'].value_counts()

unacc    1210
acc       384
good       69
vgood      65
Name: class_value, dtype: int64

## Transform data


In [218]:
# Split input and output features

x_data=inputDf.drop(['buying'],axis=1)
y_data=inputDf['buying']

# Split data into train and test. Test size is 0.2

X_train, X_test, y_train, y_test=train_test_split(x_data,y_data,test_size=0.2)

# Encode text classifcation to numeric classication

enc=ce.OrdinalEncoder(cols=['maint','doors','persons','lug_boot','safety','class_value'])

X_train_enc=enc.fit_transform(X_train)
X_test_enc=enc.transform(X_test)

X_test_enc
lblEnc=LabelEncoder().fit(y_train)
y_train_enc=lblEnc.fit_transform(y_train)
y_test_enc=lblEnc.transform(y_test)



## Model training

Train the input data

In [219]:
xgbclf=XGBClassifier(n_estimators=100,max_depth=3,learning_rate=0.01,objective='binary:logistic')
xgbclf.fit(X_train_enc,y_train_enc)


XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.01, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=0, reg_alpha=0, ...)

## Model Prediction

In [220]:
predOutput=xgbclf.predict(X_test_enc)

## Model metrics

In [221]:
accuracy_score(y_test_enc,predOutput)

0.3236994219653179

Predict for 

Maintenance = High
Number of doors = 4
Lug Boot Size = Big
Safety = High
Class Value = Good

In [222]:
testDf=pd.DataFrame(columns=['maint','doors','persons','lug_boot','safety','class_value'])
testDf.loc[0]=[maint,doors,persons,lug_boot,safety,class_value]
enc.mapping
testDf=enc.transform(testDf)
predOutput=xgbclf.predict(testDf)
result=lblEnc.inverse_transform(predOutput)[0]
result


'low'

## Final Result : low