In [217]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Group 9: Python & Data Science: Car evaluation dataset
*This Notebook was written by Shadrack T. John* *https://thisisshadrack.netlify.app/* <br>


In [218]:
#Lets import the dataset we've downloaded from kaggle
df=pd.read_csv('car_evaluation.csv')

In [219]:
df.head()

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


In [220]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1727 entries, 0 to 1726
Data columns (total 7 columns):
vhigh      1727 non-null object
vhigh.1    1727 non-null object
2          1727 non-null object
2.1        1727 non-null object
small      1727 non-null object
low        1727 non-null object
unacc      1727 non-null object
dtypes: object(7)
memory usage: 94.5+ KB


In [221]:
len(df)

1727

In [222]:
pd.options.display.max_rows

100

In [223]:
#Let's update up to 100 max_rows
#pd.options.display.max_rows=200

In [224]:
df.head(100)

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc
5,vhigh,vhigh,2,2,big,low,unacc
6,vhigh,vhigh,2,2,big,med,unacc
7,vhigh,vhigh,2,2,big,high,unacc
8,vhigh,vhigh,2,4,small,low,unacc
9,vhigh,vhigh,2,4,small,med,unacc


In [225]:
## So far we have a ploblem, columns that doesn't make any sense, we also have more, 5more no_of doors. 
## This can be solved by mapping.
## Mapping string values to numerical values
## Before that I should rename the columns, they need to make sense before all that

In [226]:
df=df.rename(columns= {'buying':'vhigh', 'vhigh.1': 'maintenance','2' : 'doors', '2.1': 'persons'  ,'small':'lug_boot', 'low': 'safety', 'unacc': 'class'})

In [227]:
df.head(1)

Unnamed: 0,vhigh,maintenance,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,med,unacc


In [228]:
## Let's save this dataset so far
df.to_csv('modified_car_evaluation.csv')

In [229]:
## Checking if changes applied..
df.head(3)

Unnamed: 0,vhigh,maintenance,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc


In [230]:
## Lets add the row we renamed as the first row in a new dataset
new_row = {'buying': 'vhigh', 'maintenance': 'vhigh', 'doors': 2, 'persons': '2', 'lugboot': 'small', 'safety': 'low', 'class': 'unacc'}

In [231]:
insert_index = 0 

In [232]:
new_df = pd.DataFrame([new_row])

In [233]:
## Split the original DataFrame into two parts and insert the new row in between
df = pd.concat([df.iloc[:insert_index], new_df, df.iloc[insert_index:]], ignore_index=True, sort=True)

In [234]:
## Create a DataFrame from the new row
new_df = pd.DataFrame([new_row])

In [235]:
## Concatenate the new DataFrame with the existing DataFrame
df = pd.concat([new_df, df], ignore_index=True, sort=True)

In [236]:
# Append the new row at the top
df = pd.concat([pd.DataFrame(new_row, index=[0]), df], ignore_index=True, sort=True)

In [238]:
df

Unnamed: 0,buying,class,doors,lug_boot,lugboot,maintenance,persons,safety,vhigh
0,vhigh,unacc,2,,small,vhigh,2,low,
1,vhigh,unacc,2,,small,vhigh,2,low,
2,vhigh,unacc,2,,small,vhigh,2,low,
3,,unacc,2,small,,vhigh,2,med,vhigh
4,,unacc,2,small,,vhigh,2,high,vhigh
5,,unacc,2,med,,vhigh,2,low,vhigh
6,,unacc,2,med,,vhigh,2,med,vhigh
7,,unacc,2,med,,vhigh,2,high,vhigh
8,,unacc,2,big,,vhigh,2,low,vhigh
9,,unacc,2,big,,vhigh,2,med,vhigh


In [239]:
## Defining mapping for numerical values
doors_mapping = {'2': 2,'3':3, '4':4, '5more': 5}
df['doors'] = df['doors'].map(doors_mapping)


In [240]:
## Handling '5more'
df['doors'] = df['doors'].replace('5more', 5)
## Now, the 'doors' column contains numerical values

In [241]:
df.head(200)

Unnamed: 0,buying,class,doors,lug_boot,lugboot,maintenance,persons,safety,vhigh
0,vhigh,unacc,,,small,vhigh,2,low,
1,vhigh,unacc,,,small,vhigh,2,low,
2,vhigh,unacc,,,small,vhigh,2,low,
3,,unacc,2.0,small,,vhigh,2,med,vhigh
4,,unacc,2.0,small,,vhigh,2,high,vhigh
5,,unacc,2.0,med,,vhigh,2,low,vhigh
6,,unacc,2.0,med,,vhigh,2,med,vhigh
7,,unacc,2.0,med,,vhigh,2,high,vhigh
8,,unacc,2.0,big,,vhigh,2,low,vhigh
9,,unacc,2.0,big,,vhigh,2,med,vhigh


In [242]:
## What's left to deal with is 'more' value in 'persons' column without specifying an exact number 
## can be a bit more challenging since there's no clear numerical information provided. 

In [243]:
df

Unnamed: 0,buying,class,doors,lug_boot,lugboot,maintenance,persons,safety,vhigh
0,vhigh,unacc,,,small,vhigh,2,low,
1,vhigh,unacc,,,small,vhigh,2,low,
2,vhigh,unacc,,,small,vhigh,2,low,
3,,unacc,2.0,small,,vhigh,2,med,vhigh
4,,unacc,2.0,small,,vhigh,2,high,vhigh
5,,unacc,2.0,med,,vhigh,2,low,vhigh
6,,unacc,2.0,med,,vhigh,2,med,vhigh
7,,unacc,2.0,med,,vhigh,2,high,vhigh
8,,unacc,2.0,big,,vhigh,2,low,vhigh
9,,unacc,2.0,big,,vhigh,2,med,vhigh


In [244]:
## Checking for null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1730 entries, 0 to 1729
Data columns (total 9 columns):
buying         3 non-null object
class          1730 non-null object
doors          1727 non-null float64
lug_boot       1727 non-null object
lugboot        3 non-null object
maintenance    1730 non-null object
persons        1730 non-null object
safety         1730 non-null object
vhigh          1727 non-null object
dtypes: float64(1), object(8)
memory usage: 121.7+ KB


In [245]:
df.isna()
#df.isna().count()

Unnamed: 0,buying,class,doors,lug_boot,lugboot,maintenance,persons,safety,vhigh
0,False,False,True,True,False,False,False,False,True
1,False,False,True,True,False,False,False,False,True
2,False,False,True,True,False,False,False,False,True
3,True,False,False,False,True,False,False,False,False
4,True,False,False,False,True,False,False,False,False
5,True,False,False,False,True,False,False,False,False
6,True,False,False,False,True,False,False,False,False
7,True,False,False,False,True,False,False,False,False
8,True,False,False,False,True,False,False,False,False
9,True,False,False,False,True,False,False,False,False


In [246]:
## Double checking
pd.isnull(None)

True

In [247]:
## I have no null values
## Good to go Cive :) Smile..

As all the columns are categorical, check for unique values of each column

In [82]:
for i in df.columns:
    print(df[i].unique(),"\t",df[i].nunique())
  ## Print unique values && and count..  

['vhigh' 'high' 'med' 'low'] 	 4
['vhigh' 'high' 'med' 'low'] 	 4
[2 3 4 5] 	 4
['2' '4' 'more'] 	 3
['small' 'med' 'big'] 	 3
['med' 'high' 'low'] 	 3
['unacc' 'acc' 'vgood' 'good'] 	 4


In [81]:
## ['vhigh' 'high' 'med' 'low']: This column contains 4 unique values: 
## 'vhigh,' 'high,' 'med,' and 'low,' and each of these values has a count of 4 as in the output above.
## [2 3 4 5]: This column contains 4 unique values: 2, 3, 4, and 5, and each of these values has a count of 4 pia.
## ['2' '4' 'more']: This column contains 3 unique values: '2,' '4,' and 'more,' 
##and each of these values has a count associated with it.

##['small' 'med' 'big']: This column contains 3 unique values: 'small,' 'med,' and 'big,' 
## and each of these values has a count associated with it.
## ['med' 'high' 'low']: This column contains 3 unique values: 'med,' 'high,' and 'low,' 
## and each of these values has a count associated with it.
## ['unacc' 'acc' 'vgood' 'good']: This column contains 4 unique values: 
##'unacc,' 'acc,' 'vgood,' and 'good,' and each of these values has a count of 4.

Check how these unique categories are distributed among the columns

In [83]:
for i in df.columns:
    print(df[i].value_counts())
    print()

high     432
med      432
low      432
vhigh    431
Name: buying, dtype: int64

high     432
med      432
low      432
vhigh    431
Name: maintenance, dtype: int64

5    432
4    432
3    432
2    431
Name: doors, dtype: int64

more    576
4       576
2       575
Name: persons, dtype: int64

med      576
big      576
small    575
Name: lug_boot, dtype: int64

high    576
med     576
low     575
Name: safety, dtype: int64

unacc    1209
acc       384
good       69
vgood      65
Name: class, dtype: int64

