## Hands-on 2a
#### This hands-on explores the encoding of categorical data and data imputation

In [1]:
# Initialization
%matplotlib inline
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Load libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split as split
from sklearn.neighbors import KNeighborsClassifier

In [3]:
# Read the income dataset from the CSV file into a DataFrame,
# then preview its first five rows
df = pd.read_csv(filepath_or_buffer="income_classification.csv")
df.head(n=5)

Unnamed: 0,Age,Education,Gender,City,Income
0,50,Bachelor,M,C,High
1,37,Bachelor,M,B,Low
2,26,Master,M,C,Medium
3,33,Bachelor,F,B,Low
4,39,Master,M,C,Medium


**To do:**
- Check the number of rows and columns using info()
- Print the number of duplicated rows

In [6]:
# info() writes its report straight to stdout and returns None, so wrapping
# it in print() would emit an extra "None" line after the report.
df.info()

# Number of rows that are exact repeats of an earlier row
print(df.duplicated().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Age        250 non-null    int64 
 1   Education  250 non-null    object
 2   Gender     250 non-null    object
 3   City       250 non-null    object
 4   Income     250 non-null    object
dtypes: int64(1), object(4)
memory usage: 9.9+ KB
None
38


**To do:**
- Remove duplicated rows if there are any
- Check the number of remaining rows using info()

In [7]:
# Keep the first occurrence of every repeated row and discard the rest,
# then confirm how many rows are left
df = df.drop_duplicates(keep="first")
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 212 entries, 0 to 211
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Age        212 non-null    int64 
 1   Education  212 non-null    object
 2   Gender     212 non-null    object
 3   City       212 non-null    object
 4   Income     212 non-null    object
dtypes: int64(1), object(4)
memory usage: 9.9+ KB


**To do:**
- Check the number of missing values for each column using isna().sum()

In [8]:
# Count NaN entries per column (only NaN/None are counted; placeholder
# values such as 0 are not detected by isna())
df.isna().sum()

Age          0
Education    0
Gender       0
City         0
Income       0
dtype: int64

**To do:**
- Calculate descriptive statistics

In [9]:
# Summary statistics; the displayed table covers only Age, the one numeric column.
# NOTE(review): a minimum Age of 0 looks like a placeholder for a missing
# value rather than a real age - the next step converts it to NaN.
df.describe()

Unnamed: 0,Age
count,212.0
mean,40.981132
std,12.04217
min,0.0
25%,35.0
50%,40.0
75%,50.0
max,65.0


**To do:**
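- Replace missing values with NaN (missing ages are recorded as 0 in the Age column, as the minimum of 0 in the statistics above shows)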
- Check the number of missing values for each column using isna().sum()
- Print the first 5 rows of the resulting dataframe (df)

In [10]:
# Age uses 0 as a placeholder for "unknown" (see the minimum of 0 reported
# by describe()). Restrict the replacement to Age so that a legitimate 0 in
# any other column can never be turned into NaN by accident.
df = df.assign(Age=df['Age'].replace(0, np.nan))

# The placeholders now show up as missing values in Age
print(df.isna().sum())
df.head()

Age          7
Education    0
Gender       0
City         0
Income       0
dtype: int64


Unnamed: 0,Age,Education,Gender,City,Income
0,50.0,Bachelor,M,C,High
1,37.0,Bachelor,M,B,Low
2,26.0,Master,M,C,Medium
3,33.0,Bachelor,F,B,Low
4,39.0,Master,M,C,Medium


**To do:**
- Define mapping for ordinal data
- Encode ordinal data using map(). 
- Print the first 5 rows of the resulting dataframe (df).

In [11]:
# List the distinct Education labels so the ordinal mapping can cover each of them
df['Education'].unique()

array(['Bachelor', 'Master', 'Diploma', 'PhD'], dtype=object)

In [12]:
# Ordinal codes for Education: a higher code means a higher qualification
mapping = {'Diploma':1, 'Bachelor':2, 'Master':3, 'PhD':4}
education_codes = df['Education'].map(mapping)

# map() turns every label that is absent from `mapping` into NaN without
# raising. Fail loudly instead, so that a misspelt or new label (or re-running
# this cell on already-encoded data) cannot silently wipe the column.
# Values that were already missing before encoding are not flagged.
unmapped = df.loc[education_codes.isna() & df['Education'].notna(), 'Education'].unique()
assert len(unmapped) == 0, f"Education values missing from mapping: {unmapped}"

df['Education'] = education_codes
df.head()

Unnamed: 0,Age,Education,Gender,City,Income
0,50.0,2,M,C,High
1,37.0,2,M,B,Low
2,26.0,3,M,C,Medium
3,33.0,2,F,B,Low
4,39.0,3,M,C,Medium


**To do:**
- Use pd.get_dummies() to encode nominal data using one-hot encoding and store the resulting dataframe in df1.
- Print the first 5 rows of df1 and verify that the resulting dataframe is correct.

In [13]:
# One-hot encode the nominal columns; every distinct label becomes its own
# indicator column, so the result also exposes any inconsistent labels
nominal_columns = ['Gender', 'City']
df1 = pd.get_dummies(data=df, columns=nominal_columns)
df1.head(n=5)

Unnamed: 0,Age,Education,Income,Gender_F,Gender_M,Gender_f,Gender_m,City_A,City_B,City_C
0,50.0,2,High,False,True,False,False,False,False,True
1,37.0,2,Low,False,True,False,False,False,True,False
2,26.0,3,Medium,False,True,False,False,False,False,True
3,33.0,2,Low,True,False,False,False,False,True,False
4,39.0,3,Medium,False,True,False,False,False,False,True


**To do:**
- Rectify the problem in the previous step.
- Use pd.get_dummies() to encode nominal data using one-hot encoding and store the resulting dataframe in df2.
- Print the first 5 rows of df2.

In [14]:
# Normalise the letter case so that 'm'/'f' collapse into 'M'/'F'
# before the column is one-hot encoded
gender_upper = df['Gender'].str.upper()
df['Gender'] = gender_upper

# Re-encode the nominal columns now that Gender has consistent labels
df2 = pd.get_dummies(data=df, columns=['Gender', 'City'])
df2.head(n=5)

Unnamed: 0,Age,Education,Income,Gender_F,Gender_M,City_A,City_B,City_C
0,50.0,2,High,False,True,False,False,True
1,37.0,2,Low,False,True,False,True,False
2,26.0,3,Medium,False,True,False,False,True
3,33.0,2,Low,True,False,False,True,False
4,39.0,3,Medium,False,True,False,False,True


**To do:**
- Impute missing data with median value of the column
- Check the number of missing values for each column using isna().sum()

In [15]:
# Fill the missing ages with the median of the observed ages
# NOTE(review): the median is taken over the whole dataset, before the
# train/test split further down, so test rows contribute to the imputed value.
age_median = df2['Age'].median()
df2['Age'] = df2['Age'].fillna(value=age_median)

# Every column should now report zero missing values
df2.isna().sum()

Age          0
Education    0
Income       0
Gender_F     0
Gender_M     0
City_A       0
City_B       0
City_C       0
dtype: int64

Use the processed dataset to train and evaluate a k-Nearest Neighbors model.

In [16]:
# Separate the target (y) from the features (X)
y = df2["Income"]
X = df2.drop(columns=["Income"])

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Fit a knn classifier with default hyper-parameters and report its test accuracy
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print(f"knn accuracy: {knn.score(X_test, y_test):.2%}")

knn accuracy: 88.68%
