In [1]:
# This exercise focuses on replacing categorical data in a dataset with numerical values.
# The dataset used is student.csv, which contains information about students with both
# categorical and numerical attributes.
# The goal is to load the dataset using pandas and replace all the categorical fields
# (e.g., gender, course) with corresponding numeric codes.
# This step is essential for preparing the data for further processing such as machine learning.

import pandas as pd
import numpy as np
dataset = 'https://raw.githubusercontent.com/TrainingByPackt/Data-Science-with-Python/refs/heads/master/Chapter01/Data/student.csv'
df = pd.read_csv(dataset, header = 0)

In [2]:
df

Unnamed: 0,Student_id,Age,Gender,Grade,Employed
0,1,19,Male,1st Class,yes
1,2,20,Female,2nd Class,no
2,3,18,Male,1st Class,no
3,4,21,Female,2nd Class,no
4,5,19,Male,1st Class,no
5,6,20,Male,2nd Class,yes
6,7,19,Female,3rd Class,yes
7,8,21,Male,3rd Class,yes
8,9,22,Female,3rd Class,yes
9,10,21,Male,1st Class,no


In [5]:
df_categorical = df.select_dtypes(exclude=[np.number])
df_categorical

Unnamed: 0,Gender,Grade,Employed
0,Male,1st Class,yes
1,Female,2nd Class,no
2,Male,1st Class,no
3,Female,2nd Class,no
4,Male,1st Class,no
5,Male,2nd Class,yes
6,Female,3rd Class,yes
7,Male,3rd Class,yes
8,Female,3rd Class,yes
9,Male,1st Class,no


In [7]:
df_categorical['Grade'].unique()

array(['1st Class', '2nd Class', '3rd Class'], dtype=object)

In [8]:
df_categorical.Grade.value_counts()

3rd Class    80
2nd Class    80
1st Class    72
Name: Grade, dtype: int64

In [9]:
df_categorical.Grade.replace({
    "1st Class": 1,
    "2nd Class": 2,
    "3rd Class": 3
}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [10]:
print(df_categorical.Grade.unique())
print(df_categorical.Grade.dtype)

[1 2 3]
int64


In [11]:
df_categorical

Unnamed: 0,Gender,Grade,Employed
0,Male,1,yes
1,Female,2,no
2,Male,1,no
3,Female,2,no
4,Male,1,no
5,Male,2,yes
6,Female,3,yes
7,Male,3,yes
8,Female,3,yes
9,Male,1,no


In [12]:
df_categorical.Gender.replace({"Male":0,"Female":1}, inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [13]:
df_categorical

Unnamed: 0,Gender,Grade,Employed
0,0,1,yes
1,1,2,no
2,0,1,no
3,1,2,no
4,0,1,no
5,0,2,yes
6,1,3,yes
7,0,3,yes
8,1,3,yes
9,0,1,no


In [15]:
print(df_categorical.Employed.unique())
print(df_categorical.Employed.dtype)

['yes' 'no']
object


In [16]:
df_categorical.Employed.replace({"yes":1,"no":0}, inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [17]:
print(df_categorical.Employed.unique())
print(df_categorical.Employed.dtype)

[1 0]
int64


In [19]:
df_categorical.head()

Unnamed: 0,Gender,Grade,Employed
0,0,1,1
1,1,2,0
2,0,1,0
3,1,2,0
4,0,1,0
