In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder #For encoding categorical variables

In [5]:
# Read the GPA dataset from the working directory and preview its first rows
gpa_csv = "GPA_data.CSV"
data = pd.read_csv(gpa_csv)
data.head()

Unnamed: 0,ID,Gender,Age,Extra_Curricular,Study_Hours,Annual_Income,Distance_From_Home,GPA
0,8867,Male,22,Societies,10,1318792,154,1.76
1,1316,Female,24,Societies,9,874657,181,3.88
2,9443,Male,22,Societies,15,1098331,94,1.73
3,1829,Female,25,No,9,1680585,8,1.29
4,1989,Male,26,No,7,1361560,122,3.19


# Creating dummy variables using pandas library

# Convert the categorical variables into dummy variables using get_dummies function

In [6]:
# One-hot encode the two categorical columns into 0/1 indicator columns.
# dtype=int is pinned explicitly: pandas >= 2.0 returns boolean dummies by default,
# which would no longer match the 0/1 values this notebook displays and relies on.
df = pd.get_dummies(data[["Gender", "Extra_Curricular"]], dtype=int)
df.head()

Unnamed: 0,Gender_Female,Gender_Male,Extra_Curricular_No,Extra_Curricular_Societies,Extra_Curricular_Sports
0,0,1,0,1,0
1,1,0,0,1,0
2,0,1,0,1,0
3,1,0,1,0,0
4,0,1,1,0,0


# Avoiding the dummy variable trap (removing one level from each categorical variable)

In [7]:
# Drop one indicator per categorical variable; the removed levels
# ("Female" and "No") become the implicit reference categories.
reference_levels = ["Gender_Female", "Extra_Curricular_No"]
df = df.drop(columns=reference_levels)
df.head()

Unnamed: 0,Gender_Male,Extra_Curricular_Societies,Extra_Curricular_Sports
0,1,1,0
1,0,1,0
2,1,1,0
3,0,0,0
4,1,0,0


# Now this dummy data frame can be merged with the original data frame

In [8]:
# Place the dummy columns to the right of the original columns (rows are matched on index)
frames_to_join = [data, df]
data_new = pd.concat(frames_to_join, axis="columns")
data_new.head()

Unnamed: 0,ID,Gender,Age,Extra_Curricular,Study_Hours,Annual_Income,Distance_From_Home,GPA,Gender_Male,Extra_Curricular_Societies,Extra_Curricular_Sports
0,8867,Male,22,Societies,10,1318792,154,1.76,1,1,0
1,1316,Female,24,Societies,9,874657,181,3.88,0,1,0
2,9443,Male,22,Societies,15,1098331,94,1.73,1,1,0
3,1829,Female,25,No,9,1680585,8,1.29,0,0,0
4,1989,Male,26,No,7,1361560,122,3.19,1,0,0


# Original categorical variables can be removed now

In [9]:
# The string-valued source columns are now redundant with their dummy columns
encoded_sources = ["Gender", "Extra_Curricular"]
data_new = data_new.drop(columns=encoded_sources)
# This data frame can be used for further processes
data_new.head()

Unnamed: 0,ID,Age,Study_Hours,Annual_Income,Distance_From_Home,GPA,Gender_Male,Extra_Curricular_Societies,Extra_Curricular_Sports
0,8867,22,10,1318792,154,1.76,1,1,0
1,1316,24,9,874657,181,3.88,0,1,0
2,9443,22,15,1098331,94,1.73,1,1,0
3,1829,25,9,1680585,8,1.29,0,0,0
4,1989,26,7,1361560,122,3.19,1,0,0


# Creating dummy variables with label & one hot encoding

In [10]:
# Preview the frame again before applying label encoding
data.head(n=5)

Unnamed: 0,ID,Gender,Age,Extra_Curricular,Study_Hours,Annual_Income,Distance_From_Home,GPA
0,8867,Male,22,Societies,10,1318792,154,1.76
1,1316,Female,24,Societies,9,874657,181,3.88
2,9443,Male,22,Societies,15,1098331,94,1.73
3,1829,Female,25,No,9,1680585,8,1.29
4,1989,Male,26,No,7,1361560,122,3.19


# First the label encoding should be performed

# Creating a label encoder

In [11]:
# Instantiate the label encoder that maps category values to integer codes
le = LabelEncoder()
le  # echo the estimator's repr as the cell output

LabelEncoder()

# Performing the label encoding for the categorical variables

In [13]:
# Label-encode each categorical column with its own encoder.
# A single shared encoder is refit on every column, so only the last column's
# mapping survives; keeping one fitted encoder per column in `label_encoders`
# lets each column be decoded later via label_encoders[col].inverse_transform(...).
label_encoders = {}
for col in ["Gender", "Extra_Curricular"]:
    # Fresh encoder per column; after the loop `le` is the encoder fitted on
    # "Extra_Curricular", the same final state the shared encoder used to end in.
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le
data.head()

Unnamed: 0,ID,Gender,Age,Extra_Curricular,Study_Hours,Annual_Income,Distance_From_Home,GPA
0,8867,1,22,1,10,1318792,154,1.76
1,1316,0,24,1,9,874657,181,3.88
2,9443,1,22,1,15,1098331,94,1.73
3,1829,0,25,0,9,1680585,8,1.29
4,1989,1,26,0,7,1361560,122,3.19


# Removing unwanted columns

In [14]:
# ID is unique for each observation, so it carries no information for modelling
data = data.drop(columns=["ID"])
data.head()

Unnamed: 0,Gender,Age,Extra_Curricular,Study_Hours,Annual_Income,Distance_From_Home,GPA
0,1,22,1,10,1318792,154,1.76
1,0,24,1,9,874657,181,3.88
2,1,22,1,15,1098331,94,1.73
3,0,25,0,9,1680585,8,1.29
4,1,26,0,7,1361560,122,3.19


# Creating a one hot encoder

In [15]:
# Instantiate the one-hot encoder with its default settings
ohe = OneHotEncoder()
ohe  # echo the estimator's repr as the cell output

OneHotEncoder()

# One-hot encoding for each categorical variable separately

In [None]:
# NOTE(review): disabled draft, never executed (this cell has no execution count).
# It one-hot encodes "Extra_Curricular" but stores the result under the *_gen (gender)
# names and keeps only column 1; the active cells below perform the Gender and
# Extra_Curricular encodings.
# ohot_encoded_gen=ohe.fit_transform(np.array(data["Extra_Curricular"]).reshape(len(np.array(data["Extra_Curricular"])), 1)).toarray()
# arr_gen=ohot_encoded_gen[:,1].reshape(-1,1) #Dummy trapping is also done

In [16]:
# One-hot encode Gender; the encoder expects a 2-D (n_samples, 1) input
gender_column = data["Gender"].to_numpy().reshape(-1, 1)
ohot_encoded_gen = ohe.fit_transform(gender_column).toarray()
# Keep only the second indicator column as an (n, 1) array; dropping the first
# level avoids the dummy variable trap
arr_gen = ohot_encoded_gen[:, [1]]

In [17]:
# One-hot encode Extra_Curricular; the encoder expects a 2-D (n_samples, 1) input
ec_column = data["Extra_Curricular"].to_numpy().reshape(-1, 1)
ohot_encoded_EC = ohe.fit_transform(ec_column).toarray()
# Discard the first indicator column (reference level) to avoid the dummy variable trap
arr_EC = ohot_encoded_EC[:, 1:]

# Creating a dataframe with encoded data

In [18]:
# Combine the retained indicator columns into one frame. The names are hard-coded:
# the categories were encoded in alphabetical order, so after dropping the first
# level of each variable the remaining columns are Male, then Societies and Sports.
df_ohot = pd.DataFrame(
    np.hstack((arr_gen, arr_EC)),
    columns=["Male", "Societies", "Sports"],
    index=data.index,  # reuse data's index so a later column-wise concat aligns row-for-row
)

In [19]:
# Preview the one-hot encoded indicator columns
df_ohot.head(n=5)

Unnamed: 0,Male,Societies,Sports
0,1.0,1.0,0.0
1,0.0,1.0,0.0
2,1.0,1.0,0.0
3,0.0,0.0,0.0
4,1.0,0.0,0.0


# Merging this data frame with the original data frame

In [20]:
# Place the one-hot columns to the right of the label-encoded frame (rows are matched on index)
frames_to_join = [data, df_ohot]
data_new = pd.concat(frames_to_join, axis="columns")
data_new.head()

Unnamed: 0,Gender,Age,Extra_Curricular,Study_Hours,Annual_Income,Distance_From_Home,GPA,Male,Societies,Sports
0,1,22,1,10,1318792,154,1.76,1.0,1.0,0.0
1,0,24,1,9,874657,181,3.88,0.0,1.0,0.0
2,1,22,1,15,1098331,94,1.73,1.0,1.0,0.0
3,0,25,0,9,1680585,8,1.29,0.0,0.0,0.0
4,1,26,0,7,1361560,122,3.19,1.0,0.0,0.0


# Removing the original variables

In [None]:
# The label-encoded source columns are now redundant with their one-hot columns
encoded_sources = ["Gender", "Extra_Curricular"]
data_new = data_new.drop(columns=encoded_sources)
data_new.head()

# Further processing can now be performed on this data frame