In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [2]:
# Load the student dataset; the relative path is resolved against the working directory.
df = pd.read_csv('studentdata.csv')

In [3]:
# Dimensions of the raw frame as (rows, columns).
df.shape

(480, 10)

In [4]:
# Preview the first rows to see the column names and typical values.
df.head()

Unnamed: 0,gender,StageID,Topic,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentschoolSatisfaction,Class
0,M,lowerlevel,IT,Father,15,16,2,20,Good,M
1,M,lowerlevel,IT,Father,20,20,3,25,Good,M
2,M,lowerlevel,IT,Father,10,7,0,30,Bad,L
3,M,lowerlevel,IT,Father,30,25,5,35,Bad,L
4,M,lowerlevel,IT,Father,40,50,12,50,Bad,M


In [5]:
# Per-column dtypes and non-null counts; used to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480 entries, 0 to 479
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   gender                    480 non-null    object
 1   StageID                   480 non-null    object
 2   Topic                     480 non-null    object
 3   Relation                  476 non-null    object
 4   raisedhands               480 non-null    int64 
 5   VisITedResources          480 non-null    int64 
 6   AnnouncementsView         480 non-null    int64 
 7   Discussion                480 non-null    int64 
 8   ParentschoolSatisfaction  480 non-null    object
 9   Class                     480 non-null    object
dtypes: int64(4), object(6)
memory usage: 37.6+ KB


In [6]:
# 'Relation' has missing values, so fill them with the most frequent value.
# Apply one-hot encoding to gender, Topic and Relation.
# Apply ordinal encoding to StageID and ParentschoolSatisfaction.

In [7]:
from sklearn.model_selection import train_test_split

# The first nine columns are the predictors; the last column is the label.
features = df.iloc[:, :9]
target = df.iloc[:, -1]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=91
)

# Without Column Transformer

In [8]:
# Impute missing 'Relation' values with the most frequent value.
# Work on explicit copies first: the frames returned by train_test_split are
# derived from df, and assigning a column onto them directly can raise
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

simpleI = SimpleImputer(strategy='most_frequent')
# The imputer returns an (n, 1) array; ravel() flattens it to a single column.
X_train['Relation'] = simpleI.fit_transform(X_train[['Relation']]).ravel()

# Reuse the imputer fitted on the training split (transform only, no refit).
X_test['Relation'] = simpleI.transform(X_test[['Relation']]).ravel()

X_train.shape

(384, 9)

In [9]:
#OneHot encoding

In [10]:
# One-hot encode the nominal columns, dropping the first level of each to avoid
# redundant indicator columns.
# `sparse_output=False` replaces the `sparse=False` keyword, which was deprecated
# in scikit-learn 1.2 and removed in 1.4; it returns a dense NumPy array.
oe = OneHotEncoder(drop='first', sparse_output=False)
X_train_oe = oe.fit_transform(X_train[['gender','Topic','Relation']])

# Encode the test split with the categories learned from the training split.
X_test_oe = oe.transform(X_test[['gender','Topic','Relation']])

X_train_oe.shape

(384, 13)

In [11]:
#Ordinal Encoding

In [12]:
# Explicit level orders, lowest to highest; they are listed in the same order
# as the columns passed to the encoder below.
stage_levels = ['lowerlevel','MiddleSchool','HighSchool']
satisfaction_levels = ['Bad','Good']
ordinal_cols = ['StageID','ParentschoolSatisfaction']

ordinal = OrdinalEncoder(categories=[stage_levels, satisfaction_levels])

# Fit on the training split, then reuse the same mapping for the test split.
X_train_or = ordinal.fit_transform(X_train[ordinal_cols])
X_test_or = ordinal.transform(X_test[ordinal_cols])

In [13]:
# One output column per ordinal-encoded input column.
X_train_or.shape

(384, 2)

In [14]:
# Peek at the first few encoded rows instead of dumping the whole array.
X_train_or[:5]

array([[1., 0.],
       [0., 1.],
       [2., 1.],
       [1., 1.],
       [1., 0.]])

In [15]:
# Keep only the columns that were not encoded above, as a NumPy array.
# `columns=` already fixes the axis, so no separate `axis` argument is needed.
encoded_cols = ['StageID','ParentschoolSatisfaction','gender','Topic','Relation']
X_train_new = X_train.drop(columns=encoded_cols).values

In [16]:
# Number of training rows and of remaining (un-encoded) columns.
X_train_new.shape


(384, 4)

In [17]:
# Same column selection for the test split: drop every encoded column and keep
# the rest as a NumPy array. `columns=` already fixes the axis.
encoded_cols = ['StageID','ParentschoolSatisfaction','gender','Topic','Relation']
X_test_new = X_test.drop(columns=encoded_cols).values

In [18]:
# Stitch the pieces together column-wise, in the same order for both splits:
# one-hot block, ordinal block, then the untouched columns.
train_parts = (X_train_oe, X_train_or, X_train_new)
test_parts = (X_test_oe, X_test_or, X_test_new)

X_train_transformed = np.concatenate(train_parts, axis=1)
X_test_transformed = np.concatenate(test_parts, axis=1)

print(X_train_transformed.shape,X_test_transformed.shape)

(384, 19) (96, 19)


In [19]:
# The test matrix was already assembled alongside the training matrix, so rather
# than rebuilding it, verify that both splits ended up with the same feature columns.
assert X_test_transformed.shape[1] == X_train_transformed.shape[1]


# Column Transformer

In [20]:
from sklearn.compose import ColumnTransformer


In [21]:
trans=ColumnTransformer(transformers=[
    ('missingvalues', SimpleImputer(strategy='most_frequent'),[3]),
    ('Ordinal', OrdinalEncoder(categories=[['lowerlevel','MiddleSchool','HighSchool'],['Bad','Good']]),[1,8]),
    ('onehot', OneHotEncoder(sparse=False,drop='first'),[0,2,3]),
            
],remainder='passthrough')

In [22]:
X_train=trans.fit_transform(X_train)
X_test=trans.transform(X_test)

In [24]:
print(X_train.shape, X_test.shape)

(384, 20) (96, 20)
