In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv('studentdata.csv')

In [3]:
df.shape

(480, 10)

In [4]:
df.head()

Unnamed: 0,gender,StageID,Topic,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentschoolSatisfaction,Class
0,M,lowerlevel,IT,Father,15,16,2,20,Good,M
1,M,lowerlevel,IT,Father,20,20,3,25,Good,M
2,M,lowerlevel,IT,Father,10,7,0,30,Bad,L
3,M,lowerlevel,IT,Father,30,25,5,35,Bad,L
4,M,lowerlevel,IT,Father,40,50,12,50,Bad,M


In [5]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['Class']),
                                                 df['Class'],
                                                 test_size=0.2,
                                                random_state=91)

In [6]:
X_train

Unnamed: 0,gender,StageID,Topic,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentschoolSatisfaction
183,F,MiddleSchool,Arabic,Father,25,15,32,53,Bad
346,F,lowerlevel,French,Mum,24,97,15,14,Good
228,M,HighSchool,Math,Mum,73,84,77,81,Good
470,M,MiddleSchool,History,Father,81,86,86,41,Good
23,M,MiddleSchool,IT,Father,2,0,2,50,Bad
...,...,...,...,...,...,...,...,...,...
349,M,lowerlevel,French,Mum,15,4,12,7,Good
210,M,MiddleSchool,Spanish,Mum,27,90,82,14,Good
362,M,lowerlevel,Arabic,Father,90,98,41,38,Good
174,F,lowerlevel,French,Father,50,62,73,43,Bad


In [7]:
y_train

183    M
346    H
228    H
470    M
23     L
      ..
349    L
210    H
362    H
174    M
178    H
Name: Class, Length: 384, dtype: object

In [8]:
y_train=pd.DataFrame(y_train, columns=['Class'])

In [9]:
y_train

Unnamed: 0,Class
183,M
346,H
228,H
470,M
23,L
...,...
349,L
210,H
362,H
174,M


In [10]:
# imputation transformer
t1 = ColumnTransformer([
    ('impute_relation',SimpleImputer(strategy='most_frequent'),[3])
],remainder='passthrough')

In [11]:
# one hot encoding
t2 = ColumnTransformer([
    ('ohe',OneHotEncoder(sparse=False,handle_unknown='ignore'),[0,2,3])
],remainder='passthrough')

In [12]:
t3=ColumnTransformer([
    ('Ordinal', OrdinalEncoder(),[1,8])])

In [13]:
t4 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,9))
])

In [14]:
# t5 = ColumnTransformer([
#     ('label', LabelEncoder(),[0])
# ])

In [15]:
# # Feature selection
# t6 = SelectKBest(score_func=chi2,k=8)

In [16]:
t7 = LogisticRegression()

In [17]:
pipe = Pipeline([
    ('t1',t1),
    ('t2',t2),
    ('t3',t3),
    ('t4',t4),
#     ('t5',t5),
#     ('t6',t6),
    ('t7',t7),
])

In [18]:
# train
pipe.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [19]:
# Predict
y_pred = pipe.predict(X_test)

In [20]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.53125

# Cross Validation using Pipeline

In [21]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.484483937115516