In [184]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [136]:
# Load the training set into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# where this exact location exists.
TRAINING_CSV = '/Users/mh/Desktop/201955_DS_Final/Student Performance - Training Set.csv'
data = pd.read_csv(TRAINING_CSV)

In [137]:
# Preview the first rows to sanity-check that the CSV parsed as expected.
data.head()

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,M,Iraq,Iraq,lowerlevel,G-04,A,Science,S,Father,62,64,72,84,Yes,Good,Above-7,M
1,F,lebanon,lebanon,MiddleSchool,G-08,A,Arabic,S,Mum,45,58,52,43,Yes,Good,Under-7,H
2,M,Jordan,Jordan,MiddleSchool,G-07,B,Science,S,Father,52,10,13,6,No,Bad,Above-7,L
3,M,Iraq,Iraq,MiddleSchool,G-07,B,Biology,F,Father,98,90,86,71,Yes,Good,Under-7,H
4,M,Lybia,Lybia,lowerlevel,G-02,B,French,S,Mum,15,4,12,7,No,Good,Above-7,L


In [138]:
# Preview the last rows as well, to confirm the file was read through to the end.
data.tail()

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
451,M,Jordan,Jordan,lowerlevel,G-02,B,French,S,Mum,35,92,29,33,No,Bad,Under-7,M
452,F,KW,KuwaIT,lowerlevel,G-02,C,IT,F,Mum,24,35,23,22,Yes,Good,Above-7,M
453,M,KW,KuwaIT,lowerlevel,G-02,B,IT,F,Father,4,8,10,60,Yes,Good,Above-7,L
454,M,KW,KuwaIT,lowerlevel,G-02,B,IT,F,Father,25,27,0,69,No,Bad,Above-7,L
455,M,Egypt,Egypt,lowerlevel,G-04,A,English,S,Mum,19,74,32,17,No,Good,Above-7,L


In [139]:
# Count missing values per column; all zeros means no imputation is required.
data.isnull().sum()

gender                      0
NationalITy                 0
PlaceofBirth                0
StageID                     0
GradeID                     0
SectionID                   0
Topic                       0
Semester                    0
Relation                    0
raisedhands                 0
VisITedResources            0
AnnouncementsView           0
Discussion                  0
ParentAnsweringSurvey       0
ParentschoolSatisfaction    0
StudentAbsenceDays          0
Class                       0
dtype: int64

In [140]:
# Same missing-value count via isna(). pandas documents isnull() as an alias of isna(),
# so this repeats the previous check and is expected to give identical numbers.
data.isna().sum()

gender                      0
NationalITy                 0
PlaceofBirth                0
StageID                     0
GradeID                     0
SectionID                   0
Topic                       0
Semester                    0
Relation                    0
raisedhands                 0
VisITedResources            0
AnnouncementsView           0
Discussion                  0
ParentAnsweringSurvey       0
ParentschoolSatisfaction    0
StudentAbsenceDays          0
Class                       0
dtype: int64

In [141]:
# Inspect dtypes and non-null counts; the 'object' columns are the ones that
# still have to be encoded as numbers before modelling.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 456 entries, 0 to 455
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   gender                    456 non-null    object
 1   NationalITy               456 non-null    object
 2   PlaceofBirth              456 non-null    object
 3   StageID                   456 non-null    object
 4   GradeID                   456 non-null    object
 5   SectionID                 456 non-null    object
 6   Topic                     456 non-null    object
 7   Semester                  456 non-null    object
 8   Relation                  456 non-null    object
 9   raisedhands               456 non-null    int64 
 10  VisITedResources          456 non-null    int64 
 11  AnnouncementsView         456 non-null    int64 
 12  Discussion                456 non-null    int64 
 13  ParentAnsweringSurvey     456 non-null    object
 14  ParentschoolSatisfaction  

In [142]:
# (rows, columns) of the raw training set.
data.shape

(456, 17)

In [143]:
# Summary statistics; by default describe() only covers the numeric columns,
# so the categorical (object) columns do not appear here.
data.describe()

Unnamed: 0,raisedhands,VisITedResources,AnnouncementsView,Discussion
count,456.0,456.0,456.0,456.0
mean,46.774123,54.311404,38.096491,43.385965
std,31.052233,33.213918,26.814698,27.755371
min,0.0,0.0,0.0,1.0
25%,15.0,19.0,14.0,19.75
50%,50.0,64.5,33.0,40.0
75%,75.0,84.0,58.0,70.0
max,100.0,99.0,98.0,99.0


In [144]:
# List the distinct 'gender' labels and their frequencies before encoding them.
data['gender'].value_counts()

gender
M    292
F    164
Name: count, dtype: int64

In [145]:
# Binary-encode 'gender': 'M' -> 1, every other value (here only 'F') -> 0.
# NOTE: the column is overwritten in place, so re-running this cell on the
# already-encoded values would turn every row into 0.
data['gender'] = (data['gender'] == 'M').astype('int64')

In [146]:
# Select the rows where the recorded nationality differs from the place of birth.
unequal_nationality_placeofbirth = data[data['NationalITy'] != data['PlaceofBirth']]

# Display which (NationalITy, PlaceofBirth) pairs differ and how often, so the
# decision about the column rests on visible evidence instead of an uninspected variable.
unequal_nationality_placeofbirth[['NationalITy', 'PlaceofBirth']].value_counts()


# The country names are not spelled consistently between 'NationalITy' and 'PlaceofBirth' (e.g. 'KW' vs 'KuwaIT'), so I'll drop the NationalITy column.

In [147]:
# Remove 'NationalITy'; 'PlaceofBirth' stays as the only country feature.
data = data.drop('NationalITy', axis=1)

In [148]:
# Confirm that 'NationalITy' is gone and that 'gender' now holds 0/1.
data.head()

Unnamed: 0,gender,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,1,Iraq,lowerlevel,G-04,A,Science,S,Father,62,64,72,84,Yes,Good,Above-7,M
1,0,lebanon,MiddleSchool,G-08,A,Arabic,S,Mum,45,58,52,43,Yes,Good,Under-7,H
2,1,Jordan,MiddleSchool,G-07,B,Science,S,Father,52,10,13,6,No,Bad,Above-7,L
3,1,Iraq,MiddleSchool,G-07,B,Biology,F,Father,98,90,86,71,Yes,Good,Under-7,H
4,1,Lybia,lowerlevel,G-02,B,French,S,Mum,15,4,12,7,No,Good,Above-7,L


In [149]:
# List the distinct birth places and their frequencies before encoding them.
data['PlaceofBirth'].value_counts()

PlaceofBirth
KuwaIT         172
Jordan         170
Iraq            20
lebanon         19
USA             15
SaudiArabia     13
Palestine       10
Tunis            9
Egypt            8
Lybia            6
Iran             6
Morocco          4
Syria            3
venzuela         1
Name: count, dtype: int64

In [150]:
# Label-encode 'PlaceofBirth': every distinct country name becomes an integer code,
# assigned in the sorted order of the unique names.
# NOTE(review): scikit-learn documents LabelEncoder as meant for target labels; the
# codes impose an arbitrary ordering on this nominal feature — confirm this is acceptable.
label_encoder = LabelEncoder().fit(data['PlaceofBirth'])
data['PlaceofBirth'] = label_encoder.transform(data['PlaceofBirth'])


In [151]:
# Re-check the dtypes: 'gender' and 'PlaceofBirth' should now be integer columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 456 entries, 0 to 455
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   gender                    456 non-null    int64 
 1   PlaceofBirth              456 non-null    int64 
 2   StageID                   456 non-null    object
 3   GradeID                   456 non-null    object
 4   SectionID                 456 non-null    object
 5   Topic                     456 non-null    object
 6   Semester                  456 non-null    object
 7   Relation                  456 non-null    object
 8   raisedhands               456 non-null    int64 
 9   VisITedResources          456 non-null    int64 
 10  AnnouncementsView         456 non-null    int64 
 11  Discussion                456 non-null    int64 
 12  ParentAnsweringSurvey     456 non-null    object
 13  ParentschoolSatisfaction  456 non-null    object
 14  StudentAbsenceDays        

In [152]:
# List the distinct school stages and their frequencies before encoding them.
data['StageID'].value_counts()

StageID
MiddleSchool    240
lowerlevel      185
HighSchool       31
Name: count, dtype: int64

In [153]:
# Ordinal-encode 'StageID' so that the integer codes follow the school progression:
# lowerlevel (1) < MiddleSchool (2) < HighSchool (3).
stage_mapping = {
    'lowerlevel': 1,
    'MiddleSchool': 2,
    'HighSchool': 3,
}

# Series.map is used instead of Series.replace: replacing strings with integers relied
# on replace() silently downcasting the object column to int64, which pandas 2.2
# deprecates (FutureWarning). map() yields NaN for any label missing from the mapping,
# so an unexpected value is reported here instead of slipping through unencoded.
stage_codes = data['StageID'].map(stage_mapping)
assert stage_codes.notna().all(), "StageID contains values missing from stage_mapping"
data['StageID'] = stage_codes.astype('int64')

In [154]:
# List the distinct grade labels and their frequencies before encoding them.
data['GradeID'].value_counts()

GradeID
G-02    137
G-08    110
G-07     99
G-04     44
G-06     32
G-11     13
G-12      9
G-09      5
G-10      4
G-05      3
Name: count, dtype: int64

In [155]:
# Label-encode 'GradeID'. LabelEncoder numbers the labels in sorted order; because the
# grade labels are zero-padded ('G-02' ... 'G-12'), the codes also follow grade order.
label_encoder = LabelEncoder().fit(data['GradeID'])
data['GradeID'] = label_encoder.transform(data['GradeID'])

In [156]:
# Check the freshly encoded 'StageID' and 'GradeID' columns.
data.head()

Unnamed: 0,gender,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,1,2,1,1,A,Science,S,Father,62,64,72,84,Yes,Good,Above-7,M
1,0,12,2,5,A,Arabic,S,Mum,45,58,52,43,Yes,Good,Under-7,H
2,1,3,2,4,B,Science,S,Father,52,10,13,6,No,Bad,Above-7,L
3,1,2,2,4,B,Biology,F,Father,98,90,86,71,Yes,Good,Under-7,H
4,1,5,1,0,B,French,S,Mum,15,4,12,7,No,Good,Above-7,L


In [157]:
# List the distinct classroom sections and their frequencies before encoding them.
data['SectionID'].value_counts()

SectionID
A    271
B    158
C     27
Name: count, dtype: int64

In [158]:
# Label-encode 'SectionID': the section letters are numbered in sorted order (A -> 0, B -> 1, C -> 2).
label_encoder = LabelEncoder().fit(data['SectionID'])
data['SectionID'] = label_encoder.transform(data['SectionID'])

In [159]:
# Check the freshly encoded 'SectionID' column.
data.head()

Unnamed: 0,gender,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,1,2,1,1,0,Science,S,Father,62,64,72,84,Yes,Good,Above-7,M
1,0,12,2,5,0,Arabic,S,Mum,45,58,52,43,Yes,Good,Under-7,H
2,1,3,2,4,1,Science,S,Father,52,10,13,6,No,Bad,Above-7,L
3,1,2,2,4,1,Biology,F,Father,98,90,86,71,Yes,Good,Under-7,H
4,1,5,1,0,1,French,S,Mum,15,4,12,7,No,Good,Above-7,L


In [160]:
# Label-encode 'Topic': every distinct course topic becomes an integer code,
# assigned in the sorted order of the topic names.
# NOTE(review): the resulting order is alphabetical and carries no meaning for this
# nominal feature — confirm this is acceptable for the models used below.
label_encoder = LabelEncoder().fit(data['Topic'])
data['Topic'] = label_encoder.transform(data['Topic'])

In [161]:
# Check the freshly encoded 'Topic' column.
data.head()

Unnamed: 0,gender,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,1,2,1,1,0,10,S,Father,62,64,72,84,Yes,Good,Above-7,M
1,0,12,2,5,0,0,S,Mum,45,58,52,43,Yes,Good,Under-7,H
2,1,3,2,4,1,10,S,Father,52,10,13,6,No,Bad,Above-7,L
3,1,2,2,4,1,1,F,Father,98,90,86,71,Yes,Good,Under-7,H
4,1,5,1,0,1,4,S,Mum,15,4,12,7,No,Good,Above-7,L


In [162]:
# List the distinct semester labels and their frequencies before encoding them.
data['Semester'].value_counts()

Semester
F    235
S    221
Name: count, dtype: int64

In [163]:
# Binary-encode 'Semester': 'S' -> 1, every other value (here only 'F') -> 0.
data['Semester'] = data['Semester'].eq('S').astype('int64')

In [164]:
# Binary-encode 'Relation': 'Father' -> 1, every other value -> 0.
data['Relation'] = data['Relation'].eq('Father').astype('int64')

In [165]:
# Binary-encode 'ParentAnsweringSurvey': 'Yes' -> 1, every other value -> 0.
data['ParentAnsweringSurvey'] = data['ParentAnsweringSurvey'].eq('Yes').astype('int64')

In [166]:
# Binary-encode 'ParentschoolSatisfaction': 'Good' -> 1, every other value -> 0.
data['ParentschoolSatisfaction'] = data['ParentschoolSatisfaction'].eq('Good').astype('int64')

In [167]:
# Check the four binary-encoded columns: Semester, Relation,
# ParentAnsweringSurvey and ParentschoolSatisfaction.
data.head()

Unnamed: 0,gender,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,1,2,1,1,0,10,1,1,62,64,72,84,1,1,Above-7,M
1,0,12,2,5,0,0,1,0,45,58,52,43,1,1,Under-7,H
2,1,3,2,4,1,10,1,1,52,10,13,6,0,0,Above-7,L
3,1,2,2,4,1,1,0,1,98,90,86,71,1,1,Under-7,H
4,1,5,1,0,1,4,1,0,15,4,12,7,0,1,Above-7,L


In [168]:
# List the distinct absence categories and their frequencies before encoding them.
data['StudentAbsenceDays'].value_counts()

StudentAbsenceDays
Under-7    273
Above-7    183
Name: count, dtype: int64

In [169]:
# Binary-encode 'StudentAbsenceDays': 'Above-7' -> 1, every other value (here only 'Under-7') -> 0.
data['StudentAbsenceDays'] = data['StudentAbsenceDays'].eq('Above-7').astype('int64')

In [172]:
# Encode the target 'Class' as integers. The codes follow the alphabetical order of
# the labels (H=1, L=2, M=3); confusion_matrix sorts its labels, so this is also the
# row/column order of every confusion matrix printed further down.
class_mapping = {
    'H': 1,
    'L': 2,
    'M': 3,
}

# Series.map is used instead of Series.replace: replacing strings with integers relied
# on replace() silently downcasting the object column to int64, which pandas 2.2
# deprecates (FutureWarning). An object-dtype target would not be accepted as class
# labels by the classifiers, so the integer dtype is made explicit. map() yields NaN
# for any label missing from the mapping, which the assertion reports immediately.
class_codes = data['Class'].map(class_mapping)
assert class_codes.notna().all(), "Class contains values missing from class_mapping"
data['Class'] = class_codes.astype('int64')

In [173]:
# Final preview: every column, including the target 'Class', should now be numeric.
data.head()

Unnamed: 0,gender,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,1,2,1,1,0,10,1,1,62,64,72,84,1,1,1,3
1,0,12,2,5,0,0,1,0,45,58,52,43,1,1,0,1
2,1,3,2,4,1,10,1,1,52,10,13,6,0,0,1,2
3,1,2,2,4,1,1,0,1,98,90,86,71,1,1,0,1
4,1,5,1,0,1,4,1,0,15,4,12,7,0,1,1,2


In [175]:
# Verify that no 'object' columns remain before handing the frame to scikit-learn.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 456 entries, 0 to 455
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   gender                    456 non-null    int64
 1   PlaceofBirth              456 non-null    int64
 2   StageID                   456 non-null    int64
 3   GradeID                   456 non-null    int64
 4   SectionID                 456 non-null    int64
 5   Topic                     456 non-null    int64
 6   Semester                  456 non-null    int64
 7   Relation                  456 non-null    int64
 8   raisedhands               456 non-null    int64
 9   VisITedResources          456 non-null    int64
 10  AnnouncementsView         456 non-null    int64
 11  Discussion                456 non-null    int64
 12  ParentAnsweringSurvey     456 non-null    int64
 13  ParentschoolSatisfaction  456 non-null    int64
 14  StudentAbsenceDays        456 non-null    

In [None]:
# Now our data is ready for model building, as we can see all the columns are in numerical format

In [177]:
# Target vector: the encoded performance class.
y = data['Class']
# Feature matrix: every remaining column.
X = data.drop('Class', axis=1)

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
# NOTE(review): the split is not stratified, so the class proportions may differ
# between the training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each split.
print("Training set - Features:", X_train.shape, "Target:", y_train.shape)
print("Testing set - Features:", X_test.shape, "Target:", y_test.shape)


Training set - Features: (364, 15) Target: (364,)
Testing set - Features: (92, 15) Target: (92,)


In [201]:
# Decision Tree: fit on the training split, then score on the held-out test split.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_predictions = dt_classifier.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_predictions)
print("Decision Tree Accuracy:", dt_accuracy)
print(" ")

# Confusion matrix on the data the model was fitted on (rows: true class, columns: predicted).
print(
    "Decision Tree - Training Confusion Matrix:",
    confusion_matrix(y_train, dt_classifier.predict(X_train)),
    sep="\n",
)

# Confusion matrix on the unseen test data.
print(
    "\nDecision Tree - Testing Confusion Matrix:",
    confusion_matrix(y_test, dt_predictions),
    sep="\n",
)





Decision Tree Accuracy: 0.6521739130434783
 
Decision Tree - Training Confusion Matrix:
[[109   0   0]
 [  0 103   0]
 [  0   0 152]]

Decision Tree - Testing Confusion Matrix:
[[14  0 12]
 [ 1 18  2]
 [12  5 28]]
Decision Tree - Training Confusion Matrix:
[[109   0   0]
 [  0 103   0]
 [  0   0 152]]

Decision Tree - Testing Confusion Matrix:
[[14  0 12]
 [ 1 18  2]
 [12  5 28]]


In [193]:
# Random Forest: fit on the training split, then score on the held-out test split.
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_predictions = rf_classifier.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)
print(" ")

# Confusion matrix on the data the model was fitted on (rows: true class, columns: predicted).
print(
    "Random Forest - Training Confusion Matrix:",
    confusion_matrix(y_train, rf_classifier.predict(X_train)),
    sep="\n",
)

# Confusion matrix on the unseen test data.
print(
    "\nRandom Forest - Testing Confusion Matrix:",
    confusion_matrix(y_test, rf_predictions),
    sep="\n",
)


Random Forest Accuracy: 0.7391304347826086
 
Random Forest - Training Confusion Matrix:
[[109   0   0]
 [  0 103   0]
 [  0   0 152]]

Random Forest - Testing Confusion Matrix:
[[14  0 12]
 [ 0 20  1]
 [ 7  4 34]]


In [194]:
# Bagging: fit on the training split, then score on the held-out test split.
bagging_classifier = BaggingClassifier(random_state=42).fit(X_train, y_train)
bagging_predictions = bagging_classifier.predict(X_test)
bagging_accuracy = accuracy_score(y_test, bagging_predictions)
print("Bagging Accuracy:", bagging_accuracy)
print(" ")

# Confusion matrix on the data the model was fitted on (rows: true class, columns: predicted).
print(
    "Bagging - Training Confusion Matrix:",
    confusion_matrix(y_train, bagging_classifier.predict(X_train)),
    sep="\n",
)

# Confusion matrix on the unseen test data.
print(
    "\nBagging - Testing Confusion Matrix:",
    confusion_matrix(y_test, bagging_predictions),
    sep="\n",
)


Bagging Accuracy: 0.6847826086956522
 
Bagging - Training Confusion Matrix:
[[108   0   1]
 [  0 103   0]
 [  2   1 149]]

Bagging - Testing Confusion Matrix:
[[14  0 12]
 [ 0 20  1]
 [11  5 29]]


In [195]:
# Boosting (AdaBoost): fit on the training split, then score on the held-out test split.
boosting_classifier = AdaBoostClassifier(random_state=42).fit(X_train, y_train)
boosting_predictions = boosting_classifier.predict(X_test)
boosting_accuracy = accuracy_score(y_test, boosting_predictions)
print("Boosting Accuracy:", boosting_accuracy)
print(" ")

# Confusion matrix on the data the model was fitted on (rows: true class, columns: predicted).
print(
    "Boosting - Training Confusion Matrix:",
    confusion_matrix(y_train, boosting_classifier.predict(X_train)),
    sep="\n",
)

# Confusion matrix on the unseen test data.
print(
    "\nBoosting - Testing Confusion Matrix:",
    confusion_matrix(y_test, boosting_predictions),
    sep="\n",
)


Boosting Accuracy: 0.6086956521739131
 
Boosting - Training Confusion Matrix:
[[ 28   0  81]
 [  0  91  12]
 [ 17   8 127]]

Boosting - Testing Confusion Matrix:
[[ 7  1 18]
 [ 0 16  5]
 [ 7  5 33]]




In [196]:
# SVM: fit on the training split, then score on the held-out test split.
# NOTE(review): the features are passed in unscaled (0-100 counts next to 0/1 flags);
# scikit-learn recommends scaling inputs for SVMs — consider a scaler before this step.
svm_classifier = SVC(random_state=42).fit(X_train, y_train)
svm_predictions = svm_classifier.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)
print("SVM Accuracy:", svm_accuracy)
print(" ")

# Confusion matrix on the data the model was fitted on (rows: true class, columns: predicted).
print(
    "SVM - Training Confusion Matrix:",
    confusion_matrix(y_train, svm_classifier.predict(X_train)),
    sep="\n",
)

# Confusion matrix on the unseen test data.
print(
    "\nSVM - Testing Confusion Matrix:",
    confusion_matrix(y_test, svm_predictions),
    sep="\n",
)


SVM Accuracy: 0.6086956521739131
 
SVM - Training Confusion Matrix:
[[76  2 31]
 [ 2 84 17]
 [48 27 77]]

SVM - Testing Confusion Matrix:
[[13  1 12]
 [ 0 20  1]
 [15  7 23]]


In [204]:
# Build the (training, testing) confusion-matrix pair for every fitted model.
# Training matrices re-predict on X_train; testing matrices reuse the stored test predictions.
dt_cm_train, dt_cm_test = (
    confusion_matrix(y_train, dt_classifier.predict(X_train)),
    confusion_matrix(y_test, dt_predictions),
)

rf_cm_train, rf_cm_test = (
    confusion_matrix(y_train, rf_classifier.predict(X_train)),
    confusion_matrix(y_test, rf_predictions),
)

bagging_cm_train, bagging_cm_test = (
    confusion_matrix(y_train, bagging_classifier.predict(X_train)),
    confusion_matrix(y_test, bagging_predictions),
)

boosting_cm_train, boosting_cm_test = (
    confusion_matrix(y_train, boosting_classifier.predict(X_train)),
    confusion_matrix(y_test, boosting_predictions),
)

svm_cm_train, svm_cm_test = (
    confusion_matrix(y_train, svm_classifier.predict(X_train)),
    confusion_matrix(y_test, svm_predictions),
)


# Function to print confusion matrix
def print_confusion_matrix(model_name, cm_train, cm_test):
    """Print the training and testing confusion matrices of one model.

    Parameters
    ----------
    model_name : label inserted into both header lines.
    cm_train : object printed under the "- Training:" header.
    cm_test : object printed under the "- Testing:" header, which is
        preceded by a blank line.
    """
    sections = (
        ("Confusion Matrix for", "- Training:", cm_train),
        ("\nConfusion Matrix for", "- Testing:", cm_test),
    )
    for header_start, header_end, matrix in sections:
        print(header_start, model_name, header_end)
        print(matrix)

# Report both confusion matrices for each algorithm, in the order the models were trained.
for label, cm_train, cm_test in [
    ("Decision Tree", dt_cm_train, dt_cm_test),
    ("Random Forest", rf_cm_train, rf_cm_test),
    ("Bagging", bagging_cm_train, bagging_cm_test),
    ("Boosting", boosting_cm_train, boosting_cm_test),
    ("SVM", svm_cm_train, svm_cm_test),
]:
    print_confusion_matrix(label, cm_train, cm_test)


Confusion Matrix for Decision Tree - Training:
[[109   0   0]
 [  0 103   0]
 [  0   0 152]]

Confusion Matrix for Decision Tree - Testing:
[[14  0 12]
 [ 1 18  2]
 [12  5 28]]
Confusion Matrix for Random Forest - Training:
[[109   0   0]
 [  0 103   0]
 [  0   0 152]]

Confusion Matrix for Random Forest - Testing:
[[14  0 12]
 [ 0 20  1]
 [ 7  4 34]]
Confusion Matrix for Bagging - Training:
[[108   0   1]
 [  0 103   0]
 [  2   1 149]]

Confusion Matrix for Bagging - Testing:
[[14  0 12]
 [ 0 20  1]
 [11  5 29]]
Confusion Matrix for Boosting - Training:
[[ 28   0  81]
 [  0  91  12]
 [ 17   8 127]]

Confusion Matrix for Boosting - Testing:
[[ 7  1 18]
 [ 0 16  5]
 [ 7  5 33]]
Confusion Matrix for SVM - Training:
[[76  2 31]
 [ 2 84 17]
 [48 27 77]]

Confusion Matrix for SVM - Testing:
[[13  1 12]
 [ 0 20  1]
 [15  7 23]]
