## kNN Classifier Model
##### Predict the grade using CAT 1, CAT 2, FAT marks, Attendance and current CGPA

In [2]:
# initial imports

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np



In [3]:
# reading the dataset

student_data = pd.read_csv("./student-data.csv")
student_data.head()

Unnamed: 0,Cat 1,Cat 2,Fat,Grade,Attendance,CGPA
0,34,37,78,B,90,8.7
1,38,42,89,S,100,9.1
2,40,42,90,S,40,7.3
3,41,37,86,A,87,8.5
4,45,45,90,S,100,9.4


In [4]:
# info about dataset
student_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Cat 1       42 non-null     int64  
 1   Cat 2       42 non-null     int64  
 2   Fat         42 non-null     int64  
 3   Grade       42 non-null     object 
 4   Attendance  42 non-null     int64  
 5   CGPA        42 non-null     float64
dtypes: float64(1), int64(4), object(1)
memory usage: 2.1+ KB


In [12]:
# extracting relevant input and output features
X = student_data[['Cat 1', 'Cat 2', 'Fat']]
y = student_data['Grade']

print("X shape: {}, y shape: {}".format(X.shape, y.shape))

X shape: (42, 3), y shape: (42,)


In [13]:
# train and test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)
X_train.shape

(33, 3)

In [14]:
# instantiating classfier object with 5 neighbours parameter and fitting it
knn = KNeighborsClassifier(n_neighbors=5)

knn.fit(X_train, y_train)

KNeighborsClassifier()

In [15]:
# getting output and comparing with test data 
y_pred = knn.predict(X_test)

y_actual = np.array(y_test)

print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_actual.reshape(len(y_actual), 1)),1))

[['F' 'F']
 ['A' 'B']
 ['S' 'S']
 ['A' 'A']
 ['C' 'C']
 ['S' 'S']
 ['B' 'B']
 ['C' 'C']
 ['A' 'B']]


In [16]:
# geting training accuracy
print(knn.score(X_train, y_train))

0.7878787878787878


In [17]:
# getting testing accuracy
print(knn.score(X_test, y_test))

0.7777777777777778


### kNN Regression
##### To get FAT marks based on C1 and C2 marks

In [109]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [110]:
student_data = pd.read_csv("./student-data.csv")
student_data.head()

Unnamed: 0,Cat 1,Cat 2,Fat,Grade,Attendance,CGPA
0,34,37,78,B,90,8.7
1,38,42,89,S,100,9.1
2,40,42,90,S,40,7.3
3,41,37,86,A,87,8.5
4,45,45,90,S,100,9.4


In [111]:
X = student_data[['Cat 1', 'Cat 2']]
y = student_data['Fat']
print(X.shape, y.shape)

(42, 2) (42,)


In [112]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
print(X_train.shape, y_train.shape)

(31, 2) (31,)


In [121]:
knn = KNeighborsRegressor(n_neighbors=5)

knn.fit(X_train, y_train)

KNeighborsRegressor()

In [122]:
y_pred = knn.predict(X_test)
y_actual = np.array(y_test)

print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_actual.reshape(len(y_actual), 1)),1))

[[68.6 63. ]
 [79.4 93. ]
 [87.6 92. ]
 [85.6 90. ]
 [32.6 12. ]
 [46.6 22. ]
 [78.8 80. ]
 [44.4 42. ]
 [77.4 87. ]
 [35.6 27. ]
 [79.2 78. ]]


In [123]:
knn.score(X_train, y_train)

0.807828434373338

In [124]:
knn.score(X_test, y_test)

0.8475833649829092

## Random Forest Regressor
##### Predict Grade using factors such as CAT 1, CAT 2, FAT marks, Attendance and current CGPA

In [50]:
# import the Random Forest Regressor module and create an instance of it
from sklearn.ensemble import RandomForestRegressor
RFR = RandomForestRegressor(max_depth=10, random_state=0)

In [51]:
# reading the dataset

student_data = pd.read_csv("./student-data.csv")
student_data.head()

Unnamed: 0,Cat 1,Cat 2,Fat,Grade,Attendance,CGPA
0,34,37,78,B,90,8.7
1,38,42,89,S,100,9.1
2,40,42,90,S,40,7.3
3,41,37,86,A,87,8.5
4,45,45,90,S,100,9.4


In [52]:
# since grade is a of object data type, and the RFR object expects it to be of float type, we use LabelEncoder to convert the objects to float values
# as per the number of classes
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

student_data['Grade'] = encoder.fit_transform(student_data['Grade'])
student_data.head()

Unnamed: 0,Cat 1,Cat 2,Fat,Grade,Attendance,CGPA
0,34,37,78,1,90,8.7
1,38,42,89,6,100,9.1
2,40,42,90,6,40,7.3
3,41,37,86,0,87,8.5
4,45,45,90,6,100,9.4


In [53]:
# extract the required output and input variables
X = student_data[['Cat 1', 'Cat 2', 'Fat', 'Attendance', 'CGPA']]
y = student_data['Grade']

In [54]:
# create the test and train split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)
print(X_train.shape)

(33, 5)


In [55]:
# train the model on the training data
RFR.fit(X_train, y_train)

RandomForestRegressor(max_depth=10, random_state=0)

In [56]:
# computing the predictions using testing data and comparing the outputs
y_pred = RFR.predict(X_test)
y_actual = np.array(y_test)

print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_actual.reshape(len(y_actual), 1)),1))

[[4.71 5.  ]
 [0.79 1.  ]
 [5.58 6.  ]
 [0.41 0.  ]
 [1.69 2.  ]
 [4.74 6.  ]
 [1.28 1.  ]
 [1.45 2.  ]
 [0.07 1.  ]]


In [57]:
# generating the training accuracy of the model
RFR.score(X_train, y_train)

0.9722382072368421

In [58]:
# generating the testing accuracy of the model
RFR.score(X_test, y_test)

0.9226772727272727

### Random Forest Regressor
##### To predict FAT marks based on CAT 1 and CAT 2 marks

In [63]:
from sklearn.ensemble import RandomForestRegressor
RFR = RandomForestRegressor(max_depth=10, random_state=0)

In [64]:
# reading the dataset
student_data = pd.read_csv("./student-data.csv")
student_data.head()

Unnamed: 0,Cat 1,Cat 2,Fat,Grade,Attendance,CGPA,DA1,DA2,DA3,Credits?
6,48,46,92,S,100,8.95,,,,
36,46,42,93,S,95,9.0,,,,
17,44,42,86,A,89,8.1,,,,
14,42,41,87,A,94,8.6,,,,
13,23,14,35,F,95,6.1,,,,


In [94]:
# extract the required output and input variables
X = student_data[['Cat 1', 'Cat 2']]
y = student_data['Fat']

In [95]:
# train and test data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
print(X_train.shape, y_train.shape)

(31, 2) (31,)


In [96]:
# fitting the model
RFR.fit(X_train, y_train)

RandomForestRegressor(max_depth=10, random_state=0)

In [97]:
# computing the predictions using testing data and comparing the outputs
y_pred = RFR.predict(X_test)
y_actual = np.array(y_test)

print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_actual.reshape(len(y_actual), 1)),1))

[[66.21 63.  ]
 [85.31 93.  ]
 [86.54 92.  ]
 [84.71 90.  ]
 [20.03 12.  ]
 [49.33 22.  ]
 [64.03 80.  ]
 [45.63 42.  ]
 [73.38 87.  ]
 [20.03 27.  ]
 [74.49 78.  ]]


In [100]:
# training accuracy
RFR.score(X_train, y_train)

0.9578636185735647

In [101]:
# testing accuracy
RFR.score(X_test, y_test)

0.848215363653627

### Predicting externals marks (FAT) based on Internals (CAT1, CAT2, DA1, DA2, DA3)

In [213]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
RFR = RandomForestRegressor(max_depth=5, random_state=0)

In [215]:
# reading the dataset
student_data = pd.read_csv("./internals.csv")
# student_data = student_data.sample(frac=1)

# taking input for class of 70 - for realistic emulation of class grades distribution
test_data = pd.read_csv("testInternals.csv")
student_data.head()

Unnamed: 0,CAT1,CAT2,DA1,DA2,DA3,FAT
0,6,7,9,8,10,30
1,8,8,8,8,10,38
2,9,7,7,10,10,34
3,5,6,8,9,9,54
4,9,10,7,9,10,49


In [216]:
# extract the required output and input variables
X_train= student_data[['CAT1', 'CAT2', "DA1", "DA2", "DA3"]]
y_train = student_data['FAT']
print(X_train.shape, y_train.shape)

(500, 5) (500,)


In [219]:
# fitting the model
RFR.fit(X_train, y_train)

RandomForestRegressor(max_depth=5, random_state=0)

In [220]:
# computing the predictions using testing data and comparing the outputs

X_test = test_data[['CAT1', 'CAT2', "DA1", "DA2", "DA3"]]
y_test = test_data['FAT']

y_pred = RFR.predict(X_test)
y_actual = np.array(y_test)

print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_actual.reshape(len(y_actual), 1)),1))

[[44.98217643 31.        ]
 [45.88523071 39.        ]
 [46.36913456 33.        ]
 [50.62247943 47.        ]
 [42.16097824 41.        ]
 [44.29195993 41.        ]
 [44.57485985 53.        ]
 [67.32936323 77.        ]
 [69.65246763 59.        ]
 [68.23572792 69.        ]
 [67.86004203 64.        ]
 [70.52336477 61.        ]
 [66.36095531 69.        ]
 [67.63445451 73.        ]
 [66.50092421 74.        ]
 [68.21732916 60.        ]
 [67.55593534 70.        ]
 [68.09083424 60.        ]
 [67.76342861 68.        ]
 [66.55057283 75.        ]
 [68.38643925 66.        ]
 [68.16465763 67.        ]
 [68.59877296 60.        ]
 [67.52245391 61.        ]
 [67.46133724 67.        ]
 [66.93517929 79.        ]
 [67.57971772 70.        ]
 [67.82424176 78.        ]
 [67.90550741 72.        ]
 [66.88015514 68.        ]
 [68.00910668 74.        ]
 [66.97056444 70.        ]
 [68.32058765 57.        ]
 [67.07293149 59.        ]
 [68.23825571 58.        ]
 [67.58964094 58.        ]
 [66.83849489 59.        ]
 

In [221]:
# training accuracy
RFR.score(X_train, y_train)

0.7637579355359606

In [222]:
# testing accuracy
RFR.score(X_test, y_test)

0.745253983619675

In [224]:
internal = np.empty(len(y_pred), dtype=object)
counter = 0

for i in X_test.index:
  internal[counter]=round((15/50)*X_test['CAT1'][i],2) + round((15/50)*X_test['CAT2'][i],2) + X_test['DA1'][i] + X_test['DA2'][i] + X_test['DA3'][i]
  counter += 1

print(internal)
totPred = internal + (0.4)*y_pred

[31.8 27.0 29.4 29.2 30.9 31.5 30.1 37.8 32.6 33.6 36.6 34.7 35.5 36.6
 37.2 36.2 34.8 42.2 38.4 44.7 42.6 43.2 43.2 40.8 39.7 42.1 39.5 42.7
 41.0 41.0 41.2 38.2 40.4 37.2 42.1 41.3 38.5 38.5 47.0 46.3 45.2 47.3
 44.6 48.5 52.4 47.6 49.8 47.3 46.1 45.7 49.4 46.3 47.7 47.2 47.3 49.0
 45.2 43.1 50.9 49.8 49.4 53.6 50.2 54.9 52.2 46.6 51.0 49.1 52.4 54.7]


In [225]:
print(totPred)

[49.79287057387057 45.35409228351453 47.94765382395383 49.44899177097412
 47.76439129785774 49.21678397244942 47.92994393973136 64.73174529299632
 60.460987050550614 60.89429116815576 63.744016810388906 62.90934590865521
 62.04438212493709 63.65378180416956 63.80036968311828 63.4869316622032
 61.822374135918224 69.43633369651839 65.50537144303824 71.32022913109049
 69.95457569998152 70.46586305315266 70.63950918581725 67.80898156339669
 66.68453489438663 68.87407171597039 66.53188708740929 69.8296967059761
 68.16220296224283 67.75206205730203 68.4036426725163 64.98822577737265
 67.72823505952667 64.02917259536969 69.39530228461257 68.33585637545212
 65.23539795464544 65.68303877301288 74.1892448637638 72.12749516150578
 71.84447019046392 72.83319327613171 71.6321845942546 74.4942640712259
 77.55896721902043 74.7299679268912 76.48812762228415 74.5578916775045
 72.88142808613864 72.50542629942728 76.32749060807619 72.99727828616501
 74.3045842877977 74.11358683665732 74.46123608392212 75

In [227]:
# computing grades based on y_pred
grade = np.empty(len(y_pred), dtype = object)
Mean=np.mean(totPred)
sd=np.std(totPred)
print("SD:", sd)
print("Mean:", Mean)

for i in range(0,len(totPred)):
  if internal[i] < 30 and y_pred[i] < 40:
    grade[i] = 'F'
  else:
    if totPred[i]>= Mean + 1.5*sd:
      grade[i]='S'
    elif totPred[i]>=Mean + 0.5*sd and totPred[i] < Mean + 1.5*sd:
      grade[i]='A'
    elif totPred[i] >= Mean - 0.5*sd and totPred[i] < Mean + 0.5*sd:
      grade[i]='B'
    elif totPred[i]>= Mean - 1.0*sd and totPred[i] < Mean - 0.5*sd:
      grade[i]='C'
    elif totPred[i] >= Mean - 1.5*sd and totPred[i] < Mean - 1.0*sd:
      grade[i]='D'
    elif totPred[i] >= Mean - 2.0*sd and totPred[i] < Mean - 1.5*sd:
      grade[i]='E'
    elif totPred[i] < Mean - 2.0*sd:
      grade[i]='F'

print(grade)
print(np.unique(grade, return_counts=True))

SD: 10.43605066233296
Mean: 70.0088250939157
['E' 'F' 'F' 'E' 'F' 'E' 'F' 'C' 'C' 'C' 'C' 'C' 'C' 'C' 'C' 'C' 'C' 'B'
 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'C' 'B' 'B'
 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'A' 'B' 'A' 'B' 'B' 'B' 'A' 'B' 'B' 'B'
 'B' 'A' 'B' 'B' 'A' 'S' 'A' 'S' 'A' 'S' 'S' 'A' 'S' 'A' 'S' 'S']
(array(['A', 'B', 'C', 'E', 'F', 'S'], dtype=object), array([ 9, 36, 11,  3,  4,  7], dtype=int64))
