In [1]:
import pandas as pd

In [2]:
df=pd.read_csv("student_records.csv")
df

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore,Recommend
0,Henry,A,Y,90,85,Yes
1,John,C,N,85,51,Yes
2,David,F,N,10,17,No
3,Holmes,B,Y,75,71,No
4,Marvin,E,N,20,30,No
5,Simon,A,Y,92,79,Yes
6,Robert,B,Y,60,59,No
7,Trent,C,Y,75,33,No


In [3]:
#get features and corresponding outcomes
#features:what we need to observe
#labels:what we want to predict

feature_names=["OverallGrade", "Obedient", "ResearchScore", "ProjectScore",] 
training_features=df[feature_names]
print(training_features)

  OverallGrade Obedient  ResearchScore  ProjectScore
0            A        Y             90            85
1            C        N             85            51
2            F        N             10            17
3            B        Y             75            71
4            E        N             20            30
5            A        Y             92            79
6            B        Y             60            59
7            C        Y             75            33


In [4]:
outcome_name=["Recommend"]
outcome_labels=df[outcome_name]
print(outcome_labels)

  Recommend
0       Yes
1       Yes
2        No
3        No
4        No
5       Yes
6        No
7        No


In [5]:
#list down features based on type

numeric_feature_names=["ResearchScore", "ProjectScore"]
categorical_feature_names=["OverallGrade", "Obedient"]

In [6]:
#to supress warnings

pd.options.mode.chained_assignment = None

In [7]:
#scale or normalise the two numeric attributes

from sklearn.preprocessing import StandardScaler
ss=StandardScaler()

In [8]:
#fit scaler or numeric features
#fit method learns the range of the data, find mean and std_dev

ss.fit(training_features[numeric_feature_names])

StandardScaler()

In [9]:
#scale numeric features
#transform method transforms the data to a much reduced range

training_features[numeric_feature_names]=ss.transform(training_features[numeric_feature_names])

In [10]:
print(training_features)

  OverallGrade Obedient  ResearchScore  ProjectScore
0            A        Y       0.899583      1.376650
1            C        N       0.730648     -0.091777
2            F        N      -1.803390     -1.560203
3            B        Y       0.392776      0.772004
4            E        N      -1.465519     -0.998746
5            A        Y       0.967158      1.117516
6            B        Y      -0.114032      0.253735
7            C        Y       0.392776     -0.869179


In [11]:
##Rule for scaling 
## 1. SS can be applied in almost all cases
## 2. If data has outliers then make sure u don't use MMS
## 3. If data has outliers then prefer Robust Scalar.

In [12]:
#Engineering Categorical Features
training_features=pd.get_dummies(training_features, columns=categorical_feature_names)

# view new engineering features, where the categorical features are coded as binary
print(training_features)

# We have converted our categoricial data into numeric. 
# or we can say we have done feature engineering over categorical data.

   ResearchScore  ProjectScore  OverallGrade_A  OverallGrade_B  \
0       0.899583      1.376650               1               0   
1       0.730648     -0.091777               0               0   
2      -1.803390     -1.560203               0               0   
3       0.392776      0.772004               0               1   
4      -1.465519     -0.998746               0               0   
5       0.967158      1.117516               1               0   
6      -0.114032      0.253735               0               1   
7       0.392776     -0.869179               0               0   

   OverallGrade_C  OverallGrade_E  OverallGrade_F  Obedient_N  Obedient_Y  
0               0               0               0           0           1  
1               1               0               0           1           0  
2               0               0               1           1           0  
3               0               0               0           0           1  
4               0        

In [13]:
categorical_engineered_features = list(set(training_features.columns) 
                                       - set(numeric_feature_names))

print(categorical_engineered_features)

['Obedient_N', 'OverallGrade_C', 'Obedient_Y', 'OverallGrade_A', 'OverallGrade_F', 'OverallGrade_B', 'OverallGrade_E']


In [15]:
from sklearn.linear_model import LogisticRegression
import numpy as np
import warnings; warnings.simplefilter("ignore")


In [16]:
lr=LogisticRegression()

In [18]:
model=lr.fit(training_features,outcome_labels["Recommend"])
model

LogisticRegression()

In [19]:
#sample data to check

new_data = pd.DataFrame([{'Name': 'Ninad', 'OverallGrade': 'F', 'Obedient': 'N', 'ResearchScore': 10, 'ProjectScore': 20},
                  {'Name': 'Alxis', 'OverallGrade': 'B', 'Obedient': 'Y', 'ResearchScore': 78, 'ProjectScore': 80}, 
                  {'Name': 'Faiz', 'OverallGrade': 'C', 'Obedient': 'N', 'ResearchScore': 69, 'ProjectScore': 70}, 
                  {'Name': 'Sejal', 'OverallGrade': 'A', 'Obedient': 'Y', 'ResearchScore': 98, 'ProjectScore': 88},
                  {'Name': 'Vijan', 'OverallGrade': 'E', 'Obedient': 'N', 'ResearchScore': 28, 'ProjectScore': 30}])

print(new_data)


    Name OverallGrade Obedient  ResearchScore  ProjectScore
0  Ninad            F        N             10            20
1  Alxis            B        Y             78            80
2   Faiz            C        N             69            70
3  Sejal            A        Y             98            88
4  Vijan            E        N             28            30


In [22]:
#data preparation
prediction_features=new_data[feature_names]

#scaling using standardScalar object
prediction_features[numeric_feature_names]=ss.transform(prediction_features[numeric_feature_names])

prediction_features = pd.get_dummies(prediction_features, 
                                     columns=categorical_feature_names)

print(prediction_features)
print(prediction_features.columns)

   ResearchScore  ProjectScore  OverallGrade_A  OverallGrade_B  \
0      -1.803390     -1.430636               0               0   
1       0.494137      1.160705               0               1   
2       0.190053      0.728815               0               0   
3       1.169881      1.506217               1               0   
4      -1.195221     -0.998746               0               0   

   OverallGrade_C  OverallGrade_E  OverallGrade_F  Obedient_N  Obedient_Y  
0               0               0               1           1           0  
1               0               0               0           0           1  
2               1               0               0           1           0  
3               0               0               0           0           1  
4               0               1               0           1           0  
Index(['ResearchScore', 'ProjectScore', 'OverallGrade_A', 'OverallGrade_B',
       'OverallGrade_C', 'OverallGrade_E', 'OverallGrade_F', 'Obedient_

In [23]:
predictions = model.predict(prediction_features)

new_data['Recommend'] = predictions
print(new_data)

    Name OverallGrade Obedient  ResearchScore  ProjectScore Recommend
0  Ninad            F        N             10            20        No
1  Alxis            B        Y             78            80        No
2   Faiz            C        N             69            70       Yes
3  Sejal            A        Y             98            88       Yes
4  Vijan            E        N             28            30        No
