# New student records with model predictions for grant recommendation
# Logistic Regression

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("C:/Users/ITQLAP/OneDrive/ML1 PROJECT/New folder/student_records.csv")
df

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore,Recommend
0,Henry,A,Y,90,85,Yes
1,John,C,N,85,51,Yes
2,David,F,N,10,17,No
3,Holmes,B,Y,75,71,No
4,Marvin,E,N,20,30,No
5,Simon,A,Y,92,79,Yes
6,Robert,B,Y,60,59,No
7,Trent,C,Y,75,33,No


In [3]:
feature_names = ['OverallGrade', 'Obedient', 'ResearchScore','ProjectScore']
training_features = df[feature_names]
outcome_name = ['Recommend']
outcome_labels = df[outcome_name]

In [4]:
training_features

Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,A,Y,90,85
1,C,N,85,51
2,F,N,10,17
3,B,Y,75,71
4,E,N,20,30
5,A,Y,92,79
6,B,Y,60,59
7,C,Y,75,33


In [5]:
outcome_labels

Unnamed: 0,Recommend
0,Yes
1,Yes
2,No
3,No
4,No
5,Yes
6,No
7,No


In [6]:
numeric_feature_names = ['ResearchScore', 'ProjectScore']
categoricial_feature_names = ['OverallGrade', 'Obedient']

In [7]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

In [8]:
ss.fit(training_features[numeric_feature_names])

StandardScaler()

In [9]:
training_features[numeric_feature_names] =ss.transform(training_features[numeric_feature_names])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_features[numeric_feature_names] =ss.transform(training_features[numeric_feature_names])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


In [10]:
training_features

Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,A,Y,0.899583,1.37665
1,C,N,0.730648,-0.091777
2,F,N,-1.80339,-1.560203
3,B,Y,0.392776,0.772004
4,E,N,-1.465519,-0.998746
5,A,Y,0.967158,1.117516
6,B,Y,-0.114032,0.253735
7,C,Y,0.392776,-0.869179


In [11]:
training_features = pd.get_dummies(training_features,columns=categoricial_feature_names )
training_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_B,OverallGrade_C,OverallGrade_E,OverallGrade_F,Obedient_N,Obedient_Y
0,0.899583,1.37665,1,0,0,0,0,0,1
1,0.730648,-0.091777,0,0,1,0,0,1,0
2,-1.80339,-1.560203,0,0,0,0,1,1,0
3,0.392776,0.772004,0,1,0,0,0,0,1
4,-1.465519,-0.998746,0,0,0,1,0,1,0
5,0.967158,1.117516,1,0,0,0,0,0,1
6,-0.114032,0.253735,0,1,0,0,0,0,1
7,0.392776,-0.869179,0,0,1,0,0,0,1


In [12]:
# get list of new numerical features
categorical_engineered_features = list(set(training_features.columns) -set(numeric_feature_names))
categorical_engineered_features

['OverallGrade_B',
 'OverallGrade_A',
 'OverallGrade_F',
 'Obedient_N',
 'Obedient_Y',
 'OverallGrade_C',
 'OverallGrade_E']

# Modeling

In [16]:
from sklearn.linear_model import LogisticRegression
logr = LogisticRegression()
model = logr.fit(training_features,np.array(outcome_labels['Recommend']))
model

LogisticRegression()

# Model Evaluation

In [17]:
# simple evaluation on training data
pred_labels = model.predict(training_features)
actual_labels = np.array(outcome_labels['Recommend'])

In [19]:
# Evaluate the model performance
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [20]:
print('Accuracy:', float(accuracy_score(actual_labels,pred_labels))*100, '%')
print('Classification Stats:')
print(classification_report(actual_labels, pred_labels))

Accuracy: 100.0 %
Classification Stats:
              precision    recall  f1-score   support

          No       1.00      1.00      1.00         5
         Yes       1.00      1.00      1.00         3

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8



# Model Deployment

In [22]:
! pip install joblib



In [27]:
# load model and scaler object
import joblib
import os

In [28]:
# save models to be deployed on your server
if not os.path.exists('Model'):
    os.mkdir('Model')
if not os.path.exists('Scaler'):
    os.mkdir('Scaler')
    
joblib.dump(model, r'Model/model.pickle')
joblib.dump(ss, r'Scaler/scaler.pickle')

['Scaler/scaler.pickle']

# Use of model deployment

In [29]:
new_data = pd.DataFrame([{'Name': 'Nathan', 'OverallGrade': 'F','Obedient': 'N', 'ResearchScore': 30, 'ProjectScore': 20},
{'Name': 'Thomas', 'OverallGrade': 'A','Obedient': 'Y', 'ResearchScore': 78, 'ProjectScore': 80}])

new_data = new_data[['Name', 'OverallGrade', 'Obedient',
'ResearchScore', 'ProjectScore']]
new_data

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore
0,Nathan,F,N,30,20
1,Thomas,A,Y,78,80


In [30]:
# data preparation
prediction_features = new_data[feature_names]

In [34]:
#scaling
prediction_features[numeric_feature_names] =ss.transform(prediction_features[numeric_feature_names])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_features[numeric_feature_names] =ss.transform(prediction_features[numeric_feature_names])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


In [35]:
# engineering categorical variables
prediction_features = pd.get_dummies(prediction_features,columns=categoricial_feature_names)

In [36]:
# view feature set
prediction_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_F,Obedient_N,Obedient_Y
0,-1.127647,-1.430636,0,1,1,0
1,0.494137,1.160705,1,0,0,1


We now have the relevant features for the new students! However you can see that some of the
categorical features are missing based on some grades like B, C, and E. This is because none of these
students obtained those grades but we still need those attributes because the model was trained on all
attributes including these. The following snippet helps us identify and add the missing categorical features.
We add the value for each of those features as 0 for each student since they did not obtain those grades.

In [37]:
# add missing categorical feature columns
current_categorical_engineered_features =set(prediction_features.columns) - set(numeric_feature_names)
missing_features = set(categorical_engineered_features) -current_categorical_engineered_features
for feature in missing_features:
 # add zeros since feature is absent in these data samples
    prediction_features[feature] = [0] * len(prediction_features)

# view final feature set
prediction_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_F,Obedient_N,Obedient_Y,OverallGrade_E,OverallGrade_B,OverallGrade_C
0,-1.127647,-1.430636,0,1,1,0,0,0,0
1,0.494137,1.160705,1,0,0,1,0,0,0


In [38]:
# predict the model
predictions = model.predict(prediction_features)

## display results
new_data['Recommend'] = predictions
new_data

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore,Recommend
0,Nathan,F,N,30,20,No
1,Thomas,A,Y,78,80,Yes


# The END