In [1]:
import seaborn as sns
import matplotlib.pyplot as plt 
import pandas as pd

In [2]:
df = pd.read_csv('student_pass_fail_imbalanced_1000.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   S1      1000 non-null   int64 
 1   S2      1000 non-null   int64 
 2   S3      1000 non-null   int64 
 3   S4      1000 non-null   int64 
 4   S5      1000 non-null   int64 
 5   Result  1000 non-null   object
dtypes: int64(5), object(1)
memory usage: 47.0+ KB


In [4]:
df.isnull().sum()   

S1        0
S2        0
S3        0
S4        0
S5        0
Result    0
dtype: int64

In [5]:
df.head()

Unnamed: 0,S1,S2,S3,S4,S5,Result
0,51,56,62,95,7,F
1,92,16,85,63,25,F
2,14,85,1,97,70,F
3,71,89,87,88,29,F
4,60,43,71,9,6,F


In [6]:
df['Result'].replace({'P': 1, 'F': 0}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Result'].replace({'P': 1, 'F': 0}, inplace=True)
  df['Result'].replace({'P': 1, 'F': 0}, inplace=True)


In [7]:
df.head()

Unnamed: 0,S1,S2,S3,S4,S5,Result
0,51,56,62,95,7,0
1,92,16,85,63,25,0
2,14,85,1,97,70,0
3,71,89,87,88,29,0
4,60,43,71,9,6,0


In [8]:
from sklearn.model_selection import train_test_split
X = df.drop('Result', axis=1)
y = df['Result']


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
y_test

521    0
737    0
740    0
660    0
411    0
      ..
468    0
935    0
428    0
7      0
155    0
Name: Result, Length: 300, dtype: int64

In [11]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [12]:
from sklearn.metrics import classification_report, confusion_matrix
    
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       291
           1       1.00      0.44      0.62         9

    accuracy                           0.98       300
   macro avg       0.99      0.72      0.80       300
weighted avg       0.98      0.98      0.98       300

[[291   0]
 [  5   4]]


In [13]:
model1 = LogisticRegression(class_weight='balanced')
model1.fit(X_train, y_train)
y_pred1 = model1.predict(X_test)

In [14]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       291
           1       1.00      0.44      0.62         9

    accuracy                           0.98       300
   macro avg       0.99      0.72      0.80       300
weighted avg       0.98      0.98      0.98       300

[[291   0]
 [  5   4]]


In [15]:
import joblib
joblib.dump(model1, 'logistic_regression_model_balanced.pkl')

['logistic_regression_model_balanced.pkl']

In [16]:
model_loaded = joblib.load('logistic_regression_model_balanced.pkl')

In [20]:
def pass_fail(x):
    if x == 1:
        return 'Pass'
    else:
        return 'Fail'

In [34]:
input_data = [[50,64,77,49,60]]
prediction = model_loaded.predict(input_data)





In [35]:
result = pass_fail(prediction[0])
print("The student is predicted to:", result)

The student is predicted to: Fail


In [27]:
prediction[0]

1