In [100]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [231]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [102]:
df = pd.read_csv('Job_Placement_Data (1).csv')

In [103]:
df.sample(10)

Unnamed: 0,gender,ssc_percentage,ssc_board,hsc_percentage,hsc_board,hsc_subject,degree_percentage,undergrad_degree,work_experience,emp_test_percentage,specialisation,mba_percent,status
144,M,52.0,Others,50.0,Others,Arts,61.0,Comm&Mgmt,No,60.0,Mkt&Fin,58.52,Not Placed
213,F,74.0,Others,66.0,Others,Commerce,58.0,Comm&Mgmt,No,70.0,Mkt&HR,60.23,Placed
187,M,78.5,Central,65.5,Central,Science,67.0,Sci&Tech,Yes,95.0,Mkt&Fin,64.86,Placed
79,F,69.0,Central,62.0,Central,Science,66.0,Sci&Tech,No,75.0,Mkt&HR,67.99,Not Placed
195,M,66.0,Central,76.0,Central,Commerce,72.0,Comm&Mgmt,Yes,84.0,Mkt&HR,58.95,Placed
171,M,80.0,Others,80.0,Others,Commerce,72.0,Comm&Mgmt,Yes,63.79,Mkt&Fin,66.04,Placed
210,M,80.6,Others,82.0,Others,Commerce,77.6,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,Placed
89,F,84.0,Others,75.0,Others,Science,69.0,Sci&Tech,Yes,62.0,Mkt&HR,62.36,Placed
39,M,81.0,Others,68.0,Others,Science,64.0,Sci&Tech,No,93.0,Mkt&Fin,62.56,Placed
209,M,62.0,Central,72.0,Central,Commerce,65.0,Comm&Mgmt,No,67.0,Mkt&Fin,56.49,Placed


In [104]:
df.isnull().sum()

gender                 0
ssc_percentage         0
ssc_board              0
hsc_percentage         0
hsc_board              0
hsc_subject            0
degree_percentage      0
undergrad_degree       0
work_experience        0
emp_test_percentage    0
specialisation         0
mba_percent            0
status                 0
dtype: int64

In [105]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               215 non-null    object 
 1   ssc_percentage       215 non-null    float64
 2   ssc_board            215 non-null    object 
 3   hsc_percentage       215 non-null    float64
 4   hsc_board            215 non-null    object 
 5   hsc_subject          215 non-null    object 
 6   degree_percentage    215 non-null    float64
 7   undergrad_degree     215 non-null    object 
 8   work_experience      215 non-null    object 
 9   emp_test_percentage  215 non-null    float64
 10  specialisation       215 non-null    object 
 11  mba_percent          215 non-null    float64
 12  status               215 non-null    object 
dtypes: float64(5), object(8)
memory usage: 22.0+ KB


In [106]:
df.gender.value_counts()

gender
M    139
F     76
Name: count, dtype: int64

In [107]:
df.ssc_board.value_counts()

ssc_board
Central    116
Others      99
Name: count, dtype: int64

In [108]:
df.hsc_board.value_counts()

hsc_board
Others     131
Central     84
Name: count, dtype: int64

In [109]:
df.hsc_subject.value_counts()

hsc_subject
Commerce    113
Science      91
Arts         11
Name: count, dtype: int64

In [110]:
df.undergrad_degree.value_counts()

undergrad_degree
Comm&Mgmt    145
Sci&Tech      59
Others        11
Name: count, dtype: int64

In [111]:
df.work_experience.value_counts()

work_experience
No     141
Yes     74
Name: count, dtype: int64

In [112]:
df.specialisation.value_counts()

specialisation
Mkt&Fin    120
Mkt&HR      95
Name: count, dtype: int64

In [125]:
# train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(df.drop(columns=['status']),df['status'] , test_size=0.2 , random_state=42)

In [126]:
X_train.shape

(172, 12)

In [127]:
X_test.shape

(43, 12)

In [128]:
print(df.iloc[:, [2, 4, 8, 12]].nunique())


ssc_board          2
hsc_board          2
work_experience    2
status             2
dtype: int64


# making pipeline

In [159]:
# step 1 => encoding catogarical data using OneHotEncoder and OrdinalEncoder
tnf1 = ColumnTransformer([
    ('ohe_upg' , OneHotEncoder(sparse_output=False,handle_unknown='ignore',drop='first') ,[0,5,7,10]),
    ('oe_upg' , OrdinalEncoder(categories=[['Others','Central'],['Others','Central'],['No','Yes']]),[2,4,8]),   
],remainder='passthrough')

In [210]:
# step 2 => scaling data using MinMaxScaler for using data in SelectKBest
tnf2 = ColumnTransformer([
  ('scale' ,MinMaxScaler(),slice(10,15))   
],remainder='passthrough')

In [240]:
# step 3 => Selecting 6 best columns using SelectKBest
tnf3 = SelectKBest(score_func=chi2,k=6)

In [241]:
# step 4 => training dataset using DecisionTreeClassifier
tnf4 =  DecisionTreeClassifier()

In [242]:
pipe = Pipeline([
    ('tnf1',tnf1),
    ('tnf2',tnf2),
    ('tnf3',tnf3),
    ('tnf4',tnf4)
])

In [243]:
Y_train.head()

93     Not Placed
84         Placed
95         Placed
137        Placed
210        Placed
Name: status, dtype: object

# fiting dataset

In [266]:
pipe.fit(X_train ,Y_train)

# predicting dataset

In [250]:
Y_pred = pipe.predict(X_test)

In [247]:
Y_pred


array(['Placed', 'Placed', 'Placed', 'Not Placed', 'Placed', 'Not Placed',
       'Not Placed', 'Placed', 'Placed', 'Placed', 'Placed', 'Placed',
       'Not Placed', 'Not Placed', 'Placed', 'Not Placed', 'Placed',
       'Not Placed', 'Not Placed', 'Placed', 'Placed', 'Placed', 'Placed',
       'Not Placed', 'Placed', 'Placed', 'Placed', 'Placed', 'Placed',
       'Placed', 'Placed', 'Not Placed', 'Not Placed', 'Placed', 'Placed',
       'Placed', 'Not Placed', 'Placed', 'Placed', 'Placed', 'Placed',
       'Placed', 'Placed'], dtype=object)

# Checking Accuracy

In [265]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [253]:
accuracy_score(Y_test,Y_pred)

0.813953488372093

In [258]:
# Example: Suppose y_test and y_pred are given
cm = confusion_matrix(Y_test, Y_pred)
TN, FP, FN, TP = cm.ravel()  # Extracting values

In [259]:
# Accuracy
accuracy = (TP + TN) / (TP + TN + FP + FN)
print("Accuracy:", accuracy)

Accuracy: 0.813953488372093


In [260]:
# Precision (Positive Predictive Value)
precision = TP / (TP + FP)
print("Precision:", precision)

Precision: 0.8709677419354839


In [261]:
# Recall (Sensitivity or True Positive Rate)
recall = TP / (TP + FN)
print("Recall:", recall)

Recall: 0.8709677419354839


In [262]:
# F1-Score (Harmonic mean of Precision and Recall)
f1_score = 2 * (precision * recall) / (precision + recall)
print("F1-Score:", f1_score)

F1-Score: 0.8709677419354839
