Importing the dependencies

In [62]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report


Data collection and preprocessing

In [63]:
placement_data = pd.read_csv('Placement_Data_Full_Class.csv')

In [64]:
placement_data.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


In [65]:
placement_data.shape

(215, 15)

In [66]:
placement_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sl_no           215 non-null    int64  
 1   gender          215 non-null    object 
 2   ssc_p           215 non-null    float64
 3   ssc_b           215 non-null    object 
 4   hsc_p           215 non-null    float64
 5   hsc_b           215 non-null    object 
 6   hsc_s           215 non-null    object 
 7   degree_p        215 non-null    float64
 8   degree_t        215 non-null    object 
 9   workex          215 non-null    object 
 10  etest_p         215 non-null    float64
 11  specialisation  215 non-null    object 
 12  mba_p           215 non-null    float64
 13  status          215 non-null    object 
 14  salary          148 non-null    float64
dtypes: float64(6), int64(1), object(8)
memory usage: 25.3+ KB


In [67]:
placement_data.describe()

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary
count,215.0,215.0,215.0,215.0,215.0,215.0,148.0
mean,108.0,67.303395,66.333163,66.370186,72.100558,62.278186,288655.405405
std,62.209324,10.827205,10.897509,7.358743,13.275956,5.833385,93457.45242
min,1.0,40.89,37.0,50.0,50.0,51.21,200000.0
25%,54.5,60.6,60.9,61.0,60.0,57.945,240000.0
50%,108.0,67.0,65.0,66.0,71.0,62.0,265000.0
75%,161.5,75.7,73.0,72.0,83.5,66.255,300000.0
max,215.0,89.4,97.7,91.0,98.0,77.89,940000.0


In [68]:
placement_data['status'].value_counts()

status
Placed        148
Not Placed     67
Name: count, dtype: int64

Converting the status column from categorical values to numeric
- Placed -> 1
- Not Placed -> 0

In [69]:
placement_data['status'].replace({'Placed' : 1, 'Not Placed' : 0}, inplace= True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  placement_data['status'].replace({'Placed' : 1, 'Not Placed' : 0}, inplace= True)
  placement_data['status'].replace({'Placed' : 1, 'Not Placed' : 0}, inplace= True)


In [70]:
placement_data['status'].astype(int)

0      1
1      1
2      1
3      0
4      1
      ..
210    1
211    1
212    1
213    1
214    0
Name: status, Length: 215, dtype: int64

The data of column status is highly unbalanced

In [71]:
# Separating the data for analysis
placed = placement_data[placement_data.status == 1]
not_placed = placement_data[placement_data.status == 0]

In [72]:
placed

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80,1,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,1,200000.0
2,3,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80,1,250000.0
4,5,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50,1,425000.0
7,8,M,82.00,Central,64.00,Central,Science,66.00,Sci&Tech,Yes,67.0,Mkt&Fin,62.14,1,252000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,210,M,62.00,Central,72.00,Central,Commerce,65.00,Comm&Mgmt,No,67.0,Mkt&Fin,56.49,1,216000.0
210,211,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,1,400000.0
211,212,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62,1,275000.0
212,213,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,1,295000.0


In [73]:
not_placed

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
3,4,M,56.0,Central,52.0,Central,Science,52.00,Sci&Tech,No,66.00,Mkt&HR,59.43,0,
5,6,M,55.0,Others,49.8,Others,Science,67.25,Sci&Tech,Yes,55.00,Mkt&Fin,51.58,0,
6,7,F,46.0,Others,49.2,Others,Commerce,79.00,Comm&Mgmt,No,74.28,Mkt&Fin,53.29,0,
9,10,M,58.0,Central,70.0,Central,Commerce,61.00,Comm&Mgmt,No,54.00,Mkt&Fin,52.21,0,
12,13,F,47.0,Central,55.0,Others,Science,65.00,Comm&Mgmt,No,62.00,Mkt&HR,65.04,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,199,F,67.0,Central,70.0,Central,Commerce,65.00,Others,No,88.00,Mkt&HR,71.96,0,
201,202,M,54.2,Central,63.0,Others,Science,58.00,Comm&Mgmt,No,79.00,Mkt&HR,58.44,0,
206,207,M,41.0,Central,42.0,Central,Science,60.00,Comm&Mgmt,No,97.00,Mkt&Fin,53.39,0,
208,209,F,43.0,Central,60.0,Others,Science,65.00,Comm&Mgmt,No,92.66,Mkt&HR,62.92,0,


Undersampling

In [74]:
placed_sampling = placed.sample(n=67)

Concatinating

In [75]:
new_placement_data = pd.concat([placed_sampling, not_placed], axis= 0)

In [76]:
new_placement_data.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
162,163,M,74.2,Central,87.6,Others,Commerce,77.25,Comm&Mgmt,Yes,75.2,Mkt&Fin,66.06,1,285000.0
128,129,M,80.4,Central,73.4,Central,Science,77.72,Sci&Tech,Yes,81.2,Mkt&HR,76.26,1,400000.0
207,208,M,83.33,Central,78.0,Others,Commerce,61.0,Comm&Mgmt,Yes,88.56,Mkt&Fin,71.55,1,300000.0
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,1,270000.0
58,59,M,74.0,Central,62.0,Others,Science,68.0,Comm&Mgmt,No,74.0,Mkt&Fin,57.99,1,268000.0


In [77]:
new_placement_data['status'].value_counts()

status
1    67
0    67
Name: count, dtype: int64

In [78]:
# sl_no and salary is not that important, I am dropping it
new_placement_data.drop(columns= ['sl_no', 'salary'], axis= 1, inplace= True)

In [79]:
new_placement_data.head()

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status
162,M,74.2,Central,87.6,Others,Commerce,77.25,Comm&Mgmt,Yes,75.2,Mkt&Fin,66.06,1
128,M,80.4,Central,73.4,Central,Science,77.72,Sci&Tech,Yes,81.2,Mkt&HR,76.26,1
207,M,83.33,Central,78.0,Others,Commerce,61.0,Comm&Mgmt,Yes,88.56,Mkt&Fin,71.55,1
0,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,1
58,M,74.0,Central,62.0,Others,Science,68.0,Comm&Mgmt,No,74.0,Mkt&Fin,57.99,1


Categorical features to numeric value using Standard Scaler

In [80]:
encoder = LabelEncoder()

In [81]:
list_of_categorical_cols = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation']

In [82]:
for features in list_of_categorical_cols:
    print(new_placement_data[features].value_counts())

gender
M    84
F    50
Name: count, dtype: int64
ssc_b
Central    76
Others     58
Name: count, dtype: int64
hsc_b
Others     83
Central    51
Name: count, dtype: int64
hsc_s
Commerce    72
Science     54
Arts         8
Name: count, dtype: int64
degree_t
Comm&Mgmt    91
Sci&Tech     37
Others        6
Name: count, dtype: int64
workex
No     93
Yes    41
Name: count, dtype: int64
specialisation
Mkt&Fin    69
Mkt&HR     65
Name: count, dtype: int64


In [83]:
for features in list_of_categorical_cols:
    new_placement_data[features] = encoder.fit_transform(new_placement_data[features])

**gender**
- 0 - F
- 1 - M

**ssc_b**
- 0 - Central
- 1 - others

**hsc_b**
- 0 - Central
- 1 - others

**hsc_s**
- 0 - Arts
- 1 - Commerce
- 2 - Science

**degree_t**
- 0 - Comm&Mgmt
- 1 - Others
- 2 - Sci&Tech

**workex**
- 0 - No
- 1 - Yes

**specialisation**
- 0 - Mkt&Fin
- 1 - Mkt&HR

In [84]:
new_placement_data.head()

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status
162,1,74.2,0,87.6,1,1,77.25,0,1,75.2,0,66.06,1
128,1,80.4,0,73.4,0,2,77.72,2,1,81.2,1,76.26,1
207,1,83.33,0,78.0,1,1,61.0,0,1,88.56,0,71.55,1
0,1,67.0,1,91.0,1,1,58.0,2,0,55.0,1,58.8,1
58,1,74.0,0,62.0,1,2,68.0,0,0,74.0,0,57.99,1


In [85]:
new_placement_data.tail()

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status
198,0,67.0,0,70.0,0,1,65.0,1,0,88.0,1,71.96,0
201,1,54.2,0,63.0,1,2,58.0,0,0,79.0,1,58.44,0
206,1,41.0,0,42.0,0,2,60.0,0,0,97.0,0,53.39,0
208,0,43.0,0,60.0,1,2,65.0,0,0,92.66,1,62.92,0
214,1,62.0,0,58.0,1,2,53.0,0,0,89.0,1,60.22,0


# Splitting label and features


In [95]:
X = new_placement_data.drop(columns= 'status', axis= 1)
Y = new_placement_data['status']

## Train test split

In [96]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size= 0.2, random_state= 2)

In [97]:

models = {
    "Support Vector Classifier" : SVC(),
    "Logistic Regression" : LogisticRegression(),
    "Random Forest" : RandomForestClassifier(),
    "Decision Tree" : DecisionTreeClassifier(),
    "K Neighbors" : KNeighborsClassifier(),
    "Gaussian" : GaussianNB()
}

# Evaluate each model
model_accuracies = {}

for model_name, model_instance in models.items():
    model_instance.fit(X_train, Y_train)
    y_prediction = model_instance.predict(X_test)
    accuracy = accuracy_score(Y_test, y_prediction)
    model_accuracies[model_name] = accuracy

for model_name, accuracy in model_accuracies.items():
    print(f"Accuracy Score → {model_name} is {accuracy} ")

Accuracy Score → Support Vector Classifier is 0.8148148148148148 
Accuracy Score → Logistic Regression is 0.8148148148148148 
Accuracy Score → Random Forest is 0.7777777777777778 
Accuracy Score → Decision Tree is 0.7777777777777778 
Accuracy Score → K Neighbors is 0.8148148148148148 
Accuracy Score → Gaussian is 0.8148148148148148 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [98]:
# Finding the best model
best_model_name = max(model_accuracies, key= model_accuracies.get)
best_model_accuracy = model_accuracies[best_model_name]

In [99]:
print(f"Best Model: {best_model_name} with Accuracy: {best_model_accuracy}")

Best Model: Support Vector Classifier with Accuracy: 0.8148148148148148


In [100]:
best_model = models[best_model_name]
Y_pred = best_model.predict(X_test)
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.80      0.73      0.76        11
           1       0.82      0.88      0.85        16

    accuracy                           0.81        27
   macro avg       0.81      0.80      0.81        27
weighted avg       0.81      0.81      0.81        27



In [104]:
input = (1,67.00,1,91.00,1,1,58.00,1,0,55,1,58.8)
inp_as_array = np.asarray(input)
reshaped_input = inp_as_array.reshape(1, -1)
prediction = best_model.predict(reshaped_input)
print(prediction[0])

1




In [105]:
Placement_Data = 'PlacementModel1.sav'
pickle.dump(best_model, open('PlacementModel1.sav', 'wb'))

In [106]:
loaded_model = pickle.load(open('PlacementModel1.sav', 'rb'))