**IMPORT THE NECESSARY LIBRARIES**

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px

import plotly.io as pio
pio.renderers.default = "iframe"

from sklearn.decomposition import PCA

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

import pickle

In [2]:
df = pd.read_csv('/content/collegePlace.csv')

**UNDERSTANDING THE DATA**

In [3]:
df.shape

(2966, 8)

In [4]:
df.head()

Unnamed: 0,Age,Gender,Stream,Internships,CGPA,Hostel,HistoryOfBacklogs,PlacedOrNot
0,22,Male,Electronics And Communication,1,8,1,1,1
1,21,Female,Computer Science,0,7,1,1,1
2,22,Female,Information Technology,1,6,0,0,1
3,21,Male,Information Technology,0,8,0,1,1
4,22,Male,Mechanical,0,8,1,0,1


In [5]:
df.tail()

Unnamed: 0,Age,Gender,Stream,Internships,CGPA,Hostel,HistoryOfBacklogs,PlacedOrNot
2961,23,Male,Information Technology,0,7,0,0,0
2962,23,Male,Mechanical,1,7,1,0,0
2963,22,Male,Information Technology,1,7,0,0,0
2964,22,Male,Computer Science,1,7,0,0,0
2965,23,Male,Civil,0,8,0,0,1


In [6]:
df.sample(9)

Unnamed: 0,Age,Gender,Stream,Internships,CGPA,Hostel,HistoryOfBacklogs,PlacedOrNot
1864,21,Male,Information Technology,1,7,1,0,0
579,22,Male,Computer Science,0,6,0,0,0
80,21,Male,Civil,0,9,0,0,1
2255,19,Male,Civil,1,6,0,0,1
17,21,Male,Civil,0,6,0,0,0
661,22,Male,Computer Science,0,7,0,1,0
637,21,Male,Computer Science,2,6,0,0,1
355,22,Male,Electrical,0,6,0,1,0
366,22,Male,Computer Science,1,6,0,0,0


In [7]:
df.dtypes

Unnamed: 0,0
Age,int64
Gender,object
Stream,object
Internships,int64
CGPA,int64
Hostel,int64
HistoryOfBacklogs,int64
PlacedOrNot,int64


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2966 entries, 0 to 2965
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Age                2966 non-null   int64 
 1   Gender             2966 non-null   object
 2   Stream             2966 non-null   object
 3   Internships        2966 non-null   int64 
 4   CGPA               2966 non-null   int64 
 5   Hostel             2966 non-null   int64 
 6   HistoryOfBacklogs  2966 non-null   int64 
 7   PlacedOrNot        2966 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 185.5+ KB


**MATHEMATICAL DESCRIPTION OF DATA**

In [9]:
df.describe()

Unnamed: 0,Age,Internships,CGPA,Hostel,HistoryOfBacklogs,PlacedOrNot
count,2966.0,2966.0,2966.0,2966.0,2966.0,2966.0
mean,21.48584,0.703641,7.073837,0.269049,0.192178,0.552596
std,1.324933,0.740197,0.967748,0.44354,0.394079,0.49731
min,19.0,0.0,5.0,0.0,0.0,0.0
25%,21.0,0.0,6.0,0.0,0.0,0.0
50%,21.0,1.0,7.0,0.0,0.0,1.0
75%,22.0,1.0,8.0,1.0,0.0,1.0
max,30.0,3.0,9.0,1.0,1.0,1.0


**PREPROCESSING, EDA AND FEATURE VISUALISATION**

**1) PREPROCESSING**

In [10]:
df.isnull().sum()


Unnamed: 0,0
Age,0
Gender,0
Stream,0
Internships,0
CGPA,0
Hostel,0
HistoryOfBacklogs,0
PlacedOrNot,0


**no null values**

In [11]:
# duplicate rows
print(df.duplicated().sum())

#drop duplicates
df.drop_duplicates(inplace=True)

1829


In [12]:
print(df.duplicated().sum())

0


**EDA**

**OUTPUT VISUALISED WRT TO TWO MAJOR FEATURES**

In [14]:
from IPython.display import HTML

fig = px.scatter(df, x="CGPA", y="Internships", color="PlacedOrNot",hover_data=['CGPA'])
HTML(fig.to_html(include_plotlyjs='cdn'))

**COUNT OF PLACED AND NOT**

In [15]:
fig = px.histogram(df, x='PlacedOrNot', color='PlacedOrNot', barmode='group')
HTML(fig.to_html(include_plotlyjs='cdn'))

In [16]:
# Pie Chart
fig = px.pie(df, values=df['PlacedOrNot'].value_counts().values, names=df['PlacedOrNot'].value_counts().index, title='Placed Vs Not Placed')
HTML(fig.to_html(include_plotlyjs='cdn'))

**MAX AND MIN AGE OF PERSON WHO IS PLACED**

In [17]:
print("Max Age of Placed Person: ",df[(df['Age'] == df['Age'].max()) & (df['PlacedOrNot']==1)]['Age'].values[0])
print("Min Age of Placed Person: ",df[(df['Age'] == df['Age'].min()) & (df['PlacedOrNot']==1)]['Age'].values[0])

Max Age of Placed Person:  30
Min Age of Placed Person:  19


**MAX AND MIN INTERNSHIPS DONE BY STUDENT WHO IS PLACED**

In [18]:
print("Max Internships Done by the Placed Student: ",df[(df['Internships'] == df['Internships'].max()) & (df['PlacedOrNot']==1)]['Internships'].values[0])
print("No of students who did max Internships and are placed: ",df[(df['Internships'] == df['Internships'].max()) & (df['PlacedOrNot']==1)]['Internships'].value_counts().values[0])

print("Min Internships Done by the Placed Person: ",df[(df['Internships'] == df['Internships'].min()) & (df['PlacedOrNot']==1)]['Internships'].values[0])
print("No of students who did min Internships and are placed: ",df[(df['Internships'] == df['Internships'].min()) & (df['PlacedOrNot']==1)]['Internships'].value_counts().values[0])

Max Internships Done by the Placed Student:  3
No of students who did max Internships and are placed:  35
Min Internships Done by the Placed Person:  0
No of students who did min Internships and are placed:  232


**MAX AND MIN CGPA OF STUDENT WHO IS PLACED**

In [19]:
print("Max CGPA of Placed Student: ",df[(df['CGPA'] == df['CGPA'].max()) & (df['PlacedOrNot']==1)]['CGPA'].values[0])
print("No of students has max CGPA and are placed: ",df[(df['CGPA'] == df['CGPA'].max()) & (df['PlacedOrNot']==1)]['CGPA'].value_counts().values[0])

print("Min CGPA of Placed Person: ",df[(df['CGPA'] == df['CGPA'].min()) & (df['PlacedOrNot']==1)]['CGPA'].values[0])
print("No of students has min CGPA and are placed: ",df[(df['CGPA'] == df['CGPA'].min()) & (df['PlacedOrNot']==1)]['CGPA'].value_counts().values[0])

Max CGPA of Placed Student:  9
No of students has max CGPA and are placed:  99
Min CGPA of Placed Person:  5
No of students has min CGPA and are placed:  5


**STATISTICAL REPRESENTATION FOR MATHEMATICAL UNDERSTANDING**

In [20]:
fig = px.box(df, y='CGPA')
HTML(fig.to_html(include_plotlyjs='cdn'))

In [21]:
fig = px.box(df, y='Age')
HTML(fig.to_html(include_plotlyjs='cdn'))

In [22]:
fig = px.box(df, y=['Internships','CGPA', 'Age'])
HTML(fig.to_html(include_plotlyjs='cdn'))

**CATEGORICAL TO NUMERICAL DATA**

In [23]:
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})

In [24]:
df['Stream'].unique()

array(['Electronics And Communication', 'Computer Science',
       'Information Technology', 'Mechanical', 'Electrical', 'Civil'],
      dtype=object)

In [25]:
# convert Stream column to numeric
df['Stream'] = df['Stream'].map({'Electronics And Communication': 1,
                                 'Computer Science': 2,
                                'Information Technology': 3,
                                'Mechanical':4,
                                'Electrical':5,
                                'Civil':6})

In [26]:
df.sample(5)

Unnamed: 0,Age,Gender,Stream,Internships,CGPA,Hostel,HistoryOfBacklogs,PlacedOrNot
1870,20,0,5,1,7,0,1,1
1027,22,0,3,1,7,0,1,1
2849,23,1,1,1,8,1,1,1
2869,23,1,6,2,8,0,0,1
901,21,0,3,1,6,1,0,0


**INPUT AND OUTPUT COLUMNS**

In [27]:
X = df.iloc[:,0:7]
y = df.iloc[:,-1]
X

Unnamed: 0,Age,Gender,Stream,Internships,CGPA,Hostel,HistoryOfBacklogs
0,22,1,1,1,8,1,1
1,21,0,2,0,7,1,1
2,22,0,3,1,6,0,0
3,21,1,3,0,8,0,1
4,22,1,4,0,8,1,0
...,...,...,...,...,...,...,...
2946,23,1,3,1,7,1,1
2952,23,1,4,0,8,1,0
2954,23,0,2,1,8,0,1
2958,23,1,2,0,6,0,1


In [28]:
print(X.shape)
print(y.shape)

(1137, 7)
(1137,)


**TRAIN TEST SPLIT**

In [29]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33)

In [30]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(761, 7)
(376, 7)
(761,)
(376,)


**SCALING**

In [31]:
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)

**LOGISTIC REGRESSION**

In [32]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

# Without Scaling
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

# With Scaling
clf.fit(X_train_scale,y_train)
y_pred = clf.predict(X_test_scale)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())

Without Scaling and CV:  0.7420212765957447
Without Scaling and With CV:  0.7123034859876964
With Scaling and Without CV:  0.7420212765957447
With Scaling and With CV:  0.7123034859876964


**SGD Classifier**

In [33]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(max_iter=1000, tol=1e-3)

# Without Scaling
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

# With Scaling
clf.fit(X_train_scale,y_train)
y_pred = clf.predict(X_test_scale)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())

Without Scaling and CV:  0.5292553191489362
Without Scaling and With CV:  0.6479494190020504
With Scaling and Without CV:  0.7367021276595744
With Scaling and With CV:  0.7121838687628161


In [34]:
from sklearn.linear_model import Perceptron
# this is same as SGDClassifier(loss="perceptron", eta0=1, learning_rate="constant", penalty=None)

clf = Perceptron(tol=1e-3, random_state=0)
# Without Scaling
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

# With Scaling
clf.fit(X_train_scale,y_train)
y_pred = clf.predict(X_test_scale)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())

Without Scaling and CV:  0.5957446808510638
Without Scaling and With CV:  0.6229323308270677
With Scaling and Without CV:  0.5851063829787234
With Scaling and With CV:  0.6478127136021874


In [35]:
from sklearn.linear_model import LogisticRegressionCV
clf = LogisticRegressionCV(cv=5, random_state=0)

# Without Scaling
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

# With Scaling
clf.fit(X_train_scale,y_train)
y_pred = clf.predict(X_test_scale)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())


lbfgs failed to converge (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



Without Scaling and CV:  0.7446808510638298



lbfgs failed to converge (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



Without Scaling and With CV:  0.7057245386192755
With Scaling and Without CV:  0.7393617021276596
With Scaling and With CV:  0.709671907040328


**DECISION TREE CLASSIFIER**

In [36]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0)

#without scaling
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

# With Scaling
clf.fit(X_train_scale,y_train)
y_pred = clf.predict(X_test_scale)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())

Without Scaling and CV:  0.7420212765957447
Without Scaling and With CV:  0.7569548872180452
With Scaling and Without CV:  0.7446808510638298
With Scaling and With CV:  0.755622009569378


**RANDOM FOREST CLASSIFIER**

In [37]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=10, random_state=0)

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

# With Scaling
clf.fit(X_train_scale,y_train)
y_pred = clf.predict(X_test_scale)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())

Without Scaling and CV:  0.773936170212766
Without Scaling and With CV:  0.7871667805878333
With Scaling and Without CV:  0.7792553191489362
With Scaling and With CV:  0.7884825700615175


**SVM**

In [38]:
from sklearn.svm import SVC
#clf = SVC(gamma='auto')

svc = SVC()
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
clf = GridSearchCV(svc, parameters)

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Best Parameters:", clf.best_params_)
print("Best Score:", clf.best_score_)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

# With Scaling
clf.fit(X_train_scale,y_train)
y_pred = clf.predict(X_test_scale)
print("Best Parameters:", clf.best_params_)
print("Best Score:", clf.best_score_)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())

Best Parameters: {'C': 10, 'kernel': 'rbf'}
Best Score: 0.7332989336085312
Without Scaling and CV:  0.7606382978723404
Without Scaling and With CV:  0.7254101161995898
Best Parameters: {'C': 1, 'kernel': 'rbf'}
Best Score: 0.7858531131750945
With Scaling and Without CV:  0.7792553191489362
With Scaling and With CV:  0.7805707450444291


In [39]:
from sklearn.svm import NuSVC
clf = NuSVC()

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

# With Scaling
clf.fit(X_train_scale,y_train)
y_pred = clf.predict(X_test_scale)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())

Without Scaling and CV:  0.7606382978723404
Without Scaling and With CV:  0.7766233766233768
With Scaling and Without CV:  0.7819148936170213
With Scaling and With CV:  0.7779733424470268


**NAIVE BAYES**

In [40]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

#y_pred = gnb.fit(X_train, y_train).predict(X_test)
#print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

# With Scaling
clf.fit(X_train_scale,y_train)
y_pred = clf.predict(X_test_scale)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())

Without Scaling and CV:  0.75
Without Scaling and With CV:  0.7293062200956937
With Scaling and Without CV:  0.75
With Scaling and With CV:  0.7293062200956937


In [41]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

Without Scaling and CV:  0.6462765957446809
Without Scaling and With CV:  0.6241626794258373


In [42]:
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())

# With Scaling
clf.fit(X_train_scale,y_train)
y_pred = clf.predict(X_test_scale)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())

Without Scaling and CV:  0.5851063829787234
Without Scaling and With CV:  0.5637730690362269
With Scaling and Without CV:  0.6914893617021277
With Scaling and With CV:  0.658339029391661


**KNN**

In [43]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=3)

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ",scores.mean())


# With Scaling
clf.fit(X_train_scale,y_train)
y_pred = clf.predict(X_test_scale)
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train_scale, y_train, cv=10)
print("With Scaling and With CV: ",scores.mean())

Without Scaling and CV:  0.7180851063829787
Without Scaling and With CV:  0.7489405331510596
With Scaling and Without CV:  0.675531914893617
With Scaling and With CV:  0.7188824333561177


**MODEL SELECTION - RANDOM FOREST PERFORMED WELL**

In [44]:
clf = RandomForestClassifier(max_depth=10, random_state=0)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("With CV: ",scores.mean())
print("Precision Score: ", precision_score(y_test, y_pred))
print("Recall Score: ", recall_score(y_test, y_pred))
print("F1 Score: ", f1_score(y_test, y_pred))

Without CV:  0.773936170212766
With CV:  0.7871667805878333
Precision Score:  0.8601036269430051
Recall Score:  0.7410714285714286
F1 Score:  0.7961630695443646


**TUNING THE MODEL**



Using Hyper-Parameter tuning using GridsearchCV Hypertune the parameters for Random forest and get best parameters

In [45]:
param_grid = {
    'bootstrap': [False,True],
    'max_depth': [5,8,10, 20],
    'max_features': [3, 4, 5, None],
    'min_samples_split': [2, 10, 12],
    'n_estimators': [100, 200, 300]
}

rfc = RandomForestClassifier()

clf = GridSearchCV(estimator = rfc, param_grid = param_grid, cv = 5, n_jobs = -1, verbose = 1)

clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Accuracy: ",accuracy_score(y_test,y_pred))
print(clf.best_params_)
print(clf.best_estimator_)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Accuracy:  0.8111702127659575
{'bootstrap': True, 'max_depth': 5, 'max_features': 5, 'min_samples_split': 2, 'n_estimators': 300}
RandomForestClassifier(max_depth=5, max_features=5, n_estimators=300)


**Training the final model with best parameters:**

In [46]:
clf = RandomForestClassifier(bootstrap=False, max_depth=5,max_features=None,
                             min_samples_split=2,
                             n_estimators=100, random_state=0)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Without CV: ",accuracy_score(y_test,y_pred))
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("With CV: ",scores.mean())
print("Precision Score: ", precision_score(y_test, y_pred))
print("Recall Score: ", recall_score(y_test, y_pred))
print("F1 Score: ", f1_score(y_test, y_pred))

Without CV:  0.8164893617021277
With CV:  0.8108509911141489
Precision Score:  0.9428571428571428
Recall Score:  0.7366071428571429
F1 Score:  0.8270676691729323


**Deploy the model**




Dump the model

In [47]:
pickle.dump(clf,open('model.pkl','wb'))

LOAD THE MODEL

In [48]:
model = pickle.load(open('model.pkl','rb'))

MAKE PREDICTIONS

In [50]:
age = 22
gender = 0 # 1=Male, 0=Female
stream = 2  # Electronics And Communication': 1,
#              'Computer Science': 2,
#              'Information Technology': 3,
#              'Mechanical':4,
#              'Electrical':5,
#              'Civil':6
Internships = 5
CGPA = 9
Hostel = 1 # 1= stay in hostel, 0=not staying in hostel
HistoryOfBacklogs = 1 # 1 = had backlogs, 0=no backlogs

prediction = clf.predict([[age,gender,stream,Internships,CGPA,Hostel,HistoryOfBacklogs]])
prediction


X does not have valid feature names, but RandomForestClassifier was fitted with feature names



array([1])

In [51]:
!pip install gradio joblib

Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.0 (from gradio)
  Downloading gradio_client-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6

**CREATING A GRADIO INTERFACE FOR GLOBAL USAGE**

In [52]:
import gradio as gr
import joblib

# Load your saved model
clf = joblib.load("model.pkl")

# Label encodings (same as your model's training)
def encode_inputs(age, gender, stream, internships, cgpa, hostel, backlogs):
    gender = 1 if gender == "Male" else 0
    stream_map = {
        "Electronics And Communication": 1,
        "Computer Science": 2,
        "Information Technology": 3,
        "Mechanical": 4,
        "Electrical": 5,
        "Civil": 6
    }
    stream = stream_map[stream]
    hostel = 1 if hostel == "Yes" else 0
    backlogs = 1 if backlogs == "Yes" else 0
    return [age, gender, stream, internships, cgpa, hostel, backlogs]

# Prediction function
def predict_placement(age, gender, stream, internships, cgpa, hostel, backlogs):
    features = encode_inputs(age, gender, stream, internships, cgpa, hostel, backlogs)
    pred = clf.predict([features])[0]
    return "🎓 Placed ✅" if pred == 1 else "❌ Not Placed"

In [53]:
ui = gr.Interface(
    fn=predict_placement,
    inputs=[
        gr.Slider(18, 30, value=22, label="Age"),
        gr.Radio(["Male", "Female"], label="Gender"),
        gr.Dropdown([
            "Electronics And Communication", "Computer Science",
            "Information Technology", "Mechanical", "Electrical", "Civil"
        ], label="Stream"),
        gr.Slider(0, 10, value=0, label="Internships"),
        gr.Slider(0, 10, value=0, label="CGPA"),
        gr.Radio(["Yes", "No"], label="Hostel"),
        gr.Radio(["Yes", "No"], label="History of Backlogs")
    ],
    outputs=gr.Textbox(label="Placement Prediction"),
    title="🎓 Placement Predictor",
    description="Enter student details to predict if they are likely to get placed."
)

ui.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ff1e1e867c5a666e8a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


