In [3]:
import numpy as np  
import pandas as pd  
  
import matplotlib.pyplot as plt  
import plotly.express as px  
from plotly.offline import init_notebook_mode, iplot  
init_notebook_mode(connected=True)  
  
from sklearn.decomposition import PCA  
  
from sklearn. preprocessing import StandardScaler  
from sklearn.model_selection import train_test_split  
from sklearn.model_selection import cross_val_score  
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV  
  
from sklearn.metrics import accuracy_score  
from sklearn.metrics import precision_score, recall_score, f1_score  
  
import pickle  

In [4]:
dataframe = pd.read_csv('collegePlace.csv')

In [5]:
dataframe.shape

(2966, 8)

In [6]:
dataframe.head()

Unnamed: 0,Age,Gender,Stream,Internships,CGPA,Hostel,HistoryOfBacklogs,PlacedOrNot
0,22,Male,Electronics And Communication,1,8,1,1,1
1,21,Female,Computer Science,0,7,1,1,1
2,22,Female,Information Technology,1,6,0,0,1
3,21,Male,Information Technology,0,8,0,1,1
4,22,Male,Mechanical,0,8,1,0,1


In [7]:
dataframe.isnull().sum() 

Age                  0
Gender               0
Stream               0
Internships          0
CGPA                 0
Hostel               0
HistoryOfBacklogs    0
PlacedOrNot          0
dtype: int64

In [8]:
print(dataframe.duplicated().sum())   
dataframe.drop_duplicates(inplace=True)  

1829


In [9]:
figure = px.scatter(dataframe, x="CGPA", y="Internships", color="PlacedOrNot",  
                 hover_data=['CGPA'])  
figure.show()  

In [10]:
px.histogram(dataframe, x='PlacedOrNot', color='PlacedOrNot', barmode='group') 

In [11]:
figure = px.pie(dataframe, values=dataframe['PlacedOrNot'].value_counts().values, names=dataframe['PlacedOrNot'].value_counts().index, title='Placed Vs Not Placed')  
figure.show() 

In [12]:
print("Max Internships Done by the Placed Student: ",dataframe[(dataframe['Internships'] == dataframe['Internships'].max()) & (dataframe['PlacedOrNot']==1)]['Internships'].values[0])  
print("No of students who did max Internships and are placed: ",dataframe[(dataframe['Internships'] == dataframe['Internships'].max()) & (dataframe['PlacedOrNot']==1)]['Internships'].value_counts().values[0])  
  
print("Min Internships Done by the Placed Person: ",dataframe[(dataframe['Internships'] == dataframe['Internships'].min()) & (dataframe['PlacedOrNot']==1)]['Internships'].values[0])  
print("No of students who did min Internships and are placed: ",dataframe[(dataframe['Internships'] == dataframe['Internships'].min()) & (dataframe['PlacedOrNot']==1)]['Internships'].value_counts().values[0])  

Max Internships Done by the Placed Student:  3
No of students who did max Internships and are placed:  35
Min Internships Done by the Placed Person:  0
No of students who did min Internships and are placed:  232


In [13]:
print("Max CGPA of Placed Student: ",dataframe[(dataframe['CGPA'] == dataframe['CGPA'].max()) & (dataframe['PlacedOrNot']==1)]['CGPA'].values[0])  
print("No of students has max CGPA and are placed: ",dataframe[(dataframe['CGPA'] == dataframe['CGPA'].max()) & (dataframe['PlacedOrNot']==1)]['CGPA'].value_counts().values[0])  
  
print("Min CGPA of Placed Person: ",dataframe[(dataframe['CGPA'] == dataframe['CGPA'].min()) & (dataframe['PlacedOrNot']==1)]['CGPA'].values[0])  
print("No of students has min CGPA and are placed: ",dataframe[(dataframe['CGPA'] == dataframe['CGPA'].min()) & (dataframe['PlacedOrNot']==1)]['CGPA'].value_counts().values[0])  

Max CGPA of Placed Student:  9
No of students has max CGPA and are placed:  99
Min CGPA of Placed Person:  5
No of students has min CGPA and are placed:  5


In [14]:
dataframe['Gender'] = dataframe['Gender'].map({'Male': 1, 'Female': 0})  

In [15]:
dataframe['Stream'].unique() 

array(['Electronics And Communication', 'Computer Science',
       'Information Technology', 'Mechanical', 'Electrical', 'Civil'],
      dtype=object)

In [16]:
dataframe['Stream'] = dataframe['Stream'].map({'Electronics And Communication': 1,  
                                 'Computer Science': 2,  
                                'Information Technology': 3,  
                                'Mechanical':4,  
                                'Electrical':5,  
                                'Civil':6})  

In [17]:
X = dataframe.iloc[:,0:7]  
y = dataframe.iloc[:,-1]  
X  

Unnamed: 0,Age,Gender,Stream,Internships,CGPA,Hostel,HistoryOfBacklogs
0,22,1,1,1,8,1,1
1,21,0,2,0,7,1,1
2,22,0,3,1,6,0,0
3,21,1,3,0,8,0,1
4,22,1,4,0,8,1,0
...,...,...,...,...,...,...,...
2946,23,1,3,1,7,1,1
2952,23,1,4,0,8,1,0
2954,23,0,2,1,8,0,1
2958,23,1,2,0,6,0,1


In [18]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33)  

In [19]:
scaler = StandardScaler()  
X_train_scale = scaler.fit_transform(X_train)  
X_test_scale = scaler.transform(X_test) 

In [20]:
from sklearn.ensemble import RandomForestClassifier  
  
classifier = RandomForestClassifier(max_depth=10, random_state=0)  
  
classifier.fit(X_train,y_train)  
y_pred = classifier.predict(X_test)  
print("Without Scaling and CV: ",accuracy_score(y_test,y_pred))  
scores = cross_val_score(classifier, X_train, y_train, cv=10)  
print("Without Scaling and With CV: ",scores.mean())  
  
# With Scaling  
classifier.fit(X_train_scale,y_train)  
y_pred = classifier.predict(X_test_scale)  
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred))  
scores = cross_val_score(classifier, X_train_scale, y_train, cv=10)  
print("With Scaling and With CV: ",scores.mean())

Without Scaling and CV:  0.7686170212765957
Without Scaling and With CV:  0.7543574846206426
With Scaling and Without CV:  0.7659574468085106
With Scaling and With CV:  0.7543574846206424


In [21]:
 
classifier = RandomForestClassifier(max_depth=10, random_state=0)  
classifier.fit(X_train,y_train)  
y_pred = classifier.predict(X_test)  
print("Without CV: ",accuracy_score(y_test,y_pred))  
scores = cross_val_score(classifier, X_train, y_train, cv=10)  
print("With CV: ",scores.mean())  
print("Precision Score: ", precision_score(y_test, y_pred))  
print("Recall Score: ", recall_score(y_test, y_pred))  
print("F1 Score: ", f1_score(y_test, y_pred)) 

Without CV:  0.7686170212765957
With CV:  0.7543574846206426
Precision Score:  0.8333333333333334
Recall Score:  0.7709251101321586
F1 Score:  0.8009153318077803


In [22]:
param_grid = {  
    'bootstrap': [False,True],  
    'max_depth': [5,8,10, 20],  
    'max_features': [3, 4, 5, None],  
    'min_samples_split': [2, 10, 12],  
    'n_estimators': [100, 200, 300]  
}  
  
rfclassifier = RandomForestClassifier()  
  
classifier = GridSearchCV(estimator = rfclassifier, param_grid = param_grid, cv = 5, n_jobs = -1, verbose = 1)  
  
classifier.fit(X_train,y_train)  
y_pred = classifier.predict(X_test)  
print("Accuracy: ",accuracy_score(y_test,y_pred))  
print(classifier.best_params_)  
print(classifier.best_estimator_)  

Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Accuracy:  0.8271276595744681
{'bootstrap': False, 'max_depth': 5, 'max_features': 5, 'min_samples_split': 12, 'n_estimators': 100}
RandomForestClassifier(bootstrap=False, max_depth=5, max_features=5,
                       min_samples_split=12)


In [23]:
classifier = RandomForestClassifier(bootstrap=False, max_depth=5,max_features=None,  
                             min_samples_split=2,  
                             n_estimators=100, random_state=0)  
classifier.fit(X_train,y_train)  
y_pred = classifier.predict(X_test)  
print("Without CV: ",accuracy_score(y_test,y_pred))  
scores = cross_val_score(classifier, X_train, y_train, cv=10)  
print("With CV: ",scores.mean())  
print("Precision Score: ", precision_score(y_test, y_pred))  
print("Recall Score: ", recall_score(y_test, y_pred))  
print("F1 Score: ", f1_score(y_test, y_pred))

Without CV:  0.8271276595744681
With CV:  0.814747095010253
Precision Score:  0.945054945054945
Recall Score:  0.7577092511013216
F1 Score:  0.841075794621027


In [24]:
print("With Scaling and Without CV: ",accuracy_score(y_test,y_pred)) 

With Scaling and Without CV:  0.8271276595744681


In [25]:
dataframe.head()

Unnamed: 0,Age,Gender,Stream,Internships,CGPA,Hostel,HistoryOfBacklogs,PlacedOrNot
0,22,1,1,1,8,1,1,1
1,21,0,2,0,7,1,1,1
2,22,0,3,1,6,0,0,1
3,21,1,3,0,8,0,1,1
4,22,1,4,0,8,1,0,1


In [26]:
if(classifier.predict([[22,1,1,1,8,1,1]])):
    print("You will be placed and it is suggested that try for higher package")
else:
    print("You will not be placed focus on internships and trainings")    


You will be placed and it is suggested that try for higher package



X does not have valid feature names, but RandomForestClassifier was fitted with feature names



In [27]:
import pickle

model1=open('model1.pkl','wb')
pickle.dump(classifier,model1)
model1.close()