In [None]:
import joblib 
import numpy as np
import pandas as pd
from sklearn.svm import SVC,SVR
from sklearn.pipeline import make_pipeline 
from sklearn.metrics import confusion_matrix,mean_squared_error
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.preprocessing import LabelEncoder ,OneHotEncoder  ,StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV

In [None]:
# Load the placement dataset and discard the columns that the rest of the
# notebook never reads.
unused_columns = ['sl_no', 'ssc_b', 'hsc_b', 'specialisation', 'mba_p']
raw_frame = pd.read_csv('./Placement_Data_Full_Class.csv')
data = raw_frame.drop(columns=unused_columns)
# data.head()

Unnamed: 0,gender,ssc_p,hsc_p,hsc_s,degree_p,degree_t,workex,etest_p,status,salary
0,M,67.0,91.0,Commerce,58.0,Sci&Tech,No,55.0,Placed,270000.0
1,M,79.33,78.33,Science,77.48,Sci&Tech,Yes,86.5,Placed,200000.0
2,M,65.0,68.0,Arts,64.0,Comm&Mgmt,No,75.0,Placed,250000.0
3,M,56.0,52.0,Science,52.0,Sci&Tech,No,66.0,Not Placed,
4,M,85.8,73.6,Commerce,73.3,Comm&Mgmt,No,96.8,Placed,425000.0


In [None]:
# Summarise the distinct values of each categorical column and the observed
# salary range.
print('Categories of various parameters :-')
print('_'*64)
print()

categorical_summaries = (
    ('1. Gender :', 'gender'),
    ('2. Under-graduate stream of degree: ', 'degree_t'),
    ('3. HSC Board stream of Education: ', 'hsc_s'),
)
for heading, column_name in categorical_summaries:
    distinct_values = data[column_name].unique().tolist()
    print(heading, ', '.join(distinct_values))

salary_column = data['salary']
# min()/max() skip missing salaries, so only rows with a recorded salary count.
print('4. Salary range of placed candidates: ', salary_column.min(), '₹ -', salary_column.max(), '₹')

Categories of various parameters :-
________________________________________________________________

1. Gender : M, F
2. Under-graduate stream of degree:  Sci&Tech, Comm&Mgmt, Others
3. HSC Board stream of Education:  Commerce, Science, Arts
4. Salary range of placed candidates:  200000.0 ₹ - 940000.0 ₹


In [None]:
# Encode the two-valued categorical columns as 0/1 integers.
#
# The mapped column is assigned back to the frame rather than calling
# `replace(..., inplace=True)` on `data[col]`: an in-place call on a selected
# column is a chained assignment, which pandas deprecates and which does not
# update the parent frame when Copy-on-Write is enabled.
binary_encodings = {
    'gender': {'F': 0, 'M': 1},
    'workex': {'No': 0, 'Yes': 1},
    'status': {'Not Placed': 0, 'Placed': 1},
}
for column_name, encoding in binary_encodings.items():
    # Values missing from the mapping are passed through unchanged (same as
    # `replace`), which also keeps this cell safe to re-run on encoded data.
    data[column_name] = data[column_name].map(
        lambda value, encoding=encoding: encoding.get(value, value)
    )

In [None]:
# One-hot encode the two multi-valued categorical columns.
#
# `pd.get_dummies` replaces the LabelEncoder -> OneHotEncoder(sparse=False)
# round trip: the `sparse` argument no longer exists in current scikit-learn
# releases (it became `sparse_output`), and the indicator columns are now named
# from the data itself instead of a hard-coded list that had to match the
# encoder's ordering. Categories come out in sorted order, so for the values
# present in this dataset the resulting columns are
#   Comm&Mgmt, Others, Sci&Tech, Arts, Commerce, Science
# exactly as before. `dtype=float` keeps the 0.0/1.0 representation.
degree_indicators = pd.get_dummies(data['degree_t'], dtype=float)
stream_indicators = pd.get_dummies(data['hsc_s'], dtype=float)

# Append the indicator columns (aligned on the row index) and drop the original
# text columns they were derived from.
data = pd.concat(
    [data.drop(columns=['hsc_s', 'degree_t']), degree_indicators, stream_indicators],
    axis=1,
)
# data.head()

Unnamed: 0,gender,ssc_p,hsc_p,degree_p,workex,etest_p,status,salary,Comm&Mgmt,Others,Sci&Tech,Arts,Commerce,Science
0,1,67.0,91.0,58.0,0,55.0,1,270000.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1,79.33,78.33,77.48,1,86.5,1,200000.0,0.0,0.0,1.0,0.0,0.0,1.0
2,1,65.0,68.0,64.0,0,75.0,1,250000.0,1.0,0.0,0.0,1.0,0.0,0.0
3,1,56.0,52.0,52.0,0,66.0,0,,0.0,0.0,1.0,0.0,0.0,1.0
4,1,85.8,73.6,73.3,0,96.8,1,425000.0,1.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# Separate the two prediction targets from the feature matrix.
#   y1: placement status (encoded as 0/1 earlier in the notebook)
#   y2: salary, with missing values replaced by 0
#
# The fill is applied to the extracted Series instead of calling
# `data['salary'].fillna(0, inplace=True)`: that chained in-place call is
# deprecated in pandas and leaves the frame untouched under Copy-on-Write,
# which would let NaN leak into `y2`.
y1 = data['status']
y2 = data['salary'].fillna(0)

# From here on `data` holds only the feature columns.
data = data.drop(columns=['status', 'salary'])

In [None]:
# Hold out 20% of the rows for evaluating the placement classifiers; the fixed
# seed makes the split repeatable.
X_train, X_test, y1_train, y1_test = train_test_split(
    data,
    y1,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Baseline placement classifier: a shallow, seeded random forest.
rfc = RandomForestClassifier(random_state=0, max_depth=2).fit(X_train, y1_train)

# Evaluate on the held-out rows; the confusion matrix is the cell's output.
y1_pred = rfc.predict(X_test)
confusion_matrix(y1_test, y1_pred)

array([[ 4,  8],
       [ 2, 29]])

In [None]:
# Exhaustive hyper-parameter search for the random forest classifier.
# The grid has 10 * 5 * 3 * 3 * 3 = 1350 combinations, each evaluated with
# 5-fold cross-validation (6750 fits), so this cell is slow to run.
params = {
    'n_estimators': list(range(100, 1001, 100)),
    'max_depth': list(range(4, 9)),
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [2, 3, 4],
}

grid_search = GridSearchCV(
    estimator=rfc,
    param_grid=params,
    cv=5,
    n_jobs=-1,   # use every available core
    verbose=2,
)
grid_search.fit(X_train, y1_train)

print(grid_search.best_params_)

Fitting 5 folds for each of 1350 candidates, totalling 6750 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   23.8s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 1009 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 1454 tasks      | elapsed: 18.7min
[Parallel(n_jobs=-1)]: Done 1981 tasks      | elapsed: 25.4min
[Parallel(n_jobs=-1)]: Done 2588 tasks      | elapsed: 33.2min
[Parallel(n_jobs=-1)]: Done 3277 tasks      | elapsed: 42.1min
[Parallel(n_jobs=-1)]: Done 4046 tasks      | elapsed: 52.2min
[Parallel(n_jobs=-1)]: Done 4897 tasks      | elapsed: 63.1min
[Parallel(n_jobs=-1)]: Done 5828 tasks      | elapsed: 75.1min
[Parallel(n_jobs=-1)]: Done 6750 out of 6750 | elapsed: 87.2min finished


{'max_depth': 5, 'max_features': 2, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500}


In [None]:
# Refit the random forest with the best parameters reported by the grid search
# (copied from its printed output so this cell does not require re-running the
# search).
best_rf_params = {
    'max_depth': 5,
    'max_features': 2,
    'min_samples_leaf': 1,
    'min_samples_split': 5,
    'n_estimators': 500,
}

# The grid search tuned an estimator created with random_state=0; reuse that
# seed so the refit is reproducible instead of changing on every run.
rfc = RandomForestClassifier(random_state=0, **best_rf_params)
rfc.fit(X_train, y1_train)

y1_pred = rfc.predict(X_test)
confusion_matrix(y1_test, y1_pred)

array([[ 4,  8],
       [ 2, 29]])

In [None]:
# Support-vector classifier for placement status. Features are standardised
# inside the pipeline, so the scaler is fitted on the training rows only.
svc_steps = (StandardScaler(), SVC(gamma='auto'))
svc = make_pipeline(*svc_steps).fit(X_train, y1_train)

y1_pred2 = svc.predict(X_test)
confusion_matrix(y1_test, y1_pred2)

array([[ 6,  6],
       [ 2, 29]])

In [None]:
# Logistic-regression classifier for placement status, with standardised
# features.
lr = make_pipeline(StandardScaler(), LogisticRegression())
lr.fit(X_train, y1_train)

# Predict with the logistic-regression pipeline itself. The previous version
# called `svc.predict` here, so the matrix it reported was the SVC's result.
# NOTE: any stored output for this cell predates this fix; re-run to refresh.
y1_pred3 = lr.predict(X_test)
confusion_matrix(y1_test, y1_pred3)

array([[ 6,  6],
       [ 2, 29]])

In [None]:
# Persist the fitted SVC pipeline to disk, then reload it and predict on the
# held-out rows to confirm the round trip works.
svc_model_path = 'svc.pkl'
joblib.dump(svc, svc_model_path)

svc = joblib.load(svc_model_path)
svc.predict(X_test)

array([1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [None]:
# Salary target: make sure no missing values remain before fitting regressors.
y2 = y2.fillna(0)

# Same test fraction and seed as the classification split. Note that this
# rebinds X_train / X_test.
X_train, X_test, y2_train, y2_test = train_test_split(
    data,
    y2,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Ordinary least-squares salary model.
# Note: this rebinds `lr`, which previously held the logistic-regression
# pipeline; from here on `lr` is the salary regressor.
lr = LinearRegression().fit(X_train, y2_train)

y_pred = lr.predict(X_test)
linear_mse = mean_squared_error(y2_test, y_pred)
print('RMSE:', linear_mse ** 0.5)

RMSE: 121280.37835382161


In [None]:
# Random-forest salary model.
# A fixed seed is supplied so the reported RMSE is reproducible; without it the
# forest (and therefore the score) changes on every run.
rfr = RandomForestRegressor(random_state=0)
rfr.fit(X_train, y2_train)

y_pred = rfr.predict(X_test)
print('RMSE:', mean_squared_error(y2_test, y_pred)**0.5)

RMSE: 134503.82051286922


In [None]:
# Support-vector salary model with default hyper-parameters.
# NOTE(review): unlike the SVC above, the features are not standardised here —
# confirm whether that is intentional.
svr = SVR().fit(X_train, y2_train)

y_pred = svr.predict(X_test)
svr_mse = mean_squared_error(y2_test, y_pred)
print('RMSE:', svr_mse ** 0.5)

RMSE: 152648.1123855088


In [None]:
# Two-stage prediction for a single candidate (the first held-out row):
#   1. the SVC decides whether the candidate is placed;
#   2. if placed, the linear-regression model (`lr`, as rebound in the salary
#      section) estimates the salary; otherwise the salary is 0.
#
# Selecting with a list (`iloc[[0]]`) keeps a one-row DataFrame, so the models
# receive the same column names they were fitted with. Converting the row to a
# bare ndarray discards those names and triggers scikit-learn's feature-name
# warning.
a = X_test.iloc[[0]]

# Both predictors return one-element arrays; unwrap them so the condition is a
# plain scalar comparison and `Salary` is always a float.
if svc.predict(a)[0] == 0:
    Salary = 0.0
else:
    Salary = float(lr.predict(a)[0])
print('Predicted Salary:', Salary)

Predicted Salary: [193027.97822188]
