https://www.kaggle.com/benroshan/factors-affecting-campus-placement

In [67]:
from IPython.display import Image
import numpy as np
import pandas as pd
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris, load_boston
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import KFold, RepeatedKFold, LeaveOneOut, LeavePOut, ShuffleSplit, StratifiedKFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score 
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import learning_curve, validation_curve
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
%matplotlib inline 
sns.set(style="ticks")

In [74]:
data = pd.read_csv("data/Placement_Data_Full_Class.csv",sep=',')
data.dtypes

sl_no               int64
gender             object
ssc_p             float64
ssc_b              object
hsc_p             float64
hsc_b              object
hsc_s              object
degree_p          float64
degree_t           object
workex             object
etest_p           float64
specialisation     object
mba_p             float64
status             object
salary            float64
dtype: object

In [75]:
data.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


In [76]:
cleanup_nums = {"gender":     {"M": 0, "F": 1},
                "hsc_b": {"Central": 0, "Others": 1},
                "ssc_b": {"Central": 0, "Others": 1},
                "status": {"Not Placed": 0, "Placed": 1},
                "workex": {"No": 0, "Yes": 1}
               }
data = data.replace(cleanup_nums)
data["hsc_s"] = data["hsc_s"].astype('category')
data["degree_t"] = data["degree_t"].astype('category')
data["specialisation"] = data["specialisation"].astype('category')
data["hsc_s_cat"] = data["hsc_s"].cat.codes
data["specialisation_cat"] = data["specialisation"].cat.codes
data["degree_t_cat"] = data["degree_t"].cat.codes
data.drop(["hsc_s", "degree_t", "specialisation"], axis=1, inplace = True)

imr = SimpleImputer(missing_values=np.nan, strategy='median')
imr = imr.fit(data[['salary']])
data['salary'] = imr.transform(data[['salary']])
data.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,degree_p,workex,etest_p,mba_p,status,salary,hsc_s_cat,specialisation_cat,degree_t_cat
0,1,0,67.0,1,91.0,1,58.0,0,55.0,58.8,1,270000.0,1,1,2
1,2,0,79.33,0,78.33,1,77.48,1,86.5,66.28,1,200000.0,2,0,2
2,3,0,65.0,0,68.0,0,64.0,0,75.0,57.8,1,250000.0,0,0,0
3,4,0,56.0,0,52.0,0,52.0,0,66.0,59.43,0,265000.0,2,1,2
4,5,0,85.8,0,73.6,0,73.3,0,96.8,55.5,1,425000.0,1,0,0


In [110]:
data.corr()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,degree_p,workex,etest_p,mba_p,status,salary,hsc_s_cat,specialisation_cat,degree_t_cat
sl_no,1.0,-0.074306,-0.078155,0.027214,-0.085711,0.116887,-0.088281,0.059151,0.063636,0.022327,-0.026859,0.04727,0.009955,0.04663,0.025651
gender,-0.074306,1.0,0.068969,-0.019429,0.021334,-0.065945,0.173217,-0.085153,-0.084294,0.300531,-0.09067,-0.140557,-0.071827,0.10616,-0.061345
ssc_p,-0.078155,0.068969,1.0,0.116194,0.511472,0.066996,0.538404,0.175675,0.261993,0.388478,0.607889,0.108669,0.236364,-0.172536,0.205896
ssc_b,0.027214,-0.019429,0.116194,1.0,-0.137013,0.605883,0.03807,-0.040744,-0.018991,0.08312,0.037297,0.009793,0.050919,-0.051565,0.100863
hsc_p,-0.085711,0.021334,0.511472,-0.137013,1.0,-0.019548,0.434206,0.141025,0.245113,0.354823,0.491228,0.122921,-0.164091,-0.24163,-0.08645
hsc_b,0.116887,-0.065945,0.066996,0.605883,-0.019548,1.0,0.067229,0.038357,0.039108,0.090201,0.016945,-0.003807,0.152227,0.002232,0.05796
degree_p,-0.088281,0.173217,0.538404,0.03807,0.434206,0.067229,1.0,0.122648,0.22447,0.402364,0.479861,0.053352,0.137276,-0.218286,0.079317
workex,0.059151,-0.085153,0.175675,-0.040744,0.141025,0.038357,0.122648,1.0,0.056735,0.168811,0.27606,0.156035,0.007856,-0.191174,0.105816
etest_p,0.063636,-0.084294,0.261993,-0.018991,0.245113,0.039108,0.22447,0.056735,1.0,0.218055,0.127639,0.169233,0.075643,-0.236315,0.011509
mba_p,0.022327,0.300531,0.388478,0.08312,0.354823,0.090201,0.402364,0.168811,0.218055,1.0,0.076922,0.155673,0.039345,-0.105728,0.116666


In [103]:
feature_cols = [
    'gender', 'ssc_p', 'ssc_b', 'hsc_p', 'hsc_b', 'degree_p', 'workex', 'etest_p', 'mba_p', 'salary', 'hsc_s_cat', 'specialisation_cat', 'degree_t_cat'
]
data_X = data.loc[:,feature_cols]
data_Y = data.loc[:, 'status']
data_X_train, data_X_test, data_y_train, data_y_test = train_test_split(
    data_X, data_Y,test_size=0.3, random_state=360)

In [104]:
cl1_1 = KNeighborsRegressor(n_neighbors=5)
cl1_1.fit(data_X_train, data_y_train)
target1_0 = cl1_1.predict(data_X_train)
target1_1 = cl1_1.predict(data_X_test)
r2_score(data_y_train, target1_0), r2_score(data_y_test, target1_1)

(0.9272580319256415, 0.7387706855791962)

In [105]:
scores = cross_val_score(KNeighborsRegressor(n_neighbors=5), 
                         data_X, data_Y, 
                         cv=4)
scores, np.mean(scores)

(array([0.89517647, 0.93442857, 0.97      , 0.85172619]), 0.9128328081232493)

In [106]:
grid = GridSearchCV(estimator = KNeighborsRegressor(), param_grid={'n_neighbors': range(1,50,1)}, cv=RepeatedKFold(n_splits=3, n_repeats=3),scoring="r2")
grid.fit(data_X,data_Y)
grid.best_score_ , grid.best_params_,grid.best_estimator_

(0.8884111166822057, {'n_neighbors': 4}, KNeighborsRegressor(n_neighbors=4))

In [107]:
grid.best_estimator_.fit(data_X_train, data_y_train)
target2_0 = grid.best_estimator_.predict(data_X_train)
target2_1 = grid.best_estimator_.predict(data_X_test)
r2_score(data_y_train, target2_0), r2_score(data_y_test, target2_1)

(0.9374873711860983, 0.7502955082742316)

In [109]:
scores = cross_val_score(grid.best_estimator_, data_X, data_Y, cv=RepeatedKFold(n_splits=3, n_repeats=3))
print("%0.2f r^2 with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.88 r^2 with a standard deviation of 0.05
