# Load the dataset

In [92]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

df = pd.read_csv('Campus Recruitment.csv')

In [93]:
df.dtypes

sl_no               int64
gender             object
ssc_p             float64
ssc_b              object
hsc_p             float64
hsc_b              object
hsc_s              object
degree_p          float64
degree_t           object
workex             object
etest_p           float64
specialisation     object
mba_p             float64
status             object
salary            float64
dtype: object

In [94]:
df.drop(columns=['ssc_b', 'hsc_b'], inplace=True)

In [95]:
from sklearn.preprocessing import LabelEncoder

numeric_features = df._get_numeric_data().columns
cat_features = list(set(df.columns) - set(numeric_features))
le = LabelEncoder()
for col in cat_features:
    df[col] = le.fit_transform(df[col])

In [96]:
df.head()

Unnamed: 0,sl_no,gender,ssc_p,hsc_p,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,1,67.0,91.0,1,58.0,2,0,55.0,1,58.8,1,270000.0
1,2,1,79.33,78.33,2,77.48,2,1,86.5,0,66.28,1,200000.0
2,3,1,65.0,68.0,0,64.0,0,0,75.0,0,57.8,1,250000.0
3,4,1,56.0,52.0,2,52.0,2,0,66.0,1,59.43,0,
4,5,1,85.8,73.6,1,73.3,0,0,96.8,0,55.5,1,425000.0


### Split the data into train and test sets

In [97]:
X_train = df[df.salary.notna()].drop(columns='salary')
y_train = df.salary[df.salary.notna()]
X_test = df[df.salary.isna()].drop(columns='salary')

### Use XGBoost to predict the missing salary

In [98]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

parameters = {'objective':['reg:linear'],
              'learning_rate': [.01, 0.05, 0.1, 0.2, 0.3], #so called `eta` value
              'gamma': [0.5, 1, 1.5, 2, 5],
              'subsample': [0.6, 0.8, 1.0],
              'min_child_weight': [1, 5, 10],
              'silent': [1],
              'n_estimators': [500, 600, 700],
                'colsample_bytree': [0.6, 0.8, 1.0],
                'max_depth': [3, 4, 5],
                'reg_alpha': [1.1, 1.2, 1.3],
                'reg_lambda': [1.1, 1.2, 1.3],
                'subsample': [0.7, 0.8, 0.9]
             }

xgb_grid = GridSearchCV(XGBRegressor(),
                        parameters,
                        cv = 2,
                        n_jobs = 2,
                        verbose=True)

In [99]:
xgb_grid.fit(X_train,y_train)

Fitting 2 folds for each of 54675 candidates, totalling 109350 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 164 tasks      | elapsed:    5.7s
[Parallel(n_jobs=2)]: Done 764 tasks      | elapsed:   25.2s
[Parallel(n_jobs=2)]: Done 1764 tasks      | elapsed:  1.0min
[Parallel(n_jobs=2)]: Done 3164 tasks      | elapsed:  1.8min
[Parallel(n_jobs=2)]: Done 4964 tasks      | elapsed:  2.9min
[Parallel(n_jobs=2)]: Done 7164 tasks      | elapsed:  4.3min
[Parallel(n_jobs=2)]: Done 9764 tasks      | elapsed:  5.8min
[Parallel(n_jobs=2)]: Done 12764 tasks      | elapsed:  7.7min
[Parallel(n_jobs=2)]: Done 16164 tasks      | elapsed:  9.7min
[Parallel(n_jobs=2)]: Done 19964 tasks      | elapsed: 12.0min
[Parallel(n_jobs=2)]: Done 24164 tasks      | elapsed: 14.5min


KeyboardInterrupt: 

In [None]:
print(xgb_grid.best_params_)
gbm = XGBRegressor(**xgb_grid.best_params_)
gbm.fit(X_train,y_train)

In [None]:
predictions = gbm.predict(X_test)

In [None]:
predictions