<a href="https://colab.research.google.com/github/yanaySG/XGBOOST_SALARIES/blob/main/xgboost_USsalaries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd 

from sklearn.model_selection import train_test_split
import xgboost

from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

In [2]:
url = "https://raw.githubusercontent.com/yanaySG/XGBOOST_SALARIES/main/adult.data"


names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 
         'marital_status', 'occupation', 'relationship', 'race', 'sex', 
         'capital_gain', 'capital_loss', 'hours-per-week', 'native_country',
         'label']
 


In [3]:
df_salaries = pd.read_csv(url, sep=', ', header=None)
df_salaries.columns = names

In [4]:
df_salaries.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours-per-week,native_country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [5]:
df_salaries.isnull().sum().sum()

0

In [6]:
df_salaries.shape

(32561, 15)

In [7]:
df_salaries.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours-per-week     int64
native_country    object
label             object
dtype: object

In [8]:
df_salaries.describe(include=['O'])

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,native_country,label
count,32561,32561,32561,32561,32561,32561,32561,32561,32561
unique,9,16,7,15,6,5,2,42,2
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,<=50K
freq,22696,10501,14976,4140,13193,27816,21790,29170,24720


In [9]:
df_salaries.describe(include=[np.number])

Unnamed: 0,age,fnlwgt,education-num,capital_gain,capital_loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [10]:
df_salaries.label = np.where(df_salaries.label == '<=50K', 0, 1)
df_salaries.label.unique()

array([0, 1])

In [11]:
df_salaries['dif_capital'] = df_salaries.capital_gain - df_salaries.capital_loss

In [12]:
df_salaries['sex'] = np.where(df_salaries.sex == 'Female', 1, 0)
df_salaries.sex.unique()

array([0, 1])

In [13]:
categorical = list(set(df_salaries.columns) - set(df_salaries._get_numeric_data().columns))
print(categorical)
print(type(categorical))

['workclass', 'relationship', 'race', 'native_country', 'marital_status', 'education', 'occupation']
<class 'list'>


In [14]:
# change categorical data for its frequency in the each column
for col in categorical:
  df_salaries[col] = df_salaries.groupby(col)[col].transform('count')

In [15]:
df_salaries.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours-per-week,native_country,label,dif_capital
0,39,1298,77516,5355,13,10683,3770,8305,27816,0,2174,0,40,29170,0,2174
1,50,2541,83311,5355,13,14976,4066,13193,27816,0,0,0,13,29170,0,0
2,38,22696,215646,10501,9,4443,1370,8305,27816,0,0,0,40,29170,0,0
3,53,22696,234721,1175,7,14976,1370,13193,3124,0,0,0,40,29170,0,0
4,28,22696,338409,5355,13,14976,4140,1568,3124,1,0,0,40,95,0,0
5,37,22696,284582,1723,14,14976,4066,1568,27816,1,0,0,40,29170,0,0
6,49,22696,160187,514,5,418,3295,8305,3124,1,0,0,16,81,0,0
7,52,2541,209642,10501,9,14976,4066,13193,27816,0,0,0,45,29170,1,0
8,31,22696,45781,1723,14,10683,4140,8305,27816,1,14084,0,50,29170,1,14084
9,42,22696,159449,5355,13,14976,4066,13193,27816,0,5178,0,40,29170,1,5178


In [16]:
X = df_salaries.loc[:,df_salaries.columns != 'label']
y = df_salaries.loc[:, 'label']

In [17]:
X.shape, y.shape, df_salaries.shape

((32561, 15), (32561,), (32561, 16))

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.75, random_state=1)

In [19]:
 xgb = xgboost.XGBClassifier()

In [20]:
parameters = {'nthreads' : [1],                          # number of threads 
              'objective': ['binary:logistic'],          # logistic clasification
              'learnig rate': [0.1, 0.5, 0.9],              # tasa de aprendizaje
              'n_estimator' : [100, 200]                      # number of trees
              }

In [21]:

fit_params = {'verbose': False,
              'early_stopping_rounds': 10,          # si en 10 rondas no se mejora la función de pérdida se detiene el entrenamiento
              'eval_metric': 'logloss',             # función de pérdida
              'eval_set': [(X_test, y_test)]}         # conjunto en el que se va a optimizar la función de pérdida

          

In [22]:
clf = GridSearchCV(xgb, param_grid=parameters, cv=3, scoring='accuracy', verbose=3)

In [23]:
clf.fit(X_train, y_train, **fit_params)


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END learnig rate=0.1, n_estimator=100, nthreads=1, objective=binary:logistic;, score=0.854 total time=   1.0s
[CV 2/3] END learnig rate=0.1, n_estimator=100, nthreads=1, objective=binary:logistic;, score=0.863 total time=   0.9s
[CV 3/3] END learnig rate=0.1, n_estimator=100, nthreads=1, objective=binary:logistic;, score=0.861 total time=   0.9s
[CV 1/3] END learnig rate=0.1, n_estimator=200, nthreads=1, objective=binary:logistic;, score=0.854 total time=   0.9s
[CV 2/3] END learnig rate=0.1, n_estimator=200, nthreads=1, objective=binary:logistic;, score=0.863 total time=   0.9s
[CV 3/3] END learnig rate=0.1, n_estimator=200, nthreads=1, objective=binary:logistic;, score=0.861 total time=   0.9s
[CV 1/3] END learnig rate=0.5, n_estimator=100, nthreads=1, objective=binary:logistic;, score=0.854 total time=   0.9s
[CV 2/3] END learnig rate=0.5, n_estimator=100, nthreads=1, objective=binary:logistic;, score=0.863 total t

GridSearchCV(cv=3, estimator=XGBClassifier(),
             param_grid={'learnig rate': [0.1, 0.5, 0.9],
                         'n_estimator': [100, 200], 'nthreads': [1],
                         'objective': ['binary:logistic']},
             scoring='accuracy', verbose=3)