In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

Load cleaned up version of dataset from 2012 to 2018.  Drop geographic information about employer and worksite to reduce dimensionality

In [3]:
df = pd.read_pickle('/Users/minse_chang/PycharmProjects/Udacity_ML_Capstone/data/H1B_15-18_new.pickle')
df.drop(columns=['EMPLOYER_CITY','JOB_TITLE','EMPLOYER_NAME'], inplace=True)
df.drop(columns=['EMPLOYER_STATE','WORKSITE_STATE'], inplace=True)


In [4]:
ones_df = df[df['CASE_STATUS']=='DENIED']
zeros_df = df[df['CASE_STATUS']=='CERTIFIED']

In [5]:
# df = pd.concat([ones_df, zeros_df.sample(frac=0.1, random_state=99)])
df = pd.concat([ones_df, zeros_df.sample(frac=0.02, random_state=99)])


In [6]:
ones_df.head()
print(ones_df.shape)

(37310, 11)


In [7]:
zeros_df.head()
print(zeros_df.shape)

(2424938, 11)


We will use column 'CASE_STATUS' as our label.  DENIED will be 1 and CERTIFIED will be 0 after it gets incoded by label_encoder.

Normalize PREVAILING_WAGE using minmax scaler since it varies quite a bit.

In [8]:
scaler = MinMaxScaler()
X = df.drop(columns='CASE_STATUS')

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['CASE_STATUS'])

X[['PREVAILING_WAGE']] = scaler.fit_transform(X[['PREVAILING_WAGE']])
X = pd.get_dummies(X)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Check value counts to verify stratify split

In [9]:
# print(y_train.value_counts())
# print(y_test.value_counts())
X_train.head()

Unnamed: 0,PREVAILING_WAGE,WAGE_LOWER_THAN_PW,FULL_TIME_POSITION_N,FULL_TIME_POSITION_Y,H1B_DEPENDENT_N,H1B_DEPENDENT_Y,NAICS_CODE_11111,NAICS_CODE_111110,NAICS_CODE_11113,NAICS_CODE_111140,...,VISA_CLASS_H-1B,VISA_CLASS_H-1B1 Chile,VISA_CLASS_H-1B1 Singapore,WAGE_UNIT_OF_PAY_Bi-Weekly,WAGE_UNIT_OF_PAY_Hour,WAGE_UNIT_OF_PAY_Month,WAGE_UNIT_OF_PAY_Week,WAGE_UNIT_OF_PAY_Year,WILLFUL_VIOLATOR_N,WILLFUL_VIOLATOR_Y
14456,6e-05,False,0,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0
234238,6.1e-05,False,0,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0
478549,7.4e-05,False,0,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0
394505,7.1e-05,False,0,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0
564630,5.8e-05,False,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0


Train DecisionTreeClassifier and predict using X_test

In [10]:
from xgboost import XGBClassifier
xgb = XGBClassifier(random_state=99,reg_alpha=0.06, 
colsample_bytree=0.8, 
n_estimators=1000, 
subsample=0.9, 
reg_lambda=0.07, 
max_depth=3, 
gamma=2, nthread=11)

xgb.fit(X_train, y_train)
# print(xgb.feature_importances_)
print(xgb.score(X_test, y_test))
xgb_pred = xgb.predict(X_test)
print(xgb_pred)

  if diff:


0.7382169222032936
[0 0 1 ... 0 0 0]


  if diff:


Train RandomForestClassifier and predict using X_test

In [11]:
from sklearn.metrics import classification_report, confusion_matrix
print("** XGBoost Classifier Stats (OVERALL testset)")
print(classification_report(y_test, xgb_pred))

** XGBoost Classifier Stats (OVERALL testset)
             precision    recall  f1-score   support

          0       0.69      0.80      0.74      6592
          1       0.80      0.68      0.73      7496

avg / total       0.75      0.74      0.74     14088



In [12]:
from sklearn.externals import joblib
joblib.dump(xgb, 'xgb.pkl', compress=9)

['xgb.pkl']

Output the trained model to a pickle file in case we need it in the future

In [13]:
from sklearn.externals import joblib
from xgboost import XGBClassifier
xgb = joblib.load('xgb.pkl')

ImportError: No module named xgboost

To load the pre-trained model, run the cell below.

In [14]:
from sklearn.metrics import precision_score, recall_score, f1_score, fbeta_score, accuracy_score, confusion_matrix

print("** XGBoost Classifier Scores (OVERALL testset)")
print("Precision: %s"% precision_score(y_test, xgb_pred))
print("Recall: %s"% recall_score(y_test, xgb_pred))
print("Accuracy score: %s"% accuracy_score(y_test, xgb_pred))
print("F-1 score: %s"% f1_score(y_test, xgb_pred))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_test, xgb_pred, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_test, xgb_pred, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_test, xgb_pred))

** XGBoost Classifier Scores (OVERALL testset)
Precision: 0.7968506392266916
Recall: 0.6818303094983992
Accuracy score: 0.7382169222032936
F-1 score: 0.7348670021567218
F-beta score with beta=0.5: 0.7708433880308881
F-beta score with beta=0.2: 0.7917138329182704
Confusion Matrix: 
[[5289 1303]
 [2385 5111]]


Evaluate stats for the entire test set

In [15]:
import numpy as np
ones_index = y_test.nonzero()
x_ones = X_test.iloc[ones_index]
y_ones = np.take(y_test, ones_index)[0]

xgb_pred_oneonly = xgb.predict(x_ones)

  if diff:


In [16]:
print("** XGBoost Classifier Scores (Only rows with label DENIED->1)")
print("Precision: %s"% precision_score(y_ones, xgb_pred_oneonly))
print("Recall: %s"% recall_score(y_ones, xgb_pred_oneonly))
print("Accuracy score: %s"% accuracy_score(y_ones, xgb_pred_oneonly))
print("F-1 score: %s"% f1_score(y_ones, xgb_pred_oneonly))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_ones, xgb_pred_oneonly, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_ones, xgb_pred_oneonly, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_ones, xgb_pred_oneonly))

** XGBoost Classifier Scores (Only rows with label DENIED->1)
Precision: 1.0
Recall: 0.6818303094983992
Accuracy score: 0.6818303094983992
F-1 score: 0.8108193860553662
F-beta score with beta=0.5: 0.914638511095204
F-beta score with beta=0.2: 0.9823687264823946
Confusion Matrix: 
[[   0    0]
 [2385 5111]]


Evaluate stats for testset with label 1 (DENIED cases only)

In [17]:
zeros_index = np.arange(len(y_test))[(y_test==0)]

x_zeros = X_test.iloc[zeros_index]
y_zeros = np.take(y_test, zeros_index)
xgb_pred_zeroonly = xgb.predict(x_zeros)

  if diff:


In [18]:
print("** XGBoost Classifier Scores (Only rows with label CERTIFIED->0)")
print("Precision: %s"% precision_score(y_zeros, xgb_pred_zeroonly))
print("Recall: %s"% recall_score(y_zeros, xgb_pred_zeroonly))
print("Accuracy score: %s"% accuracy_score(y_zeros, xgb_pred_zeroonly))
print("F-1 score: %s"% f1_score(y_zeros, xgb_pred_zeroonly))
print("F-beta score with beta=0.5: %s"% fbeta_score(y_zeros, xgb_pred_zeroonly, beta=0.5))
print("F-beta score with beta=0.2: %s"% fbeta_score(y_zeros, xgb_pred_zeroonly, beta=0.2))
print("Confusion Matrix: \n%s"% confusion_matrix(y_zeros, xgb_pred_zeroonly))


** XGBoost Classifier Scores (Only rows with label CERTIFIED->0)
Precision: 0.0
Recall: 0.0
Accuracy score: 0.8023361650485437
F-1 score: 0.0
F-beta score with beta=0.5: 0.0
F-beta score with beta=0.2: 0.0
Confusion Matrix: 
[[5289 1303]
 [   0    0]]


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [19]:
import datetime
datetime.datetime.now()

datetime.datetime(2018, 12, 18, 9, 43, 2, 481290)

In [None]:
status_stats = df.groupby('CASE_STATUS').size().to_frame()
status_stats['percentage'] = status_stats[0] / status_stats[0].sum()
print(status_stats)