In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import RandomizedPCA
from sklearn.decomposition import PCA

Load cleaned up version of dataset from 2015 to 2018.  Drop geographic information about employer and worksite to reduce dimensionality

In [2]:
df = pd.read_pickle('/Users/minse_chang/PycharmProjects/Udacity_ML_Capstone/data/H1B_15-18_new.pickle')
df.drop(columns=['EMPLOYER_CITY','JOB_TITLE','EMPLOYER_NAME'], inplace=True)
df.drop(columns=['EMPLOYER_STATE','WORKSITE_STATE'], inplace=True)


In [3]:
df.head()

Unnamed: 0,CASE_STATUS,FULL_TIME_POSITION,H1B_DEPENDENT,NAICS_CODE,PREVAILING_WAGE,PW_SOURCE,SOC_CODE,VISA_CLASS,WAGE_UNIT_OF_PAY,WILLFUL_VIOLATOR,WAGE_LOWER_THAN_PW
1,CERTIFIED,Y,N,611310,45727.334,Other,25-1032,H-1B,Year,N,False
3,CERTIFIED,Y,N,335314,70413.2662,OES,17-2072,H-1B,Year,N,False
4,CERTIFIED,Y,N,522294,103390.0783,OES,15-1131,H-1B,Year,N,False
6,CERTIFIED,Y,N,541511,142938.9944,Other,15-1132,H-1B,Year,N,False
7,CERTIFIED,Y,N,522294,68372.2865,OES,15-2031,H-1B,Year,N,False


Show first 5 lines of the dataset

In [4]:
ones_df = df[df['CASE_STATUS']=='DENIED']
zeros_df = df[df['CASE_STATUS']=='CERTIFIED']
df = pd.concat([ones_df, zeros_df.sample(frac=0.02, random_state=99)])


In [5]:
ones_df.head()
print(ones_df.shape)

(37310, 11)


In [6]:
zeros_df.head()
print(zeros_df.shape)

(1656369, 11)


We will use column 'CASE_STATUS' as our label.  DENIED will be 1 and CERTIFIED will be 0 after it gets incoded by label_encoder.

Normalize PREVAILING_WAGE using minmax scaler since it varies quite a bit.

In [7]:
scaler = MinMaxScaler()
X = df.drop(columns='CASE_STATUS')

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['CASE_STATUS'])

X[['PREVAILING_WAGE']] = scaler.fit_transform(X[['PREVAILING_WAGE']])
X = pd.get_dummies(X)

In [8]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

Check value counts to verify stratify split

In [8]:
n_components = 400
pca = PCA(n_components=n_components, whiten=True, svd_solver='randomized')
pca = pca.fit(X)

In [9]:
from time import time
t0 = time()
# eigenfaces = pca.components_.reshape((n_components, h, w))
X = pca.transform(X)

print("done in %0.3fs" % (time() - t0))

done in 17.083s


In [11]:
# from sklearn.tree import DecisionTreeClassifier
# clf = DecisionTreeClassifier()

# X_train_benchmark = X_train[['PREVAILING_WAGE', 'WAGE_RATE_OF_PAY_FROM', 'SOC_CODE','NAICS_CODE', 'PW_UNIT_OF_PAY']]
# X_test_benchmark = X_test[['PREVAILING_WAGE', 'WAGE_RATE_OF_PAY_FROM', 'SOC_CODE','NAICS_CODE', 'PW_UNIT_OF_PAY']]
# X_train_benchmark['LOW_WAGE'] = X_train_benchmark['WAGE_RATE_OF_PAY_FROM'] < X_train_benchmark['PREVAILING_WAGE']
# X_test_benchmark['LOW_WAGE'] = X_test_benchmark['WAGE_RATE_OF_PAY_FROM'] < X_test_benchmark['PREVAILING_WAGE']
# X_train_benchmark.drop(columns=['WAGE_RATE_OF_PAY_FROM'], inplace=True)
# X_test_benchmark.drop(columns=['WAGE_RATE_OF_PAY_FROM'], inplace=True)
# X_train_benchmark = pd.get_dummies(X_train_benchmark)
# X_test_benchmark = pd.get_dummies(X_test_benchmark)

# clf.fit(X_train_benchmark, y_train)
# print(clf.feature_importances_)
# clf.score(X_test_benchmark, y_test)

Train DecisionTreeClassifier and predict using X_test

Train RandomForestClassifier and predict using X_test

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# # from sklearn.feature_extraction.text import CountVectorizer

# rfc = RandomForestClassifier()
# # vec = CountVectorizer()
# # X_train_t = vec.fit_transform(X_train)

# # rfc.fit(X, y)

# params = {'n_estimators': list(range(10,300,40)),
# 'max_depth': list(range(10,100,10)),
# 'max_features': list(range(2,20,2)),
# # 'min_samples_leaf': list(range(3,10,2)),
# # 'min_samples_split': list(range(5,15,3)),
# #           'bootstrap': [True, False]
# }

# from time import time
# start = time()

# # run grid search
# n_iter_search = 50
# # grid_search = GridSearchCV(rfc, param_grid=params, cv=5)
# random_search = RandomizedSearchCV(rfc, param_distributions=params, scoring='accuracy',
#                                    n_iter=n_iter_search, cv=5)


# random_search.fit(X, y)
# endtime= time()

  from numpy.core.umath_tests import inner1d


In [12]:
from sklearn.svm import SVC

svc = SVC(kernel='rbf')
params={
    'C':[1,10,100,1000], 
    'gamma':[1,0.1,0.001,0.0001],
}





from time import time
start = time()

# run grid search
n_iter_search = 7
# grid_search = GridSearchCV(svc, param_grid=params, cv=5)
random_search = RandomizedSearchCV(svc, param_distributions=params, scoring='accuracy',
                                   n_iter=n_iter_search, cv=5)


random_search.fit(X, y)
endtime= time()

Output the feature importance vector and sort by importance

In [14]:
import numpy as np
def report(results, n_top=30):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((endtime - start), n_iter_search))
report(random_search.cv_results_)

RandomizedSearchCV took 170855.93 seconds for 7 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.719 (std: 0.024)
Parameters: {'C': 100, 'gamma': 0.001}

Model with rank: 2
Mean validation score: 0.718 (std: 0.025)
Parameters: {'C': 10, 'gamma': 0.001}

Model with rank: 3
Mean validation score: 0.718 (std: 0.022)
Parameters: {'C': 100, 'gamma': 1}

Model with rank: 4
Mean validation score: 0.717 (std: 0.022)
Parameters: {'C': 1000, 'gamma': 1}

Model with rank: 5
Mean validation score: 0.717 (std: 0.021)
Parameters: {'C': 1, 'gamma': 0.1}

Model with rank: 6
Mean validation score: 0.715 (std: 0.023)
Parameters: {'C': 1000, 'gamma': 0.1}

Model with rank: 7
Mean validation score: 0.714 (std: 0.026)
Parameters: {'C': 1000, 'gamma': 0.001}

