# Classification

## Read file

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import math
import scipy.spatial
import warnings
import sklearn as sk
import operator
import numpy as np
import pickle

import sklearn.preprocessing as prep
import sklearn.pipeline as pl
import sklearn.metrics as mt
import sklearn.model_selection as ms
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

import sklearn.tree as tree
import sklearn.svm as sv
import sklearn.neighbors as nei
import sklearn.decomposition as dc
import sklearn.neural_network as nn

# Silence every Python warning for the rest of the session. This also hides
# warnings emitted while models are being fitted, so failed or non-converged
# fits will not be visible in the cell outputs below.
warnings.simplefilter("ignore")

# Enable inline mode for matplotlib so that Jupyter displays graphs
%matplotlib inline

pd.__version__ # display the pandas version this notebook was executed with

'0.24.2'

In [2]:
df = pd.read_csv('numerical.csv')
# 'Unnamed: 0' is the name pandas gives to a CSV column whose header is empty
# (presumably a row index written by an earlier to_csv() -- confirm against
# the export step). It is not a property of the record, so drop it here;
# otherwise split_label() keeps it and every model trains on it as a feature.
# errors='ignore' keeps the cell working if the file has no such column.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,EMPLOYMENT_END_DATE,TOTAL_WORKERS,NEW_EMPLOYMENT,CONTINUED_EMPLOYMENT,CHANGE_PREVIOUS_EMPLOYMENT,NEW_CONCURRENT_EMPLOYMENT,CHANGE_EMPLOYER,AMENDED_PETITION,FULL_TIME_POSITION,PREVAILING_WAGE,...,label,WAGE_LEVEL_1,WAGE_LEVEL_2,WAGE_LEVEL_3,WAGE_LEVEL_4,CBA,DBA,OES,SCA,OTHER_PW_SOURCE
0,19,1,1,0,0,0,0,0,1,59197.0,...,0,1,0,0,0,0,0,1,0,0
1,19,2,1,0,0,0,0,0,1,76502.0,...,0,0,1,0,0,0,0,1,0,0
2,20,1,1,0,0,0,0,0,1,90376.0,...,0,0,0,1,0,0,0,1,0,0
3,18,1,0,0,0,0,1,0,1,116605.0,...,0,0,0,1,0,0,0,1,0,0
4,18,1,1,0,0,0,0,0,1,59405.0,...,0,0,0,1,0,0,0,1,0,0


## Eliminate class imbalance

In [3]:
def balance_class(d, size=50000, random_state=None):
    """Return a shuffled frame holding ``size`` randomly drawn rows per class.

    Parameters
    ----------
    d : pandas.DataFrame
        Input rows; must contain a ``'label'`` column.
    size : int, default 50000
        Number of rows drawn (without replacement) from every distinct
        ``'label'`` value.
    random_state : int, numpy.random.RandomState or None, default None
        Forwarded to every ``DataFrame.sample`` call so the draw can be
        reproduced. ``None`` gives a different draw on every call.

    Returns
    -------
    pandas.DataFrame
        ``size`` rows per class, in random order, with a fresh 0..n-1 index.
        An input without any rows is returned empty (columns preserved).

    Raises
    ------
    KeyError
        If ``d`` has no ``'label'`` column.
    ValueError
        If a class has fewer than ``size`` rows.
    """
    sampled = []
    for name, rows in d.groupby('label'):
        if len(rows) < size:
            # Fail with the offending label in the message instead of the
            # generic pandas sampling error.
            raise ValueError(
                "class %r has only %d rows; cannot draw %d without replacement"
                % (name, len(rows), size))
        sampled.append(rows.sample(n=size, random_state=random_state))

    if not sampled:
        # No groups at all (empty input): nothing to balance.
        return d.iloc[0:0].reset_index(drop=True)

    combined = pd.concat(sampled, axis=0, sort=False, ignore_index=True)
    # sample(frac=1) is a full random permutation, so classes end up interleaved.
    shuffled = combined.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)

# Shuffle the rows once, then build the balanced sample from the first 500000
# positions only; the test set is later drawn from the rows after that point,
# so the two can never share a row.
df = shuffle(df)
bal_df = balance_class(df.iloc[:500000])
bal_df.head()

Unnamed: 0,EMPLOYMENT_END_DATE,TOTAL_WORKERS,NEW_EMPLOYMENT,CONTINUED_EMPLOYMENT,CHANGE_PREVIOUS_EMPLOYMENT,NEW_CONCURRENT_EMPLOYMENT,CHANGE_EMPLOYER,AMENDED_PETITION,FULL_TIME_POSITION,PREVAILING_WAGE,...,label,WAGE_LEVEL_1,WAGE_LEVEL_2,WAGE_LEVEL_3,WAGE_LEVEL_4,CBA,DBA,OES,SCA,OTHER_PW_SOURCE
0,18,1,0,1,0,0,0,0,1,41309.0,...,0,1,0,0,0,0,0,1,0,0
1,19,1,1,0,0,0,0,0,1,40810.0,...,0,1,0,0,0,0,0,1,0,0
2,19,1,0,0,0,0,1,0,1,113693.0,...,1,0,0,0,1,0,0,1,0,0
3,18,1,0,1,0,0,0,0,1,147784.0,...,0,0,0,0,1,0,0,1,0,0
4,21,1,1,0,0,0,0,0,1,28371.0,...,1,1,0,0,0,0,0,1,0,0


## Create test set

In [4]:
# Rows from position 500000 onward were not offered to balance_class above.
test_df = df.iloc[500000:]
test_set = test_df.sample(n=10000)
print(df.shape, test_set.shape, sep='\n')
# Row count per label in the sampled test set (it is not re-balanced).
g = test_set.groupby('label')
for n, s in g:
    print(n, len(s))
test_labels = test_set.loc[:, 'label']

(1092830, 26)
(10000, 26)
0 1221
1 8779


## Common functions

In [10]:
def split_label(d):
    """Separate the target column from the predictors.

    Returns a ``(labels, features)`` tuple: the ``'label'`` column of ``d`` and
    a copy of ``d`` without that column. ``d`` itself is left unchanged.
    """
    target = 'label'
    return d[target], d.drop(columns=[target])

In [35]:
def best_param(feature, label, model, grid, n_iter=3, search_cv=2, eval_cv=5,
               random_state=None):
    """Pick hyper-parameters by randomized search and report CV accuracy.

    Parameters
    ----------
    feature, label
        Training predictors and targets, passed straight to scikit-learn.
    model
        Estimator (or pipeline) to tune.
    grid : dict
        Parameter name -> candidate values, used as ``param_distributions``.
    n_iter : int, default 3
        Number of parameter settings the randomized search samples.
    search_cv : int, default 2
        Folds used inside the randomized search.
    eval_cv : int, default 5
        Folds used for the accuracy report.
    random_state : int or None, default None
        Seed for the randomized search; ``None`` samples differently each run.

    Returns
    -------
    dict
        ``best_params_`` of the search fitted on all of ``feature``/``label``.

    Notes
    -----
    The accuracy report passes the search object itself to ``cross_val_score``,
    so the randomized search is repeated inside each evaluation fold. The
    printed accuracy therefore describes the whole tuning procedure and may
    come from parameter settings other than the one printed and returned.
    """
    model_grid = ms.RandomizedSearchCV(cv=search_cv, param_distributions=grid,
                                       estimator=model, n_iter=n_iter,
                                       random_state=random_state)
    model_grid = model_grid.fit(feature, label)
    print(model_grid.best_params_)

    # cross validation to evaluate the tuning procedure
    ac = np.asarray(ms.cross_val_score(model_grid, feature, label,
                                       cv=eval_cv, scoring='accuracy'))
    failed = np.isnan(ac)
    acc = ac[~failed]
    if failed.any():
        # Failed folds are still left out of the average, but no longer silently.
        print("Warning: %d of %d folds produced no score and were excluded"
              % (failed.sum(), ac.size))
    print("On Balanced Data")
    print("Cross Validation Accuracy = " + str(acc))
    print("Average Accuracy = " + str(acc.mean()))

    return model_grid.best_params_

In [5]:
def print_report(data, model):
    """Print a classification report for ``model`` cross-validated on ``data``.

    ``data`` is split into labels and features, out-of-fold predictions are
    produced with 5-fold ``cross_val_predict``, and the resulting
    ``classification_report`` is printed.

    NOTE(review): ``cross_val_predict`` fits the estimator on folds of
    ``data`` itself, so the report does not reflect any fitting done to
    ``model`` before this call.

    Returns the array of out-of-fold predictions, one per row of ``data``.
    """
    y_true, predictors = split_label(data)
    predictions = ms.cross_val_predict(model, predictors, y_true, cv=5)
    report = mt.classification_report(y_true, predictions)
    print(report)
    return predictions

In [6]:
def store_model(clf, fname):
    """Pickle ``clf`` to ``<fname>.sav`` and print a confirmation line.

    Parameters
    ----------
    clf
        Any picklable object (here: a scikit-learn estimator or pipeline).
    fname : str
        Target path without extension; ``'.sav'`` is appended.
    """
    filename = fname + '.sav'
    # The context manager closes (and flushes) the file even if dump() raises.
    with open(filename, 'wb') as handle:
        pickle.dump(clf, handle)
    print('stored ' + fname)

In [9]:
# Predictions of every model, keyed by a human-readable model title.
predict_results = dict()

def add_result(result, title):
    """Record ``result`` under ``title``, replacing any earlier entry of that title."""
    predict_results.update({title: result})

def store_results():
    """Write all collected predictions plus the true labels to model_results.csv.

    Adds a ``'label'`` entry (the values of ``test_labels``) to the shared
    ``predict_results`` dict, then saves the dict as a CSV with one column per
    entry and no index column.
    """
    predict_results['label'] = list(test_labels)
    table = pd.DataFrame(predict_results, index=None)
    table.to_csv('model_results.csv', index=False)
    
def read_results():
    """Load model_results.csv from the working directory as a DataFrame."""
    return pd.read_csv('model_results.csv')

def edit_results(results, col, name):
    """Set column ``name`` of ``results`` to ``col`` and return the frame.

    Parameters
    ----------
    results : pandas.DataFrame
        Frame to modify; it is changed in place.
    col
        Values for the column (anything pandas accepts in column assignment).
    name : str
        Column to add or overwrite.

    Returns
    -------
    pandas.DataFrame
        The same ``results`` object, now containing the column.
    """
    results[name] = col
    # Previously returned the undefined name `result`, which raised NameError.
    return results

def restore_results(results):
    """Overwrite model_results.csv with ``results``, omitting the index column."""
    target = 'model_results.csv'
    results.to_csv(target, index=False)

## Decision Tree

In [8]:
def decision_tree(data):
    """Tune a decision tree on ``data`` and return it fitted with the best parameters.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows including the ``'label'`` column.

    Returns
    -------
    sklearn.tree.DecisionTreeClassifier
        Classifier built from the parameters chosen by ``best_param`` and
        fitted on all of ``data``.
    """
    label, feature = split_label(data)
    grid = {
        'criterion': ['gini', 'entropy'],
        'max_depth': range(10, 50, 10),
        'min_samples_leaf': range(1, 10, 4),
        'min_samples_split': range(2, 11, 4),
    }
    param = best_param(feature, label, tree.DecisionTreeClassifier(), grid)
    # The keys of `param` are the keys of `grid`, i.e. plain constructor arguments.
    dt_clf = tree.DecisionTreeClassifier(**param)
    # Fit before returning (as ada_boost does) so that the object later passed
    # to store_model is a trained model rather than an unfitted template.
    dt_clf.fit(feature, label)
    return dt_clf

In [12]:
# Tune on the balanced sample; best_param prints the chosen parameters and the CV accuracy.
dt = decision_tree(bal_df)

{'min_samples_split': 2, 'min_samples_leaf': 9, 'max_depth': 30, 'criterion': 'gini'}
On Balanced Data
Cross Validation Accuracy = [0.635  0.6    0.6135 0.6055 0.644 ]
Average Accuracy = 0.6196


In [12]:
# NOTE(review): `label` and `feature` are not used again in this cell.
label, feature = split_label(test_set)
# print_report cross-validates on test_set itself (5-fold cross_val_predict),
# so the scores below come from estimators re-fitted on test_set folds.
dt_predict_result = print_report(test_set, dt)

              precision    recall  f1-score   support

           0       0.74      0.28      0.40      1221
           1       0.91      0.99      0.95      8779

    accuracy                           0.90     10000
   macro avg       0.83      0.63      0.67     10000
weighted avg       0.89      0.90      0.88     10000



In [13]:
# Pickle the estimator to decision_tree.sav and keep its predictions in
# predict_results, which store_results() later writes to model_results.csv.
store_model(dt, 'decision_tree')
print('model stored')
add_result(dt_predict_result, 'decision tree')
print(predict_results)

stored decision_tree
model stored
{'decision tree': array([1, 1, 1, ..., 1, 1, 1])}


## Random Forest

In [13]:
import sklearn.ensemble as en

def random_forest(data):
    """Tune a random forest on ``data`` and return it fitted with the best parameters.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows including the ``'label'`` column.

    Returns
    -------
    sklearn.ensemble.RandomForestClassifier
        Forest built from the parameters chosen by ``best_param`` and fitted
        on all of ``data``.
    """
    label, feature = split_label(data)
    grid = {
        'n_estimators': range(10, 50, 10),
        'criterion': ['gini', 'entropy'],
        'max_depth': range(10, 50, 10),
        'min_samples_leaf': range(1, 10, 4),
        'min_samples_split': range(2, 11, 4),
    }
    param = best_param(feature, label, en.RandomForestClassifier(), grid)
    # The keys of `param` are the keys of `grid`, i.e. plain constructor arguments.
    rf_clf = en.RandomForestClassifier(**param)
    # Fit before returning (as ada_boost does) so that the object later passed
    # to store_model is a trained model rather than an unfitted template.
    rf_clf.fit(feature, label)
    return rf_clf

In [14]:
# Tune on the balanced sample; best_param prints the chosen parameters and the CV accuracy.
rf = random_forest(bal_df)

{'n_estimators': 10, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_depth': 20, 'criterion': 'entropy'}
On Balanced Data
Cross Validation Accuracy = [0.6465 0.661  0.644  0.622  0.662 ]
Average Accuracy = 0.6471


In [15]:
# NOTE(review): `label` and `feature` are not used again in this cell.
label, feature = split_label(test_set)
# print_report cross-validates on test_set itself (5-fold cross_val_predict),
# so the scores below come from estimators re-fitted on test_set folds.
rf_predict_result = print_report(test_set, rf)

{'n_estimators': 30, 'min_samples_split': 6, 'min_samples_leaf': 9, 'max_depth': 30, 'criterion': 'gini'}
              precision    recall  f1-score   support

           0       0.98      0.25      0.39      1221
           1       0.91      1.00      0.95      8779

    accuracy                           0.91     10000
   macro avg       0.94      0.62      0.67     10000
weighted avg       0.91      0.91      0.88     10000



In [16]:
# Pickle the estimator to random_forest.sav and keep its predictions in
# predict_results, which store_results() later writes to model_results.csv.
store_model(rf, 'random_forest')
print('model stored')
add_result(rf_predict_result, 'random forest')
print(predict_results)

stored random_forest
model stored
{'decision tree': array([1, 1, 1, ..., 1, 1, 1]), 'random forest': array([1, 1, 1, ..., 1, 1, 1])}


## AdaBoost

In [16]:
import sklearn.ensemble as en

def ada_boost(data):
    """Tune an AdaBoost classifier on ``data`` and return it fitted.

    The search covers the number of estimators and the learning rate with the
    ``'SAMME'`` algorithm. The classifier built from the parameters chosen by
    ``best_param`` is fitted on all of ``data`` before it is returned.
    """
    targets, predictors = split_label(data)
    search_space = {
        'n_estimators': range(10, 50, 10),
        'learning_rate': [0.5, 1.0, 1.5],
        'algorithm': ['SAMME']
    }
    chosen = best_param(predictors, targets, en.AdaBoostClassifier(), search_space)
    # `chosen` holds exactly the three searched constructor arguments.
    tuned = en.AdaBoostClassifier(**chosen)
    # fit() returns the estimator itself.
    return tuned.fit(predictors, targets)

In [17]:
# Tune and fit on the balanced sample; best_param prints the chosen parameters and the CV accuracy.
ada = ada_boost(bal_df)

{'n_estimators': 10, 'learning_rate': 1.5, 'algorithm': 'SAMME'}
On Balanced Data
Cross Validation Accuracy = [0.647  0.649  0.6275 0.6255 0.655 ]
Average Accuracy = 0.6407999999999999


In [19]:
# print_report cross-validates on test_set itself (5-fold cross_val_predict),
# so the scores below come from estimators re-fitted on test_set folds.
ada_predict_result = print_report(test_set, ada)

              precision    recall  f1-score   support

           0       0.98      0.25      0.39      1221
           1       0.91      1.00      0.95      8779

    accuracy                           0.91     10000
   macro avg       0.94      0.62      0.67     10000
weighted avg       0.91      0.91      0.88     10000



In [20]:
# Pickle the estimator to ada_boost.sav and keep its predictions in
# predict_results, which store_results() later writes to model_results.csv.
store_model(ada, 'ada_boost')
print('model stored')
add_result(ada_predict_result, 'ada boost')
print(predict_results)

stored ada_boost
model stored
{'decision tree': array([1, 1, 1, ..., 1, 1, 1]), 'random forest': array([1, 1, 1, ..., 1, 1, 1]), 'ada boost': array([1, 1, 1, ..., 1, 1, 1])}


## Nearest Neighbor

In [18]:
def nearest_neighbor(data):
    """Tune a scaler -> PCA -> k-NN pipeline on ``data`` and return it fitted.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows including the ``'label'`` column.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with steps ``'scaler'``, ``'pca'`` and ``'nn'``, configured
        with the parameters chosen by ``best_param`` and fitted on all of
        ``data``.
    """
    label, feature = split_label(data)

    def build_pipeline():
        # Fresh step objects on every call, so the pipeline given to the search
        # and the pipeline returned to the caller never share a step instance.
        return pl.Pipeline(steps=[
            ('scaler', prep.StandardScaler()),
            ('pca', dc.PCA()),
            ('nn', nei.KNeighborsClassifier(metric='euclidean')),
        ])

    grid = {
        'pca__n_components': range(8, 25, 8),
        'nn__n_neighbors': [5, 8, 11],
        'nn__algorithm': ['ball_tree', 'kd_tree', 'brute'],
        'nn__leaf_size': [20, 30, 40, 50]
    }
    param = best_param(feature, label, build_pipeline(), grid)
    # The keys of `param` use the same `step__argument` names as `grid`, so
    # they can be applied directly; the euclidean metric used during the
    # search is kept as well.
    pipe_clf = build_pipeline().set_params(**param)
    # Fit before returning (as ada_boost does) so that the object later passed
    # to store_model is a trained model rather than an unfitted template.
    pipe_clf.fit(feature, label)
    return pipe_clf

In [19]:
# Tune on the balanced sample; best_param prints the chosen parameters and the CV accuracy.
knn = nearest_neighbor(bal_df)

{'pca__n_components': 8, 'nn__n_neighbors': 8, 'nn__leaf_size': 40, 'nn__algorithm': 'ball_tree'}
On Balanced Data
Cross Validation Accuracy = [0.606  0.617  0.6215 0.6255 0.6285]
Average Accuracy = 0.6196999999999999


In [23]:
# print_report cross-validates on test_set itself (5-fold cross_val_predict),
# so the scores below come from estimators re-fitted on test_set folds.
knn_predict_result = print_report(test_set, knn)

              precision    recall  f1-score   support

           0       0.68      0.18      0.29      1221
           1       0.90      0.99      0.94      8779

    accuracy                           0.89     10000
   macro avg       0.79      0.58      0.61     10000
weighted avg       0.87      0.89      0.86     10000



In [24]:
# Pickle the pipeline to knn.sav and keep its predictions in
# predict_results, which store_results() later writes to model_results.csv.
store_model(knn, 'knn')
print('model stored')
add_result(knn_predict_result, 'nearest neighbor')
print(predict_results)

stored knn
model stored
{'decision tree': array([1, 1, 1, ..., 1, 1, 1]), 'random forest': array([1, 1, 1, ..., 1, 1, 1]), 'ada boost': array([1, 1, 1, ..., 1, 1, 1]), 'nearest neighbor': array([0, 1, 1, ..., 1, 1, 1])}


## SVC

In [20]:
def svc(data):
    """Tune a scaler -> PCA -> SVC pipeline on ``data`` and return it fitted.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows including the ``'label'`` column.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with steps ``'scaler'``, ``'pca'`` and ``'svc'``, configured
        with the parameters chosen by ``best_param`` and fitted on all of
        ``data``.
    """
    label, feature = split_label(data)

    def build_pipeline():
        # Fresh step objects on every call, so the pipeline given to the search
        # and the pipeline returned to the caller never share a step instance.
        return pl.Pipeline(steps=[
            ('scaler', prep.StandardScaler()),
            ('pca', dc.PCA()),
            ('svc', sv.SVC()),
        ])

    grid = {
        'pca__n_components': range(8, 25, 8),
        'svc__kernel': ['linear', 'rbf', 'poly']
    }
    param = best_param(feature, label, build_pipeline(), grid)
    # The keys of `param` use the same `step__argument` names as `grid`.
    pipe_clf = build_pipeline().set_params(**param)
    # Fit before returning (as ada_boost does) so that the object later passed
    # to store_model is a trained model rather than an unfitted template.
    pipe_clf.fit(feature, label)
    return pipe_clf

In [21]:
# NOTE(review): this rebinds the name `svc` from the builder function to the
# pipeline it returns, so running this cell a second time fails unless the
# cell defining `svc()` is re-run first.
svc = svc(bal_df)

{'svc__kernel': 'rbf', 'pca__n_components': 24}
On Balanced Data
Cross Validation Accuracy = [0.6435 0.6525 0.6405 0.6125 0.653 ]
Average Accuracy = 0.6403999999999999


In [27]:
# print_report cross-validates on test_set itself (5-fold cross_val_predict),
# so the scores below come from estimators re-fitted on test_set folds.
sv_predict_result = print_report(test_set, svc)

              precision    recall  f1-score   support

           0       0.92      0.19      0.32      1221
           1       0.90      1.00      0.95      8779

    accuracy                           0.90     10000
   macro avg       0.91      0.60      0.63     10000
weighted avg       0.90      0.90      0.87     10000



In [28]:
# Pickle the pipeline to svc.sav and keep its predictions in
# predict_results, which store_results() later writes to model_results.csv.
store_model(svc, 'svc')
print('model stored')
add_result(sv_predict_result, 'svc')
print(predict_results)

stored svc
model stored
{'decision tree': array([1, 1, 1, ..., 1, 1, 1]), 'random forest': array([1, 1, 1, ..., 1, 1, 1]), 'ada boost': array([1, 1, 1, ..., 1, 1, 1]), 'nearest neighbor': array([0, 1, 1, ..., 1, 1, 1]), 'svc': array([1, 1, 1, ..., 1, 1, 1])}


## Neural Network

In [22]:
def neural_network(data):
    """Tune a scaler -> MLP pipeline on ``data`` and return it fitted.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows including the ``'label'`` column.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with steps ``'scaler'`` and ``'mlp'``, configured with the
        parameters chosen by ``best_param`` and fitted on all of ``data``.
    """
    label, feature = split_label(data)

    def build_pipeline():
        # Fresh step objects on every call, so the pipeline given to the search
        # and the pipeline returned to the caller never share a step instance.
        return pl.Pipeline(steps=[
            ('scaler', prep.StandardScaler()),
            ('mlp', nn.MLPClassifier()),
        ])

    grid = {
        'mlp__hidden_layer_sizes': [(30,),(40,),(50,),(60,)],
        'mlp__activation': ['logistic', 'tanh', 'relu'],
        'mlp__solver': ['lbfgs', 'sgd', 'adam'],
        'mlp__learning_rate': ['constant', 'invscaling', 'adaptive']
    }
    param = best_param(feature, label, build_pipeline(), grid)
    # The keys of `param` use the same `step__argument` names as `grid`.
    pipe_clf = build_pipeline().set_params(**param)
    # Fit before returning (as ada_boost does) so that the object later passed
    # to store_model is a trained model rather than an unfitted template.
    pipe_clf.fit(feature, label)
    return pipe_clf

In [23]:
# Tune on the balanced sample; best_param prints the chosen parameters and the CV accuracy.
mlp = neural_network(bal_df)

{'mlp__solver': 'lbfgs', 'mlp__learning_rate': 'constant', 'mlp__hidden_layer_sizes': (30,), 'mlp__activation': 'relu'}
On Balanced Data
Cross Validation Accuracy = [0.6385 0.6405 0.63   0.6155 0.6625]
Average Accuracy = 0.6374


In [31]:
# print_report cross-validates on test_set itself (5-fold cross_val_predict),
# so the scores below come from estimators re-fitted on test_set folds.
nn_predict_result = print_report(test_set, mlp)

              precision    recall  f1-score   support

           0       0.96      0.13      0.23      1221
           1       0.89      1.00      0.94      8779

    accuracy                           0.89     10000
   macro avg       0.92      0.56      0.59     10000
weighted avg       0.90      0.89      0.86     10000



In [32]:
# Pickle the pipeline to neural_net.sav and keep its predictions in
# predict_results, which store_results() later writes to model_results.csv.
store_model(mlp, 'neural_net')
print('model stored')
add_result(nn_predict_result, 'neural network')
print(predict_results)

stored neural_net
model stored
{'decision tree': array([1, 1, 1, ..., 1, 1, 1]), 'random forest': array([1, 1, 1, ..., 1, 1, 1]), 'ada boost': array([1, 1, 1, ..., 1, 1, 1]), 'nearest neighbor': array([0, 1, 1, ..., 1, 1, 1]), 'svc': array([1, 1, 1, ..., 1, 1, 1]), 'neural network': array([1, 1, 1, ..., 1, 1, 1])}


## Logistic Regression

In [24]:
import sklearn.linear_model as lm

def logistic_regression(data):
    """Tune a scaler -> logistic-regression pipeline on ``data`` and return it fitted.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows including the ``'label'`` column.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with steps ``'scaler'`` and ``'lr'``, configured with the
        parameters chosen by ``best_param`` and fitted on all of ``data``.
    """
    label, feature = split_label(data)

    def build_pipeline():
        # Fresh step objects on every call, so the pipeline given to the search
        # and the pipeline returned to the caller never share a step instance.
        return pl.Pipeline(steps=[
            ('scaler', prep.StandardScaler()),
            ('lr', lm.LogisticRegression()),
        ])

    grid = {
        # 'elasticnet' is no longer searched: that penalty requires an
        # l1_ratio, which this grid never sets, so every such candidate
        # failed to fit and only wasted search iterations.
        # NOTE(review): 'l1' is accepted only by the liblinear and saga
        # solvers; the other l1/solver pairs can still be sampled and fail.
        'lr__penalty': ['l1', 'l2'],
        'lr__max_iter': [50, 100, 200],
        'lr__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
    }
    param = best_param(feature, label, build_pipeline(), grid)
    # The keys of `param` use the same `step__argument` names as `grid`.
    pipe_clf = build_pipeline().set_params(**param)
    # Fit before returning (as ada_boost does) so that the object later passed
    # to store_model is a trained model rather than an unfitted template.
    pipe_clf.fit(feature, label)
    return pipe_clf

In [36]:
# Tune on the balanced sample; best_param prints the chosen parameters and the CV accuracy.
lr = logistic_regression(bal_df)

{'lr__solver': 'sag', 'lr__penalty': 'l2', 'lr__max_iter': 100}
On Balanced Data
Cross Validation Accuracy = [0.6365 0.6395 0.621  0.6105]
Average Accuracy = 0.626875


In [35]:
# print_report cross-validates on test_set itself (5-fold cross_val_predict),
# so the scores below come from estimators re-fitted on test_set folds.
lr_predict_result = print_report(test_set, lr)

              precision    recall  f1-score   support

           0       0.94      0.12      0.21      1221
           1       0.89      1.00      0.94      8779

    accuracy                           0.89     10000
   macro avg       0.91      0.56      0.58     10000
weighted avg       0.90      0.89      0.85     10000



In [36]:
# Pickle the pipeline to lr.sav and keep its predictions in
# predict_results, which store_results() later writes to model_results.csv.
store_model(lr, 'lr')
print('model stored')
add_result(lr_predict_result, 'logistic regression')
print(predict_results)

stored lr
model stored
{'decision tree': array([1, 1, 1, ..., 1, 1, 1]), 'random forest': array([1, 1, 1, ..., 1, 1, 1]), 'ada boost': array([1, 1, 1, ..., 1, 1, 1]), 'nearest neighbor': array([0, 1, 1, ..., 1, 1, 1]), 'svc': array([1, 1, 1, ..., 1, 1, 1]), 'neural network': array([1, 1, 1, ..., 1, 1, 1]), 'logistic regression': array([1, 1, 1, ..., 1, 1, 1])}


In [37]:
# Show how many predictions each model contributed, then write them all
# (together with the true labels) to model_results.csv.
for title, predictions in predict_results.items():
    print(title)
    print(len(predictions))
store_results()

decision tree
10000
random forest
10000
ada boost
10000
nearest neighbor
10000
svc
10000
neural network
10000
logistic regression
10000
