Import libraries and load the data

In [1]:
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames

# Read the training set (which carries the 'income' label) and the test set
# from the working directory.
training_data, testing_data = (
    pd.read_csv(csv_name) for csv_name in ("census.csv", "test_census.csv")
)

# Preview the first five rows of the test set (head() defaults to n=5).
testing_data.head()

Unnamed: 0.1,Unnamed: 0,age,workclass,education_level,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,0,21.0,Private,10th,6.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,40.0,United-States
1,1,49.0,Private,Bachelors,13.0,Married-civ-spouse,Adm-clerical,Wife,White,Female,0.0,0.0,40.0,United-States
2,2,44.0,Self-emp-not-inc,Assoc-acdm,12.0,Married-civ-spouse,Other-service,Wife,White,Female,0.0,0.0,99.0,United-States
3,3,34.0,Private,Bachelors,13.0,Married-civ-spouse,Sales,Husband,White,Male,7298.0,0.0,46.0,United-States
4,4,24.0,Private,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40.0,United-States


Preprocessing training_data

In [2]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Separate the target label from the feature columns.
income_raw = training_data['income']
features_raw = training_data.drop('income', axis = 1)

# Log-transform the columns listed in `skewed`.
# Work on an explicit copy: pd.DataFrame(data=frame) does not copy by default,
# so the stage-named frames would otherwise alias each other and the "raw"
# features could be overwritten by the later stages.
skewed = ['capital-gain', 'capital-loss']
features_log_transformed = features_raw.copy()
# log1p(x) == log(x + 1), but is numerically more accurate for small x.
features_log_transformed[skewed] = np.log1p(features_raw[skewed])

# Scale the numerical columns into [0, 1]. `scaler` stays fitted on the
# training data so the same scaling can be re-applied to unseen data.
scaler = MinMaxScaler() # default=(0, 1)
numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
features_log_minmax_transform = features_log_transformed.copy()
features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])

# One-hot encode the categorical columns using pandas.get_dummies()
features_final = pd.get_dummies(features_log_minmax_transform)

# Encode the 'income_raw' labels as 0/1. Series.map() turns any label that is
# not in the mapping into NaN, so fail loudly instead of training on NaN targets.
income_raw = income_raw.map({'<=50K':0, '>50K':1})
assert income_raw.notna().all(), "unexpected label in 'income' column"

# Split features and labels into training and validation sets, keeping the
# class proportions identical in both parts (stratify).
X_train, X_valid, y_train, y_valid = train_test_split(
    features_final,
    income_raw,
    test_size = 0.33,
    random_state = 0,
    shuffle=True,
    stratify=income_raw,
)

In [3]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
# Base model whose hyperparameters are tuned below.
clf = AdaBoostClassifier(random_state=42)

# Candidate values sampled by the randomized search.
param_dist = {"n_estimators": list(range(10, 200)),
              "learning_rate": [0.001, 0.005, .01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 1, 2, 10, 20]}

# Run a randomized search over the hyperparameters.
# random_state fixes which parameter combinations are sampled, so that
# Restart & Run All reproduces the same best estimator.
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, random_state=42)

# Fit the model on the training data
random_search.fit(X_train, y_train)

# Make predictions on the held-out validation data
y_valid_pred = random_search.best_estimator_.predict(X_valid)

In [None]:
from sklearn.metrics import fbeta_score, accuracy_score, confusion_matrix
# Score the validation predictions: accuracy, F-beta (beta=0.5) and the
# confusion matrix, printed in that order.
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
valid_fbeta = fbeta_score(y_valid, y_valid_pred, beta=0.5)
valid_confusion = confusion_matrix(y_valid, y_valid_pred)

for metric_value in (valid_accuracy, valid_fbeta, valid_confusion):
    print(metric_value)

In [None]:
# Keep a handle on the estimator selected by the randomized search; it is the
# model used for the final predictions on the test set.
best_clf = random_search.best_estimator_
# Bare last expression: displays the estimator and its hyperparameters.
best_clf

Testing

In [None]:
# Dealing with missing data: fill gaps in numeric columns with the column mean.
# numeric_only=True is needed because the frame also holds string columns.
# Missing categorical values are left as NaN; get_dummies() encodes them as
# all-zero indicator rows.
testing_data = testing_data.fillna(testing_data.mean(numeric_only=True))

# Drop the first (id) column. Copy so the transforms below never write back
# into `testing_data`, which keeps this cell safe to re-run.
features_raw_testing = testing_data.iloc[:, 1:].copy()

# Apply the same log transform that was applied to the training features.
skewed = ['capital-gain', 'capital-loss']
features_testing_log_transformed = features_raw_testing.copy()
features_testing_log_transformed[skewed] = np.log1p(features_raw_testing[skewed])

# Re-use the scaler fitted on the training data (preprocessing cell) instead of
# fitting a new one: the model must see test features on the training scale.
numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
features_testing_log_minmax_transform = features_testing_log_transformed.copy()
features_testing_log_minmax_transform[numerical] = scaler.transform(features_testing_log_transformed[numerical])

# One-hot encode, then align to the exact training columns (`features_final`):
# categories absent from the test set become all-zero columns, and columns the
# model never saw (e.g. the leftover 'Unnamed: 0' index column) are dropped.
features_testing_final = (
    pd.get_dummies(features_testing_log_minmax_transform)
    .reindex(columns=features_final.columns, fill_value=0)
    .astype(float)
)
assert not features_testing_final.isna().any().any(), "test features still contain NaN"

X_test = features_testing_final.values

In [None]:
# Row identifiers are taken from the first column of the test file.
id_test = testing_data.iloc[:, 0].to_numpy()

# Predict a label for every preprocessed test row with the tuned model.
y_pred = best_clf.predict(X_test)

In [None]:
import csv

# Pair every test-row id with its predicted label.
arr = zip(id_test, y_pred)

# Write the predictions file: a header row followed by one (id, income) row
# per test example.
with open('test_pred.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'income'])
    writer.writerows(arr)

In [None]:
#TODO 
# DONE: stratifiedshufflesplit, randomsearchcv, confusionmatrix
# https://machinelearningmastery.com/how-to-prepare-categorical-data-for-deep-learning-in-python/
# https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
# https://stackoverflow.com/questions/1922985/explaining-the-adaboost-algorithms-to-non-technical-people