# Income Predictions - Logistic Regression (Timothy Manolias)

### The following program predicts whether an individual makes less than or greater than 50K dollars a year, based on 1994 U.S. census data and a logistic regression model.

# <font color='red'>TEST 1</font>

## <font color='blue'>Imports Data and Libraries</font>

In [1]:
import numpy as np
import pandas as pd

# Loads train data
train_data = pd.read_csv("income_train.csv")

# Loads test data
test_data = pd.read_csv("income_test.csv")

## <font color='blue'>Data Preprocessing</font>

In [2]:
import warnings

# Drops unknown values
warnings.simplefilter(action='ignore', category=FutureWarning)
for col in test_data.columns:
    train_data.drop(train_data[train_data[col] == '?'].index, inplace = True)
    test_data.drop(test_data[test_data[col] == '?'].index, inplace = True)

In [None]:
# Converts income (dependent variable) to dummy variables

# train_data
train_dummies = pd.get_dummies(train_data_y)
if train_data_y['income'][1] not in [0, 1]:
    train_data_y = pd.concat((train_data_y, train_dummies['income_>50K']), axis=1)
    del train_data_y['income']
    train_data_y = train_data_y.rename(columns={"income_ >50K": "income"})

# test_data
test_dummies = pd.get_dummies(test_data_y)
if test_data_y['income'][1] not in [0, 1]:
    test_data_y = pd.concat((test_data_y, test_dummies['income_>50K']), axis=1)
    del test_data_y['income']
    test_data_y = test_data_y.rename(columns={"income_ >50K": "income"})        

## <font color='blue'>Data Visualization</font>

In [None]:
warnings.simplefilter("ignore", UserWarning)

numeric_features = data.select_dtypes(exclude=["object","bool"])
numeric_features = numeric_features.stack().reset_index().rename(columns = {"level_1":"Features", 0:"Value"})

g = sns.FacetGrid(data =numeric_features, col="Features",  col_wrap=5, sharex=False, sharey=False)
g = g.map(sns.distplot, "Value", color ='blue')

plt.subplots_adjust(top=0.9)
plt.suptitle("Histograms of various features")

### Splits data into train and  test

In [None]:
# Splits data into X and y

# train_data
train_data_X = train_data.drop(['income'], axis=1)
train_data_y = train_data[['income']]

# test_data
test_data_X = test_data.drop(['income'], axis=1)
test_data_y = test_data[['income']]

## <font color='blue'>Standardization and One-Hot Encoding</font>

#### Standardizes continuous features and converts categorical variables into dummy variables.

In [None]:
from sklearn.preprocessing import StandardScaler

# Gets continuous variables
contin_cols = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
contin_train = train_data_X[contin_cols]
contin_test = test_data_X[contin_cols]

In [None]:
# Standardizes continuous features

sc = StandardScaler()
for col in contin_cols:
    # X_train
    temp_train = np.array(contin_train[col]).reshape(-1, 1)
    train_data_X[col] = sc.fit_transform(temp_train)
    
    # X_test
    temp_test = np.array(contin_test[col]).reshape(-1, 1)
    test_data_X[col] = sc.fit_transform(temp_test)

In [None]:
# Converts categorical variables into dummy variables

categ_cols = ['workclass', 'education', 'marital_status', 'occupation',
              'relationship', 'race', 'sex', 'native_country']

# X_train
categ_train = train_data_X[categ_cols]
categ_train = pd.get_dummies(categ_train)

# X_test
categ_test = test_data_X[categ_cols]
categ_test = pd.get_dummies(categ_test)

# Replaces old categorical columns with encoded columns
if categ_cols[0] in train_data_X.columns:
    # X_train
    train_data_X = train_data_X.drop(categ_cols, axis=1)
    train_data_X = pd.concat((train_data_X, categ_train), axis=1)
    
    # X_test
    test_data_X = test_data_X.drop(categ_cols, axis=1)
    test_data_X = pd.concat((test_data_X, categ_test), axis=1)

## <font color='blue'>Splits Data into Training and Validation Sets</font>

In [None]:
X_train = train_data_X.iloc[:2800,:]
y_train = train_data_y.iloc[:2800,:]
X_val = train_data_X.iloc[2800:,:]
y_val = train_data_y.iloc[2800:,:]

## <font color='blue'>Logistic Regression</font>

In [None]:
from sklearn.linear_model import LogisticRegression

# Creates and fits logistic regression model
mod = LogisticRegression(max_iter=1000)
mod.fit(X_train, y_train.values.ravel())

In [None]:
# Predicts output for validation set

y_pred = mod.predict(X_val)
y_val = np.array([i[0] for i in y_val.values.tolist()])

## <font color='blue'>Precision, Recall & Accuracy Functions</font>

#### Manually calculates precision, recall & accuracy to display performance of logistic regression model.

In [None]:
def precision(y, y_preds):
    """
    Return precision, which is TP/(TP+FP)
    """
    tp = 0
    fp = 0
    for i in range(len(y_preds)):
        if y_preds[i] == 1:
            if y[i] == 1:
                tp += 1
            else:
                fp += 1
                
    return tp / (tp+fp)

In [None]:
def recall(y, y_preds):
    """
    Return recall, which is TP/(TP+FN)
    """
    tp = 0
    fn = 0
    for i in range(len(y_preds)):
        if y_preds[i] == 1 and y[i] == 1:
            tp += 1
        elif y[i] == 1 and y_preds[i] == 0:
            fn += 1
                
    return tp / (tp+fn)

In [None]:
def accuracy(y, y_preds):
    """
    Return accuracy, which is (TP+TN)/(TP+FP+FN+TN)
    """
    tp, tn, fp, fn = (0, 0, 0, 0)
    
    for i in range(len(y_preds)):
        if y_preds[i] == 1 and y[i] == 1:
            tp += 1
        elif y_preds[i] == 1 and y[i] == 0:
            fp += 1
        elif y_preds[i] == 0 and y[i] == 1:
            fn += 1
        elif y_preds[i] == 0 and y[i] == 0:
            tn += 1
                
    return (tp + tn) / (tp + fp + fn + tn)

## <font color='blue'>Evaluates Performance of `y_pred` in `test_data`</font>

In [None]:
# Using the predictions to calculate accuracy, precision, recall

print(f'Q1. Precision: {precision(y_val, y_pred):.5f}')
print(f'Q2. Recall:    {recall(y_val, y_pred):.5f}')
print(f'Q3. Accuracy:  {accuracy(y_val, y_pred):.5f}')

### ROC Curve

In [None]:
from sklearn.metrics import roc_curve
import sklearn.metrics as metrics

y_probs = mod.predict_proba(X_val)[:,1]
fpr, tpr, thresholds = metrics.roc_curve(y_val, y_probs, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)

# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

## <font color='blue'>Tuning Hyperparameters of Logistic Regression Model</font>

In [None]:
def tune_logistic_model(c, p, X_train, y_train, y_val):
    """Prints precision, recall & accuracy values,
       based off logistic model's performance."""
    
    # Builds model
    mod = LogisticRegression(C=c, penalty=p, solver='liblinear').fit(X_train, y_train.values.ravel())

    # Predictions
    y_pred = mod.predict(X_val)

    # Prints accuracy, precision & recall
    print(f'Precision: {precision(y_val, y_pred):.5}')
    print(f'Recall:    {recall(y_val, y_pred):.5}')
    print(f'Accuracy:  {accuracy(y_val, y_pred):.5}')

#### Builds a logistic regression model with hyperparameter **'C' set to 0.1** and **penalty set to 'l1'**.

In [None]:
tune_logistic_model(0.1, 'l1', X_train, y_train, y_val)

#### Builds a logistic regression model with hyperparameter **'C' set to 0.5** and **penalty set to 'l1'**.

In [None]:
tune_logistic_model(0.5, 'l1', X_train, y_train, y_val)

#### Builds a logistic regression model with hyperparameter **'C' set to 0.1** and **penalty set to 'l2'**.

In [None]:
tune_logistic_model(0.1, 'l2', X_train, y_train, y_val)

#### Builds a logistic regression model with hyperparameter **'C' set to 0.5** and **penalty set to 'l2'**.

In [None]:
tune_logistic_model(0.5, 'l2', X_train, y_train, y_val)

### Best model out of the 4 listed above:

**Model 2**, with hyperparameter 'C' set to 0.5 and penalty set to 'l1', is the best based off its high accuracy. Although model 4 has the same accuracy as model 2, model 2 has a higher precision than model 4. Furthermore, the recall values of the two models are very close. Therefore, **model 2 is the best based off the accuracy and precision**.

## <font color='blue'>Test Set Predictions</font>

#### Makes predictions on `test_data` using a logistic regression model with the hyperparameters listed above.

In [None]:
# Adds one-hot encoded columns from training set that were not in test set
for i in train_data_X.columns:
    if i not in test_data.columns:
        test_data_X[i] = 0

# Sorts by columns
train_data_X = train_data_X.sort_index(axis=1)
test_data_X = test_data_X.sort_index(axis=1)

# Logistic regression
final_mod = LogisticRegression(C=0.5, penalty='l1', solver='liblinear').fit(train_data_X, train_data_y.values.ravel())

# Predictions on test_data
test_pred = final_mod.predict(test_data_X)
test_val = np.array([i[0] for i in test_data_y.values.tolist()])

In [None]:
# Using the predictions to calculate accuracy, precision, recall

print(f'Q1. Precision: {precision(test_val, test_pred):.5f}')
print(f'Q2. Recall:    {recall(test_val, test_pred):.5f}')
print(f'Q3. Accuracy:  {accuracy(test_val, test_pred):.5f}')

## <font color='blue'>Results</font>

#### After training the logistic regression model with `train_data` and testing numerous hyperparameters, the prediction results for `train_data` were the following:

**Precision:** 0.54599

**Recall:**    0.63835

**Accuracy:**  0.82203


#### Evaluating the logistic regression model with `test_data` yielded the following prediction results:

**Precision:** 0.60976

**Recall:**    0.04149

**Accuracy:**  0.80380



#### The test set predictions resulted in a higher precision and slightly less accuracy as compared to the train data predictions. However, the precision value was significantly lower for the test set as compared to the training set. This means that the logistic model yields a relatively small amount of true positives as compared to false negatives for the test dataset.