# Gaussian Classification Example

In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
%matplotlib inline

### Open the file into a data frame

In [34]:
dat_income = pd.read_csv('income.csv')

In [35]:
dat_income

Unnamed: 0,age,workclass,education,edyears,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,38,United-States,<=50K
32557,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,>50K
32558,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,40,United-States,<=50K
32559,22,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,20,United-States,<=50K


### Create a list of numeric features

In [36]:
numeric_features = ['age','edyears','hours-per-week']

In [37]:
dat_income[numeric_features]

Unnamed: 0,age,edyears,hours-per-week
0,39,13,40
1,50,13,13
2,38,9,40
3,53,7,40
4,28,13,40
...,...,...,...
32556,27,12,38
32557,40,9,40
32558,58,9,40
32559,22,9,20


### Get a correlation table on the numeric features

In [38]:
dat_income[numeric_features].corr()

Unnamed: 0,age,edyears,hours-per-week
age,1.0,0.036527,0.068756
edyears,0.036527,1.0,0.148123
hours-per-week,0.068756,0.148123,1.0


### Recode the target Variable (income) and rename as 'high-income'

In [39]:
dat_income = pd.concat([dat_income, pd.get_dummies(dat_income['income'], prefix='income', drop_first=True)], axis=1)
dat_income.rename(columns={'income_ >50K':'high-income'}, inplace=True)
dat_income.drop(['income'], inplace=True, axis=1)
dat_income

Unnamed: 0,age,workclass,education,edyears,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,high-income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,38,United-States,0
32557,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,1
32558,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,40,United-States,0
32559,22,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,20,United-States,0


In [40]:
dat_income

Unnamed: 0,age,workclass,education,edyears,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,high-income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,38,United-States,0
32557,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,40,United-States,1
32558,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,40,United-States,0
32559,22,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,20,United-States,0


### Get Correlations for all numeric values and the target

In [41]:
target_feature = ['high-income']
dat_income[numeric_features+target_feature].corr()

Unnamed: 0,age,edyears,hours-per-week,high-income
age,1.0,0.036527,0.068756,0.234037
edyears,0.036527,1.0,0.148123,0.335154
hours-per-week,0.068756,0.148123,1.0,0.229689
high-income,0.234037,0.335154,0.229689,1.0


### Aggregate numeric values by nominal

In [42]:
dat_income.groupby('workclass').mean()

  dat_income.groupby('workclass').mean()


Unnamed: 0_level_0,age,edyears,hours-per-week,high-income
workclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
?,40.96024,9.260349,31.91939,0.104031
Federal-gov,42.590625,10.973958,41.379167,0.386458
Local-gov,41.751075,11.042045,40.9828,0.294792
Never-worked,20.571429,7.428571,28.428571,0.0
Private,36.797585,9.879714,40.267096,0.218673
Self-emp-inc,46.017025,11.137097,48.8181,0.557348
Self-emp-not-inc,44.969697,10.226289,44.421881,0.284927
State-gov,39.436055,11.375963,39.031587,0.271957
Without-pay,47.785714,9.071429,32.714286,0.0


In [43]:
dat_income.groupby('education').mean()

  dat_income.groupby('education').mean()


Unnamed: 0_level_0,age,edyears,hours-per-week,high-income
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10th,37.429796,6.0,37.052519,0.066452
11th,32.355745,7.0,33.925957,0.051064
12th,32.0,8.0,35.7806,0.076212
1st-4th,46.142857,2.0,38.255952,0.035714
5th-6th,42.885886,3.0,38.897898,0.048048
7th-8th,48.44582,4.0,39.366873,0.06192
9th,41.060311,5.0,38.044747,0.052529
Assoc-acdm,37.381443,12.0,40.504217,0.24836
Assoc-voc,38.553546,11.0,41.610709,0.261216
Bachelors,38.904949,13.0,42.614006,0.414753


In [44]:
dat_income.groupby('sex').mean()

  dat_income.groupby('sex').mean()


Unnamed: 0_level_0,age,edyears,hours-per-week,high-income
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,36.85823,10.035744,36.410361,0.109461
Male,39.433547,10.102891,42.428086,0.305737


### Recode 'sex' column as Male = 1; rename column to 'male'

In [45]:
#dat_income = pd.concat([dat_income, pd.get_dummies(dat_income['sex'], prefix='income', drop_first=True)], axis=1)

#sex_mapping = {'Female':0
#               ,'Male':1
#               }
#dat_income
#dat_income = dat_income.assign(sex = dat_income.sex.map(sex_mapping))
#dat_income.head(10)

#dat_income.rename(columns={'sex':'male'}, inplace=True)
#dat_income.head(10)

df_dummies = pd.get_dummies(dat_income, columns=['sex'])
dat_income = df_dummies.drop(['sex_ Female'], axis=1)
dat_income.rename(columns={'sex_ Male':'male'},inplace = True)
dat_income.head(10)

Unnamed: 0,age,workclass,education,edyears,marital-status,occupation,relationship,race,hours-per-week,native-country,high-income,male
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,40,United-States,0,1
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,13,United-States,0,1
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,40,United-States,0,1
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,40,United-States,0,1
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,40,Cuba,0,0
5,37,Private,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,40,United-States,0,0
6,49,Private,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,16,Jamaica,0,0
7,52,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,45,United-States,1,1
8,31,Private,Masters,14,Never-married,Prof-specialty,Not-in-family,White,50,United-States,1,0
9,42,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,40,United-States,1,1


### Create list of training features.  Then code x and Y as Training and Target

In [46]:
dat_income

Unnamed: 0,age,workclass,education,edyears,marital-status,occupation,relationship,race,hours-per-week,native-country,high-income,male
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,40,United-States,0,1
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,13,United-States,0,1
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,40,United-States,0,1
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,40,United-States,0,1
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,40,Cuba,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,38,United-States,0,0
32557,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,40,United-States,1,1
32558,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,40,United-States,0,0
32559,22,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,20,United-States,0,1


In [47]:
training_features = ['age','edyears','hours-per-week','male']
X=dat_income[training_features]
y=dat_income[target_feature]

### Split training and testing sets, then validate distribution.

In [48]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=101)

In [49]:
y_train.mean()

high-income    0.241664
dtype: float64

In [50]:
y_test.mean()

high-income    0.238817
dtype: float64

### Using Linear Discriminant Analysis

In [56]:
# Note that we are using the full data set
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [57]:
# Evaluate the training model
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

#evaluate model
scores = cross_val_score(lda, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
print(np.mean(scores))

0.79890622875017


In [58]:
# Evaluate the testing model
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

#evaluate model
scores = cross_val_score(lda, X_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1)
print(np.mean(scores))

0.8094990449742997


In [59]:
# Predicting a class based on values
data_array_1 = [45,16,40,1]
data_array_2 = [45,12,40,1]

model.predict([data_array_1,data_array_2])

NameError: name 'model' is not defined

### Using Logistic Regression

In [60]:
from sklearn.linear_model import LogisticRegression
log = LogisticRegression()
log.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [61]:
# Generate predictions based on the model
y_train_pred_log = log.predict(X_train)

# Report the 'actual' vs 'prediction' in a confusion matrix
cm = np.array(confusion_matrix(y_train, y_train_pred_log, labels=[1,0]))
confusion = pd.DataFrame(cm, index=['high_income', 'low_income'],
                         columns=['predicted_high_income','predicted_low_income'])
print(confusion)
print()
print(classification_report(y_train, y_train_pred_log)) 

accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred_log)
accuracy.round(3)

             predicted_high_income  predicted_low_income
high_income                   2075                  3433
low_income                    1133                 16151

              precision    recall  f1-score   support

           0       0.82      0.93      0.88     17284
           1       0.65      0.38      0.48      5508

    accuracy                           0.80     22792
   macro avg       0.74      0.66      0.68     22792
weighted avg       0.78      0.80      0.78     22792



0.8

In [62]:
print('intercept: {}'.format(log.intercept_))
print('coefficients (log odds) (age,edyears,hours-per-week,male): {}'.format(log.coef_))
print('coefficients (odds ratio) (age,edyears,hours-per-week,male): {}'.format(np.exp(log.coef_)))


intercept: [-9.05338672]
coefficients (log odds) (age,edyears,hours-per-week,male): [[0.04511733 0.35136761 0.0352037  1.16801679]]
coefficients (odds ratio) (age,edyears,hours-per-week,male): [[1.04615059 1.42100961 1.03583069 3.21560907]]


### Check accuracy of Test Set

In [63]:
# Generate predictions based on the model
y_test_pred_log = log.predict(X_test)

# Report the 'actual' vs 'prediction' in a confusion matrix
cm = np.array(confusion_matrix(y_test, y_test_pred_log, labels=[1,0]))
confusion = pd.DataFrame(cm, index=['high_income', 'low_income'],
                         columns=['predicted_high_income','predicted_low_income'])
print(confusion)
print()
print(classification_report(y_test, y_test_pred_log)) 

accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred_log)
accuracy.round(3)

             predicted_high_income  predicted_low_income
high_income                    923                  1410
low_income                     449                  6987

              precision    recall  f1-score   support

           0       0.83      0.94      0.88      7436
           1       0.67      0.40      0.50      2333

    accuracy                           0.81      9769
   macro avg       0.75      0.67      0.69      9769
weighted avg       0.79      0.81      0.79      9769



0.81

### Using a Naive Bayes Classifier

In [64]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [65]:
# Generate predictions based on the model
y_train_pred_nb = nb.predict(X_train)

# Report the 'actual' vs 'prediction' in a confusion matrix
cm = np.array(confusion_matrix(y_train, y_train_pred_nb, labels=[1,0]))
confusion = pd.DataFrame(cm, index=['high_income', 'low_income'],
                         columns=['predicted_high_income','predicted_low_income'])
print(confusion)
print()
print(classification_report(y_train, y_train_pred_nb)) 

accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred_nb)
accuracy.round(3)

             predicted_high_income  predicted_low_income
high_income                   2588                  2920
low_income                    1533                 15751

              precision    recall  f1-score   support

           0       0.84      0.91      0.88     17284
           1       0.63      0.47      0.54      5508

    accuracy                           0.80     22792
   macro avg       0.74      0.69      0.71     22792
weighted avg       0.79      0.80      0.79     22792



0.805

### Check accuracy of Test Set

In [66]:
y_test_pred_nb = nb.predict(X_test)
confusion_matrix(y_test,y_test_pred_nb)

# Report the 'actual' vs 'prediction' in a confusion matrix
cm = np.array(confusion_matrix(y_test, y_test_pred_nb, labels=[1,0]))
confusion = pd.DataFrame(cm, index=['high_income', 'low_income'],
                         columns=['predicted_high_income','predicted_low_income'])
print(confusion)
print()
print(classification_report(y_test, y_test_pred_nb)) 

accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred_nb)
accuracy.round(3)

             predicted_high_income  predicted_low_income
high_income                   1121                  1212
low_income                     635                  6801

              precision    recall  f1-score   support

           0       0.85      0.91      0.88      7436
           1       0.64      0.48      0.55      2333

    accuracy                           0.81      9769
   macro avg       0.74      0.70      0.71      9769
weighted avg       0.80      0.81      0.80      9769



0.811