In [17]:
from matplotlib.colors import Normalize
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
adult_census = pd.read_csv(r'/Users/oluwa/Desktop/Python/advanced-python-datasci-master/data/adult-census.csv')

In [None]:
adult_census.head()

In [None]:
target_column = 'class'
adult_census[target_column].value_counts()

In [None]:
features = adult_census.drop(columns='class')
features.head()

In [None]:
numeric_columns = features.select_dtypes(include=np.number).columns.values
categorical_columns = features.drop(columns=numeric_columns).columns.values

In [None]:
print(f'''
There are {features.shape[0]} observations and {features.shape[1]} features.

Numeric features: {', '.join(numeric_columns)}.

Categorical features: {', '.join(categorical_columns)}.
''')

In [None]:
adult_census.hist(figsize=(20,14));

In [None]:
adult_census['sex'].value_counts()

In [None]:
adult_census['education'].value_counts()

In [None]:
pd.crosstab(
    index=adult_census['education'],
    columns=adult_census['education-num']
)

In [None]:
# We will plot a subset of the data to keep the plot readable and make the
# plotting faster
n_samples_to_plot = 5000
columns = ['age', 'education-num', 'hours-per-week']
sns.pairplot(data=adult_census[:n_samples_to_plot], vars=columns,
             hue=target_column, plot_kws={'alpha': 0.2},
             height=3, diag_kind='hist', diag_kws={'bins': 30});

In [None]:
ax = sns.scatterplot(
    x="age", y="hours-per-week", data=adult_census[:n_samples_to_plot],
    hue="class", alpha=0.5,
)

age_limit = 27
plt.axvline(x=age_limit, ymin=0, ymax=1, color="black", linestyle="--")

hours_per_week_limit = 40
plt.axhline(
    y=hours_per_week_limit, xmin=0.18, xmax=1, color="black", linestyle="--"
)

plt.annotate("<=50K", (17, 25), rotation=90, fontsize=35)
plt.annotate("<=50K", (35, 20), fontsize=35)
plt.annotate("???", (45, 60), fontsize=35);

Modelling with scikit-learn 

In [None]:
target_col = "class"
feature_col = adult_census.drop(columns=target_col).select_dtypes(np.number).columns.values

In [None]:
target = adult_census[target_col]
target

In [None]:
features = adult_census[feature_col]
features

In [None]:
print(
    f"The dataset contains {features.shape[0]} samples and "
    f"{features.shape[1]} features"
)

In [None]:
from sklearn import set_config
set_config(display='diagram')

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# define the algorithm
model = KNeighborsClassifier()

# fit the model
model.fit(features, target)

target_predicted = model.predict(features)
target_predicted
# accuracy of first 5 predictions
target[:5] == target_predicted[:5]

(target == target_predicted).mean()

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features, 
    target, 
    random_state=123, 
    test_size=0.25,
    stratify=target
)

In [None]:
y_train.shape

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
y_test.shape

In [None]:
y_train.value_counts(normalize=True)

KNeighbors Classifier

In [None]:
model = KNeighborsClassifier()

model.fit(X_train, y_train)

# score model on test data
accuracy = model.score(X_test, y_test)

print(f'The test accuracy using {model.__class__.__name__} is {round(accuracy, 4) * 100}%')

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(X_train, y_train)

accuracy_logistic = model.score(X_test, y_test)
accuracy_logistic

In [25]:
# Modular Code
def get_features_and_target():
    '''Split a CSV into a DF of numeric features and a target column.'''
    adult_census = pd.read_csv(r'/Users/oluwa/Desktop/Python/advanced-python-datasci-master/data/adult-census.csv')

    target_col = "class"
    
    raw_features = adult_census.drop(columns=target_col)
    numeric_features = raw_features.select_dtypes(np.number)
    feature_cols = numeric_features.columns.values

    features = adult_census[feature_cols]
    target = adult_census[target_col]
    
    return (features, target)

In [19]:
f, t = get_features_and_target()
f.head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week
0,25,7,0,0,40
1,38,9,0,0,50
2,28,12,0,0,40
3,44,10,7688,0,40
4,18,10,0,0,30


In [None]:
t.head()

In [22]:
def get_features_and_target(csv_file, target_col):
    '''Split a CSV into a DF of numeric features and a target column.'''
    
    adult_census = pd.read_csv(csv_file)
    
    raw_features = adult_census.drop(columns=target_col)
    numeric_features = raw_features.select_dtypes(np.number)
    feature_cols = numeric_features.columns.values

    features = adult_census[feature_cols]
    target = adult_census[target_col]
    
    return (features, target)

In [23]:
f, t = get_features_and_target(
    csv_file=r'/Users/oluwa/Desktop/Python/advanced-python-datasci-master/data/adult-census.csv',
    target_col='class',
)

In [None]:
features.shape

In [None]:
f.head()

In [None]:
t.head()

In [None]:
import my_module

In [None]:
x = my_module.reverse_and_capitalize('oluwatimilehin')

In [None]:
x

In [2]:
import my_module

In [4]:
features, target = my_module.get_features_and_target(csv_file='../data/adult-census.csv', target_col='class')

In [5]:
features

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week
0,25,7,0,0,40
1,38,9,0,0,50
2,28,12,0,0,40
3,44,10,7688,0,40
4,18,10,0,0,30
...,...,...,...,...,...
48837,27,12,0,0,38
48838,40,9,0,0,40
48839,58,9,0,0,40
48840,22,9,0,0,20


In [6]:
target

0         <=50K
1         <=50K
2          >50K
3          >50K
4         <=50K
          ...  
48837     <=50K
48838      >50K
48839     <=50K
48840     <=50K
48841      >50K
Name: class, Length: 48842, dtype: object

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features, 
    target, 
    random_state=123, 
    train_size=0.75,
    stratify=target
)

In [9]:
X_train.shape

(36631, 5)

In [10]:
features.shape

(48842, 5)

In [11]:
target

0         <=50K
1         <=50K
2          >50K
3          >50K
4         <=50K
          ...  
48837     <=50K
48838      >50K
48839     <=50K
48840     <=50K
48841      >50K
Name: class, Length: 48842, dtype: object

In [12]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8258946851199738

Feature Engineering

In [13]:
from sklearn import set_config
set_config(display='diagram')

In [26]:
# separate feature & target data
target = adult_census['class']
features = adult_census.drop(columns='class')

NameError: name 'adult_census' is not defined

In [27]:
from sklearn.compose import make_column_selector as selector

# create selector object based on data type
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

In [28]:
# get columns of interest
numerical_columns = numerical_columns_selector(features)
categorical_columns = categorical_columns_selector(features)

In [29]:
# results in a list containing relevant column names
numerical_columns

['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

In [30]:
numerical_features = features[numerical_columns]
numerical_features.describe()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,10.078089,1079.067626,87.502314,40.422382
std,13.71051,2.570973,7452.019058,403.004552,12.391444
min,17.0,1.0,0.0,0.0,1.0
25%,28.0,9.0,0.0,0.0,40.0
50%,37.0,10.0,0.0,0.0,40.0
75%,48.0,12.0,0.0,0.0,45.0
max,90.0,16.0,99999.0,4356.0,99.0


In [31]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(numerical_features)

In [32]:
scaler.mean_

array([  38.64358544,   10.07808853, 1079.06762622,   87.50231358,
         40.42238238])