# Activity 03 - Datasets

***
##### CS 434 - Data Mining and Machine Learning
##### Oregon State University-Cascades
***


# Load Packages

In [0]:
import ssl
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# NOTE(review): the next line swaps the ssl module's default HTTPS context
# factory for the unverified one, so certificate verification is turned off for
# code that relies on that default, for the rest of the kernel session.
# Presumably a workaround for certificate errors when downloading the dataset
# over HTTPS -- confirm it is still needed, and do not reuse this pattern
# outside a classroom notebook.
ssl._create_default_https_context = ssl._create_unverified_context

# Dataset

### Location

https://archive.ics.uci.edu/ml/datasets/census+income

### Description

Predict whether income exceeds $50K/yr based on census data. Also known as "Adult" dataset.

> Extraction was done by Barry Becker from the 1994 Census database. A set of reasonably clean records was extracted using the following conditions: `((AAGE>16) && (AGI>100) && (AFNLWGT>1)&& (HRSWK>0))`

### Attributes
* `age`: continuous
* `workclass`: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked
* `fnlwgt`: continuous
* `education`: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
* `education-num`: continuous; named `education-value` in this notebook's column list.
* `marital-status`: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
* `occupation`: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
* `relationship`: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
* `race`: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
* `sex`: Female, Male.
* `capital-gain`: continuous.
* `capital-loss`: continuous.
* `hours-per-week`: continuous.
* `native-country`: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.

### Class label

* `>50K`, `<=50K`: binary label, true if annual income is at or below $50K (`<=50K`)

In [0]:
# Location of the raw "Adult" data file.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

# Column names, in the order listed under "Attributes" above, with the class
# label ('income') last.
attributes = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-value',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

*** 
# Exercise #1 - Load data
*** 

##### 1.1 Load the dataset from the url and `display(df)`. 

In [0]:
# load the dataset into a dataframe
# (read from `url`, with `attributes` as the column names; later cells refer to
# the result as `df`, and the heading asks for display(df))
print('your code here')

##### 1.2 Count the number of records

In [0]:
# count number of records
# (i.e. the number of rows in `df`)
print('your code here')

##### 1.3 Create a mapping from the `'income'` strings to integers

In [0]:
# create a mapping dict to convert class labels from strings to integers
# (one entry per distinct 'income' string)
# NOTE: the 'education' strings carry an extra leading space in this notebook;
# check whether the 'income' strings do too before writing the keys.
print('your code here')

##### 1.4 Convert `'income'` strings to integers

In [0]:
# convert class labels from strings to integers using the mapping
# (apply the dict from 1.3 to the 'income' column)
print('your code here')

##### 1.5 List the `unique()` values of `'education'`

In [0]:
# list the unique values of 'education'
# (use unique() on the column, as the heading asks)
print('your code here')

> Note the extra space in the string.  We'll convert this data column in the next section. 

*** 
# Exercise #2 - Mapping ordinal data
*** 

##### 2.1 Count the values in `'education'` using `value_counts()`

In [0]:
# count the values in 'education'
# (value_counts() reports how many rows carry each education level)
print('your code here')

##### 2.2 Select and display only the `'education'` and `'education-value'` columns

In [0]:
# display two columns: 'education' and 'education-value'
# (select both columns from `df` by name)
print('your code here')

> Notice that `'education-value'` is an ordinal representation of `'education'`.
>
> For practice, let's convert the `'education'` column to an ordinal numbering and then we can check it against `'education-value'`.

##### 2.3 Define a map that maps the categorical `'education'` value to an ordinal number. For example `' Preschool' : 1`.

In [0]:
# define an ordinal mapping for education
# Keys keep the leading space that the raw 'education' strings carry, so they
# must be typed exactly as shown (e.g. ' 9th', not '9th').
# Values are ordinal ranks starting at 1 for ' Preschool'; the entries given
# here stop at ' 12th' (rank 8).
education_mapping = {' Preschool' : 1, ' 1st-4th' : 2, ' 5th-6th' : 3, ' 7th-8th' : 4, 
                ' 9th' : 5, ' 10th' : 6, ' 11th' : 7, ' 12th' : 8, 
                
                ###################
                # fill in the rest: add the remaining education levels here,
                # continuing the ranking from 9
                ###################

                }

> Again note the extra space in the string.

##### 2.4 Apply the ordinal mapping using `map`

In [0]:
# apply the mapping and display dataframe
# (map 'education' through `education_mapping` and store the result back in
# df['education'], since the check in 2.5 compares that column with
# 'education-value')
print('your code here')

##### 2.5 Confirm your new mapping of `'education'` matches the given mapping.

In [0]:
# True, if the two columns match 
# NOTE(review): per the pandas docs, Series.equals compares shape and elements
# and also expects matching dtypes. An incomplete mapping presumably leaves NaN
# (and a float dtype) in 'education' after map, which would make this False --
# confirm against the pandas documentation.
df['education'].equals(df['education-value'])

***
# Exercise #3 - One-hot encoding 
***

##### 3.1 Use `get_dummies(.)` for `'sex'` and store in new dataframe `dummies`

In [0]:
# create dummies for 'sex'
# (store the result of get_dummies in a new dataframe named `dummies`)
print('your code here')

##### 3.2 Drop column 'sex'

In [0]:
# drop column 'sex' as it is now encoded
# (the one-hot columns held in `dummies` take its place once joined in 3.3)
print('your code here')

##### 3.3 `join(.)` dataframe `dummies` with `df` and `display`

In [0]:
# join the encoded df dummies with df
# (then display the joined dataframe)
print('your code here')

##### 3.4 Rename columns to be named `{male, female}`

> **Example**: rename columns
> <pre>df = df.rename({'a': 'x', 'b': 'y'}, axis=1)</pre>



In [0]:
# rename to the lower case equivalent 
# (target column names are 'male' and 'female'; follow the rename example in
# the text of 3.4)
print('your code here')

##### 3.5 Find the sums of the `female` and the `male` columns, respectively

In [0]:
# count female
# (sum of the 'female' column)
print('your code here')

In [0]:
# count male
# (sum of the 'male' column)
print('your code here')

***
# Exercise #4 - Features on same scale
***

##### 4.1 Describe the summary statistics for `'hours-per-week'`

In [0]:
# describe 'hours-per-week'
# (summary statistics before scaling, for comparison with 4.3)
print('your code here')

##### 4.2 Standardize the column `'hours-per-week'`

In [0]:
# use StandardScaler to fit and transform the column 'hours-per-week'
print('your code here')

##### 4.3 Describe the summary statistics for `'hours-per-week'` again

In [0]:
# describe 'hours-per-week' again
# (after standardizing, expect a mean close to 0 and a std close to 1)
print('your code here')

##### 4.4 Standardize `'capital-gain'` and `'capital-loss'`

In [0]:
# standardize 'capital-gain' and 'capital-loss'
print('your code here')

##### 4.5 Describe the summary statistics for `'capital-gain'` and `'capital-loss'`

In [0]:
# describe 'capital-gain' and 'capital-loss'
print('your code here')

***
# Exercise #5 - Partition into train and test sets
***

##### 5.1 Partition into `X` (features) and `y` (class labels)

In [0]:
# partition into attributes X and class labels y
# (y is the 'income' class-label column; X is the remaining feature columns)
print('your code here')

##### 5.2 Split into train and test sets (test size 0.3), stratified by `y`

In [0]:
# split into train and test (both X and y)
# (train_test_split is imported at the top; the heading asks for a 0.3 test
# share stratified by y -- consider also fixing random_state so the split is
# reproducible)
print('your code here')

##### 5.3 Count the number of examples in `X_train`

In [0]:
# count number of examples in X_train
# (i.e. the number of rows in X_train)
print('your code here')

##### 5.4 Count the number of examples in `X_test`



In [0]:
# count number of examples in X_test
# (i.e. the number of rows in X_test)
print('your code here')

##### 5.5 Verify the size of `X_test` is $ \approx{0.3} $ of the total number of examples

In [0]:
# divide |X_test| by (|X_train|+|X_test|)
# (the result should be approximately 0.3)
print('your code here')

<img src="https://66.media.tumblr.com/dded9d1a2bf2068f92af9f7a9b6b5451/tumblr_p6s3hbPzgV1vd8jsjo1_500.gifv" width="300">