In [1]:
import pandas as pd
import seaborn as sns

# Adult Income Data

The data are available from the [UCI data repository][uciad]. These data were extracted by Barry Becker from the 1994 Census database and consist of the following features: age, workclass, fnlwgt, education, education-num, marital-status, occupation, relationship, race, sex, capital-gain, capital-loss, hours-per-week, native-country, and salary. Of these, six are continuous: age, fnlwgt, education-num, capital-gain, capital-loss, and hours-per-week; the others are discrete. The last column, salary, is discrete and contains one of two strings to indicate whether the salary was below or above $50,000.

-----
[uciad]: https://archive.ics.uci.edu/ml/datasets/Adult

In [20]:
# Adult data archived at UCI ML Repository

# Fetch over HTTPS (the same scheme and host as the repository link in the
# notebook text) so the download is not made over a plaintext connection.
data_file = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# Column names are supplied explicitly via `names=`; because no `header=` is
# given, the first line of the file is read as data, not as a header.
col_names = ['Age', 'Workclass', 'FNLWGT', 'Education',
             'EducationLevel', 'MaritalStatus', 'Occupation',
             'Relationship', 'Race', 'Sex', 'CapitalGain', 'CapitalLoss',
             'HoursPerWeek', 'NativeCountry', 'Salary']

# Read the CSV straight from the URL into a pandas DataFrame.
# index_col=False stops pandas from using the first column as the row index.
# NOTE(review): string values appear to keep the space that follows each comma
# in the raw file (e.g. ' <=50K' in the value_counts output). Passing
# skipinitialspace=True would strip it, but that changes the saved CSVs --
# confirm with downstream users before enabling.
adult_data = pd.read_csv(data_file, index_col=False, names=col_names)

# Keep an unmodified local copy of the full data set (row index not written).
adult_data.to_csv('adult_income_original.csv', index=False)

# Column dtypes, non-null counts and memory footprint.
adult_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
Age               32561 non-null int64
Workclass         32561 non-null object
FNLWGT            32561 non-null int64
Education         32561 non-null object
EducationLevel    32561 non-null int64
MaritalStatus     32561 non-null object
Occupation        32561 non-null object
Relationship      32561 non-null object
Race              32561 non-null object
Sex               32561 non-null object
CapitalGain       32561 non-null int64
CapitalLoss       32561 non-null int64
HoursPerWeek      32561 non-null int64
NativeCountry     32561 non-null object
Salary            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [29]:
# Class balance of the target column across the full data set.
adult_data['Salary'].value_counts()

 <=50K    24720
 >50K      7841
Name: Salary, dtype: int64

In [64]:
# Draw a 4000-row random subset; the fixed random_state makes the draw repeatable.
df = adult_data.sample(n=4000, random_state=11)

# Class balance of the target column within the subset.
df['Salary'].value_counts()

 <=50K    3084
 >50K      916
Name: Salary, dtype: int64

In [65]:
# Replace the row labels carried over from sampling with a fresh 0..n-1 index
# (drop=True discards the old labels instead of keeping them as a column).
df = df.reset_index(drop=True)
df.head(n=5)

Unnamed: 0,Age,Workclass,FNLWGT,Education,EducationLevel,MaritalStatus,Occupation,Relationship,Race,Sex,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Salary
0,62,Local-gov,68268,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,<=50K
1,50,Private,215990,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,>50K
2,36,Private,185405,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,50,United-States,>50K
3,64,Private,258006,Some-college,10,Widowed,Adm-clerical,Not-in-family,White,Female,0,0,40,Cuba,<=50K
4,28,Self-emp-not-inc,39388,Assoc-voc,11,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,60,United-States,<=50K


In [67]:
df.to_csv('adult_income.csv', index=False)
!ls -l

total 9536
-rw-r--r--  1 lindenlu  staff   468721 Sep 16 12:34 adult_income.csv
-rw-r--r--  1 lindenlu  staff   468721 Sep 16 11:16 adult_income2.csv
-rw-r--r--  1 lindenlu  staff  3811650 Sep 16 11:09 adult_income_original.csv
-rw-r--r--  1 lindenlu  staff    10966 Sep 16 11:18 prepare_data.ipynb


# MPG Data

In [3]:
mpg = sns.load_dataset('mpg')
mpg.dropna(inplace=True)
mpg.to_csv('mpg.csv', index=False)
!ls -l

total 8552
-rw-r--r--  1 lindenlu  staff   468721 Sep 16 12:34 adult_income.csv
-rw-r--r--  1 lindenlu  staff  3811650 Sep 16 11:09 adult_income_original.csv
-rw-r--r--  1 lindenlu  staff    20940 Sep 17 09:12 mpg.csv
-rw-r--r--  1 lindenlu  staff    10965 Sep 16 12:36 prepare_data.ipynb
