## 1. Import and load

In [100]:
import pandas as pd
import numpy as np

In [101]:
# Read the NLS97 CSV and use `personid` as the row index.
# `set_index` is chained rather than called with `inplace=True`, so the frame
# is built in one expression and never mutated after it is bound to `nls97`.
nls97 = pd.read_csv('data/nls97.csv').set_index('personid')

In [102]:
# First attempt: convert every object-dtype column to `category` by assigning
# the converted columns back through `.loc` with a boolean column mask.
# NOTE: the `info()` output recorded below still lists these columns as
# `object`, so this assignment did not change their dtypes in this run.
# NOTE(review): presumably `.loc[:, mask] = ...` writes the values into the
# existing columns instead of replacing them — confirm for the pandas version in use.
nls97.loc[:, nls97.dtypes == 'object'] = nls97.select_dtypes(['object']).apply(lambda x: x.astype('category'))

nls97.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8984 entries, 100061 to 999963
Data columns (total 88 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gender                 8984 non-null   object 
 1   birthmonth             8984 non-null   int64  
 2   birthyear              8984 non-null   int64  
 3   highestgradecompleted  6663 non-null   float64
 4   maritalstatus          6672 non-null   object 
 5   childathome            4791 non-null   float64
 6   childnotathome         4791 non-null   float64
 7   wageincome             5091 non-null   float64
 8   weeklyhrscomputer      5792 non-null   object 
 9   weeklyhrstv            6711 non-null   object 
 10  nightlyhrssleep        6706 non-null   float64
 11  satverbal              1406 non-null   float64
 12  satmath                1407 non-null   float64
 13  gpaoverall             6004 non-null   float64
 14  gpaenglish             5798 non-null   float64
 15  gp

In [103]:
# Collect the labels of every column whose dtype is still `object` ...
object_cols = nls97.select_dtypes(include='object').columns
# ... and cast exactly those columns to `category`, leaving the rest untouched.
nls97 = nls97.astype({label: 'category' for label in object_cols})

nls97.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8984 entries, 100061 to 999963
Data columns (total 88 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   gender                 8984 non-null   category
 1   birthmonth             8984 non-null   int64   
 2   birthyear              8984 non-null   int64   
 3   highestgradecompleted  6663 non-null   float64 
 4   maritalstatus          6672 non-null   category
 5   childathome            4791 non-null   float64 
 6   childnotathome         4791 non-null   float64 
 7   wageincome             5091 non-null   float64 
 8   weeklyhrscomputer      5792 non-null   category
 9   weeklyhrstv            6711 non-null   category
 10  nightlyhrssleep        6706 non-null   float64 
 11  satverbal              1406 non-null   float64 
 12  satmath                1407 non-null   float64 
 13  gpaoverall             6004 non-null   float64 
 14  gpaenglish             5798 non-null  

## 2. Select a column using the pandas `[]` bracket operator, and the `loc` and `iloc` accessors.

We pass a string matching a column name to the bracket operator to return a pandas `Series`.

If we pass a list of one element with that column name (`nls97[['gender']]`), a `DataFrame` is returned. We can also use the `loc` and `iloc` accessors to select columns; passing them a list of columns likewise returns a `DataFrame`.

In [104]:
# A single column label inside `[]` returns that column as a Series.
analysis_demo = nls97['gender']
type(analysis_demo)

pandas.core.series.Series

In [105]:
# A one-element list of labels inside `[]` returns a DataFrame instead.
analysis_demo = nls97[['gender']]
type(analysis_demo)

pandas.core.frame.DataFrame

In [106]:
# `.loc` with `:` for the rows and a list of column labels also returns a DataFrame.
analysis_demo = nls97.loc[:, ['gender']]
type(analysis_demo)

pandas.core.frame.DataFrame

In [107]:
# `.iloc` selects by integer position; position 0 is the `gender` column
# (column #0 in the `info()` listing above). A list of positions returns a DataFrame.
analysis_demo = nls97.iloc[:, [0]]
type(analysis_demo)

pandas.core.frame.DataFrame

## 3. Select multiple columns from a pandas DataFrame

Use the bracket operator or `loc` with a list of column names to select a few columns.

In [108]:
# Select three columns by passing a list of labels to `[]`;
# `.shape` confirms that all rows are kept and only three columns remain.
analysis_demo = nls97[['gender', 'maritalstatus', 'highestgradecompleted']]
analysis_demo.shape

(8984, 3)

In [109]:
# Preview the first rows of the bracket-operator selection.
analysis_demo.head()

Unnamed: 0_level_0,gender,maritalstatus,highestgradecompleted
personid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100061,Female,Married,13.0
100139,Male,Married,12.0
100284,Male,Never-married,7.0
100292,Male,,
100583,Male,Married,13.0


In [110]:
# The same three-column selection written with `.loc` (all rows, listed columns);
# the resulting shape matches the bracket-operator version.
analysis_demo = nls97.loc[:, ['gender', 'maritalstatus', 'highestgradecompleted']]
analysis_demo.shape

(8984, 3)

In [111]:
# Preview the first rows of the `.loc` selection.
analysis_demo.head()

Unnamed: 0_level_0,gender,maritalstatus,highestgradecompleted
personid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100061,Female,Married,13.0
100139,Male,Married,12.0
100284,Male,Never-married,7.0
100292,Male,,
100583,Male,Married,13.0


## 4. Select multiple columns based on a list of columns
If you are selecting more than a few columns, it is helpful to create the list of column names separately.

Here, we create a `key_vars` list of key variables for analysis:

In [112]:
# Keep the column names of interest in a named list so the selection below
# stays short and the list is easy to edit.
key_vars = [
    'gender',
    'maritalstatus',
    'highestgradecompleted',
    'wageincome',
    'gpaoverall',
    'weeksworked17',
    'colenroct17',
]

# Passing the list to `[]` keeps only those columns, in the order listed.
analysis_keys = nls97[key_vars]
analysis_keys.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8984 entries, 100061 to 999963
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   gender                 8984 non-null   category
 1   maritalstatus          6672 non-null   category
 2   highestgradecompleted  6663 non-null   float64 
 3   wageincome             5091 non-null   float64 
 4   gpaoverall             6004 non-null   float64 
 5   weeksworked17          6670 non-null   float64 
 6   colenroct17            6734 non-null   category
dtypes: category(3), float64(4)
memory usage: 377.8 KB


## 5. Select one or more columns by filtering on column names.

Select all of the `weeksworked##` columns using the `filter` method:

In [113]:
# `filter(like=...)` keeps the columns whose names contain the given substring;
# here that yields the `weeksworked00` … `weeksworked17` columns listed below.
analysis_work = nls97.filter(like='weeksworked')
analysis_work.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8984 entries, 100061 to 999963
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   weeksworked00  8603 non-null   float64
 1   weeksworked01  8564 non-null   float64
 2   weeksworked02  8556 non-null   float64
 3   weeksworked03  8490 non-null   float64
 4   weeksworked04  8458 non-null   float64
 5   weeksworked05  8403 non-null   float64
 6   weeksworked06  8340 non-null   float64
 7   weeksworked07  8272 non-null   float64
 8   weeksworked08  8186 non-null   float64
 9   weeksworked09  8146 non-null   float64
 10  weeksworked10  8054 non-null   float64
 11  weeksworked11  7968 non-null   float64
 12  weeksworked12  7747 non-null   float64
 13  weeksworked13  7680 non-null   float64
 14  weeksworked14  7612 non-null   float64
 15  weeksworked15  7389 non-null   float64
 16  weeksworked16  7068 non-null   float64
 17  weeksworked17  6670 non-null   float64
dtypes: flo

## 6. Select all columns with the `category` data type.

Use the `select_dtypes` method to select columns by data type:

In [114]:
# Keep only the columns whose dtype is `category`
# (the former object columns converted earlier in the notebook).
analysis_cat = nls97.select_dtypes(include=['category'])
analysis_cat.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8984 entries, 100061 to 999963
Data columns (total 57 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   gender                 8984 non-null   category
 1   maritalstatus          6672 non-null   category
 2   weeklyhrscomputer      5792 non-null   category
 3   weeklyhrstv            6711 non-null   category
 4   highestdegree          8953 non-null   category
 5   govprovidejobs         1833 non-null   category
 6   govpricecontrols       1859 non-null   category
 7   govhealthcare          1874 non-null   category
 8   govelderliving         1872 non-null   category
 9   govindhelp             1815 non-null   category
 10  govunemp               1811 non-null   category
 11  govincomediff          1775 non-null   category
 12  govcollegefinance      1875 non-null   category
 13  govdecenthousing       1847 non-null   category
 14  govprotectenvironment  1860 non-null  

## 7. Select all columns with numeric data types:

In [115]:
# `include=['number']` keeps the numeric columns — the int64 and float64
# columns in the listing below.
analysis_nums = nls97.select_dtypes(include=['number'])
analysis_nums.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8984 entries, 100061 to 999963
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   birthmonth             8984 non-null   int64  
 1   birthyear              8984 non-null   int64  
 2   highestgradecompleted  6663 non-null   float64
 3   childathome            4791 non-null   float64
 4   childnotathome         4791 non-null   float64
 5   wageincome             5091 non-null   float64
 6   nightlyhrssleep        6706 non-null   float64
 7   satverbal              1406 non-null   float64
 8   satmath                1407 non-null   float64
 9   gpaoverall             6004 non-null   float64
 10  gpaenglish             5798 non-null   float64
 11  gpamath                5766 non-null   float64
 12  gpascience             5684 non-null   float64
 13  weeksworked00          8603 non-null   float64
 14  weeksworked01          8564 non-null   float64
 15  we

## 8. Organize columns using lists of column names.

Use lists to organize the columns in your `DataFrame`. 

You can easily change the order of columns or exclude some columns in this way.

Here, we move the columns in the `demo_adult` list to the front:

In [None]:
demo = ['gender', 'birthmonth', 'birthyear']

highschool_record = ['satverbal', 'satmath', 'gpaoverall', 'gpaenglish', 'gpamath', 'gpascience']

gov_resp = ['govprovidejobs', 'govpricecontrols', 'govhealthcare', 'govelderliving', 'govindhelp', 'govunemp',
            'govincomediff', 'govcollegefinance', 'govdecenthousing', 'govprotectenvironment']

demo_adult = ['highestgradecompleted', 'maritalstatus', 'childathome', 'childnotathome', 'wageincome',
              'weeklyhrscomputer', 'weeklyhrstv', 'nightlyhrssleep', 'highestdegree']

# and then group them up together