In [1]:
# Import pandas with alias
import pandas as pd

In [2]:
# Load the census CSV into a DataFrame, using the file's first column as the row index.
census = pd.read_csv(
    'census_data.csv',
    index_col=0,
)

In [3]:
# DATA SCIENCE FOUNDATIONS II
# Census Variables
# You have decided to volunteer for your local community by offering to clean their recently collected census data. The description of this dataset is as follows:

# column	description
# first_name	The respondent’s first name.
# last_name	The respondent’s last name.
# birth_year	The respondent’s year of birth.
# voted	If the respondent participated in the current voting cycle.
# num_children	The number of children the respondent has.
# income_year	The average yearly income the respondent earns.
# higher_tax	The respondent’s answer to the question: “Rate your agreement with the statement: the wealthy should pay higher taxes.”
# marital_status	The respondent’s current marital status.

In [4]:

# The census dataframe is composed of simulated census data to represent demographics of a small community in the U.S. Call the .head() method on the census dataframe and print the output to view the first five rows.

In [5]:
census.head()

Unnamed: 0,first_name,last_name,birth_year,voted,num_children,income_year,higher_tax,marital_status
0,Denise,Ratke,2005,False,0,92129.41,disagree,single
1,Hali,Cummerata,1987,False,0,75649.17,neutral,divorced
2,Salomon,Orn,1992,True,2,166313.45,agree,single
3,Sarina,Schiller,1965,False,2,71704.81,strongly agree,married
4,Gust,Abernathy,1945,False,2,143316.08,agree,married


In [6]:

# Review the dataframe description and values returned by .head() to assess the variable types of each of the variables. This is an important step to understand what preprocessing will be necessary to work with the data.

In [7]:
#Compare the values returned from the .head() method with the data types of each variable by calling .dtypes on the census dataframe and print the result.

census.dtypes

first_name         object
last_name          object
birth_year         object
voted                bool
num_children        int64
income_year       float64
higher_tax         object
marital_status     object
dtype: object

In [8]:

# The manager of the census would like to know the average birth year of the respondents. We were able to see from .dtypes that birth_year has been assigned the str datatype whereas it should be expressed in int.

# Print the unique values of the variable using the .unique() method.

census.birth_year.unique()

array(['2005', '1987', '1992', '1965', '1945', '1951', '1963', '1949',
       '1950', '1971', '2007', '1944', '1995', '1973', '1946', '1954',
       '1994', '1989', '1947', '1993', '1976', '1984', 'missing', '1966',
       '1941', '2000', '1953', '1956', '1960', '2001', '1980', '1955',
       '1985', '1996', '1968', '1979', '2006', '1962', '1981', '1959',
       '1977', '1978', '1983', '1957', '1961', '1982', '2002', '1998',
       '1999', '1952', '1940', '1986', '1958'], dtype=object)

In [9]:
# Try changing birth_year to int; it will fail because of the 'missing' value
#census.birth_year = census.birth_year.astype('int')

In [10]:

# There appears to be a missing value in the birth_year column. With some research you find that the respondent’s birth year is 1967.

# Use the .replace() method to replace the missing value with 1967, so that the data type can be changed to int. Then recheck the values in birth_year by calling the .unique() method and printing the results.

# Swap the 'missing' placeholder for the researched birth year (1967).
# The result is assigned back to the column instead of calling
# .replace(..., inplace=True) on census.birth_year: that chained in-place form
# operates on a Series pulled out of the frame, triggers a FutureWarning in
# recent pandas, and does not update `census` once Copy-on-Write is enabled.
# The replacement is written as the string '1967' so the column stays
# homogeneous text until it is cast to int in the next step.
census['birth_year'] = census['birth_year'].replace('missing', '1967')

In [11]:

#Now that we have adjusted the values in the birth_year variable, change the datatype from str to int and print the datatypes of the census dataframe with .dtypes.

# Every birth_year entry is now a parseable year, so cast the column to integers.
census['birth_year'] = census['birth_year'].astype(int)


In [12]:
# Having assigned birth_year to the appropriate data type, print the average birth year of the respondents to the census using the pandas .mean() method.

# birth_year is numeric now, so its mean can be computed directly.
# (A preceding census.describe() call was dropped: a notebook cell renders only
# its last expression, so that summary was computed and silently discarded.)
census['birth_year'].mean()  # 1973.4


1973.4

In [13]:
# Your manager would like to set an order to the higher_tax variable so that: strongly disagree < disagree < neutral < agree < strongly agree.

# Convert the higher_tax variable to the category data type with the appropriate order, then print the new order using the .unique() method.

census.higher_tax.unique()

array(['disagree', 'neutral', 'agree', 'strongly agree',
       'strongly disagree'], dtype=object)

In [14]:
census.higher_tax.unique()

array(['disagree', 'neutral', 'agree', 'strongly agree',
       'strongly disagree'], dtype=object)

In [15]:


# Agreement scale listed from least to most agreement; this order defines how
# the ordered categorical compares its values.
tax_opinion_levels = ['strongly disagree', 'disagree', 'neutral', 'agree', 'strongly agree']
census['higher_tax'] = pd.Categorical(
    census['higher_tax'],
    categories=tax_opinion_levels,
    ordered=True,
)

In [16]:
# 
# Your manager is interested in using machine learning models on the census data in the future. To help, let’s One-Hot Encode marital_status to create binary variables of each category. Use the pandas get_dummies() method to One-Hot Encode the marital_status variable.

# Print the first five rows of the new dataframe with the .head() method. Note that you’ll have to scroll to the right or expand the web-browser to see the dummy variables.

In [17]:
# One-hot encode marital_status.
# Lessons from earlier attempts:
#   - pd.get_dummies(data=census.marital_status) returns ONLY the dummy columns,
#     so assigning it back to `census` clobbers the rest of the frame.
#   - assigning that result to census.marital_status_bool does not work either.
# Passing the whole frame and naming the target column separately via `columns=`
# keeps every other column intact.
census = pd.get_dummies(census, columns=['marital_status'])

In [18]:

# Congratulations! You have used your variable skills to help the census team with managing their data. Feel free to explore the data further. There are additional operations you can perform on the data, such as:

# Create a new variable called marital_codes by Label Encoding the marital_status variable. This could help the Census team use machine learning to predict if a respondent thinks the wealthy should pay higher taxes based on their marital status.

# Create a new variable called age_group, which groups respondents based on their birth year. The groups should be in five-year increments, e.g., 25-30, 31-35, etc. Then label encode the age_group variable to assist the Census team in the event they would like to use machine learning to predict if a respondent thinks the wealthy should pay higher taxes based on their age group.

In [19]:
# We'll do three age groups, and then we move on...

census.birth_year

0     2005
1     1987
2     1992
3     1965
4     1945
      ... 
95    1958
96    2001
97    1987
98    1985
99    1961
Name: birth_year, Length: 100, dtype: int32

In [20]:
# methods to isolate current year

import datetime

# Two ways to reach the current date. Only the last expression of a cell is
# rendered, so the date object below is evaluated but never displayed.
datetime.date.today()
# The year of "now" as a plain int; this is the value shown as the cell output.
datetime.datetime.now().year

#pd.DatetimeIndex.year
#census.age = ( current_y )
#pd.DatetimeIndex(df['Date Attribute']).year

2024

In [21]:
#census.age = ( datetime.datetime.now().year - census.birth_year ) # UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access
# NOTE: `census` has no 'age' column at this point, so the attribute-style
# assignment below does NOT create one. It stores an ordinary Python attribute
# on the DataFrame object (the dtypes listing a few cells later shows no 'age').
# Use census['age'] = ... to add a real column.
census.age =  datetime.datetime.now().year

In [22]:
census.age # readable as a plain attribute of the object, but it is not a column in the df

2024

In [23]:
census.dtypes

first_name                   object
last_name                    object
birth_year                    int32
voted                          bool
num_children                  int64
income_year                 float64
higher_tax                 category
marital_status_divorced        bool
marital_status_married         bool
marital_status_single          bool
marital_status_widowed         bool
dtype: object

In [24]:
#census.describe

In [25]:
# Attribute-style assignment (census.age = 2024 - census.birth_year) runs without
# error but does not add a column; item assignment with brackets is what creates
# the 'age' column in the frame.
census['age'] = 2024 - census['birth_year']

In [26]:
# Sanity check that bracket (item) assignment really creates a column:
# a scalar assigned this way is broadcast to every row.
static_value = 'Male'
census['occupation'] = static_value

# The experiment succeeded, so remove the throwaway column again (in place).
del census['occupation']

In [27]:
# Recompute age from the current calendar year and store it in the 'age' COLUMN.
# Item assignment is required here: an earlier cell ran `census.age = <year>`,
# which left a plain `age` attribute on this DataFrame object. With that
# attribute present, `census.age = ...` only rebinds the attribute and leaves
# the 'age' column untouched; census['age'] = ... always targets the column.
# Note: the result depends on the date the notebook is executed.
census['age'] = datetime.datetime.now().year - census['birth_year']

In [28]:
# finally, we have the age. Now create the brackets
# there are several methods, but here we will use Pandas loc()




# Bucket respondents into three age brackets. Each label is paired with the
# boolean row mask that selects it; the masks are mutually exclusive and are
# applied in the listed order. The first .loc assignment creates the
# 'age_group' column, the following ones fill in the remaining rows.
respondent_age = census['age']
age_group_masks = {
    'Under 18': respondent_age < 18,
    '18 to 49': (respondent_age >= 18) & (respondent_age < 50),
    'Over 50': respondent_age >= 50,
}
for group_label, row_mask in age_group_masks.items():
    census.loc[row_mask, 'age_group'] = group_label

#works!

In [29]:
# One-hot encode age_group: the column is replaced by one indicator column per
# bracket, and all other columns of the frame are carried over unchanged.
census = pd.get_dummies(census, columns=['age_group'])

#works!

In [30]:
census.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 99
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   first_name               100 non-null    object  
 1   last_name                100 non-null    object  
 2   birth_year               100 non-null    int32   
 3   voted                    100 non-null    bool    
 4   num_children             100 non-null    int64   
 5   income_year              100 non-null    float64 
 6   higher_tax               100 non-null    category
 7   marital_status_divorced  100 non-null    bool    
 8   marital_status_married   100 non-null    bool    
 9   marital_status_single    100 non-null    bool    
 10  marital_status_widowed   100 non-null    bool    
 11  age                      100 non-null    int32   
 12  age_group_18 to 49       100 non-null    bool    
 13  age_group_Over 50        100 non-null    bool    
 14  age_group_Under 

In [31]:
census.head()

Unnamed: 0,first_name,last_name,birth_year,voted,num_children,income_year,higher_tax,marital_status_divorced,marital_status_married,marital_status_single,marital_status_widowed,age,age_group_18 to 49,age_group_Over 50,age_group_Under 18
0,Denise,Ratke,2005,False,0,92129.41,disagree,False,False,True,False,19,True,False,False
1,Hali,Cummerata,1987,False,0,75649.17,neutral,True,False,False,False,37,True,False,False
2,Salomon,Orn,1992,True,2,166313.45,agree,False,False,True,False,32,True,False,False
3,Sarina,Schiller,1965,False,2,71704.81,strongly agree,False,True,False,False,59,False,True,False
4,Gust,Abernathy,1945,False,2,143316.08,agree,False,True,False,False,79,False,True,False
