Environment Set-up

In [1]:
import pandas as pd
import pyreadstat
from sas7bdat import SAS7BDAT
import numpy as np

Importing Data

In [2]:
# Read the ACS 5-year (2018-2022) CSV into memory. The path is relative to the
# directory the notebook is run from.
acs_csv_path = "Data/ACS_5YR/2018_2022/psam_p06.csv"
acs_raw = pd.read_csv(acs_csv_path)

In [3]:
# Load the CHIS dummy adult file (SAS7BDAT format). The context manager closes
# the underlying file handle once the frame has been materialised.
chis_sas_path = 'Data/CHIS Dummy/Adult 2022/dummy_adult.sas7bdat'
with SAS7BDAT(chis_sas_path) as sas_reader:
    chis_raw = sas_reader.to_data_frame()

In [7]:
# Peek at the first five SERIALNO values to see how the identifier is laid out.
acs_raw["SERIALNO"].head(5)

0    2018GQ0000004
1    2018GQ0000013
2    2018GQ0000016
3    2018GQ0000020
4    2018GQ0000027
Name: SERIALNO, dtype: object

Data Manipulation Toolbox - DataToolBox

In [5]:
import numpy as np
import pandas as pd

class DataToolBox:
    """
    Small stateful wrapper around a pandas DataFrame for step-by-step data preparation.

    The wrapped frame is held in ``self.data``. ``data_exclude`` and ``data_construct``
    rebind ``self.data`` to a new DataFrame; the other methods only read or print.
    """

    def __init__(self, data):
        """
        Initialize the DataToolBox with a dataset.

        :param data: A pandas DataFrame that contains the data to be analyzed and manipulated.
                     It is stored by reference; no copy is made here.
        """
        self.data = data

    def return_data(self):
        """
        Return the current state of the data stored in the toolbox.

        :return: The pandas DataFrame currently held in ``self.data`` (the object itself, not a copy).
        """
        return self.data

    def data_desc(self):
        """
        Print the number of observations (rows) and variables (columns) of the current data.
        """
        shape = self.data.shape
        print("---------Current Data State----------")
        print(shape[0], "obs;", shape[1], "vars")
        print("")

    def data_exclude(self, condition: str):
        """
        Filter the data with a query expression and report how many rows were dropped.

        Despite the method name, rows for which ``condition`` evaluates to True are
        KEPT and every other row is removed, because the expression is handed to
        ``DataFrame.query``. In other words, ``condition`` describes the rows to
        retain; the rows being excluded are those that do not satisfy it.

        :param condition: A ``DataFrame.query`` expression selecting the rows to retain.
        """
        kept = self.data.query(condition)
        n_before = self.data.shape[0]
        n_after = kept.shape[0]

        print("---------Obs Filter-----------------")
        print("applying condition: ", condition)
        print(n_before - n_after, "/", n_before, "cases were removed")
        print("new obs #: ", n_after)
        print("")

        # query() returned a new frame; the previously held frame is not modified.
        self.data = kept

    def data_construct(self, col_name, conditions_str, choices, default=-1):
        """
        Construct a new column in the data based on multiple conditions.

        Each condition string is evaluated against the data with ``DataFrame.eval``;
        the column is then filled via ``numpy.select``, so when several conditions
        hold for a row the first one in the list decides the value.

        :param col_name: Name of the column to add (an existing column of that name is overwritten).
        :param conditions_str: A list of conditions (as strings) that determine the value to be assigned.
        :param choices: A list of values, one per condition, in the same order as ``conditions_str``.
        :param default: The value assigned when none of the conditions are met. Default is -1.
        :raises ValueError: If ``conditions_str`` and ``choices`` differ in length.
        """
        # Work on a copy so the frame held before this call is left untouched.
        frame = self.data.copy()

        masks = [frame.eval(expression) for expression in conditions_str]
        if len(masks) != len(choices):
            raise ValueError(
                "conditions_str and choices must have the same length "
                f"(got {len(masks)} conditions and {len(choices)} choices)"
            )
        frame[col_name] = np.select(masks, choices, default=default)

        self.data = frame

    def freq_1way(self, col_name):
        """
        Print the frequency distribution of a single column.

        Counts come from ``Series.value_counts`` with its default settings.

        :param col_name: The name of the column for which the frequency distribution is to be calculated.
        """
        print(self.data[col_name].value_counts())

    def freq_2way(self, col_name_1, col_name_2):
        """
        Print a two-way frequency table (crosstab) between two columns.

        :param col_name_1: The name of the column whose values form the table rows.
        :param col_name_2: The name of the column whose values form the table columns.
        """
        # pd.crosstab already returns a DataFrame, so no extra wrapping is needed.
        print(pd.crosstab(self.data[col_name_1], self.data[col_name_2]))


In [10]:
# Work on a copy so the frame read from disk (acs_raw) stays untouched.
acs_working = acs_raw.copy()

# Characters at 0-based positions 4-5 of SERIALNO hold a two-letter code.
# Slice it once and reuse it for both comparisons.
record_type = acs_working['SERIALNO'].str[4:6]

# 'GQ' -> 1, 'HU' -> 0, anything else -> -1.
# (INGRPQ presumably stands for "in group quarters" - confirm.)
conditions = [record_type == 'GQ', record_type == 'HU']
choices = [1, 0]

acs_working['INGRPQ'] = np.select(conditions, choices, default=-1)

In [15]:
# Display the DIS column; pandas abbreviates the long Series in the cell output.
acs_working.loc[:, 'DIS']

0          2
1          2
2          1
3          1
4          2
          ..
1839923    2
1839924    2
1839925    2
1839926    2
1839927    2
Name: DIS, Length: 1839928, dtype: int64

ACS Processing

In [12]:
# Wrap the working ACS frame, print its dimensions, then filter it.
# data_exclude hands the string to DataFrame.query, so the rows with
# INGRPQ == 0 are the ones retained; all other rows are dropped.
acs = DataToolBox(data=acs_working)
acs.data_desc()
acs.data_exclude("INGRPQ == 0")

---------Current Data State----------
1839928 obs; 291 vars

---------Obs Filter-----------------
applying condition:  INGRPQ == 0
89131 / 1839928 cases were removed
new obs #:  1750797



CHIS Processing

In [None]:
# Give the CHIS frame its own toolbox instance and print its dimensions.
chis = DataToolBox(data=chis_raw)
chis.data_desc()

1 - HOME_LANG: Language Spoken at Home

In [13]:
# NOTE(review): this cell currently fails with UndefinedVariableError because
# the data held by `acs` has no HHL column. Presumably HHL has to be brought in
# from another ACS file before this step - TODO confirm and add it upstream.
home_lang_codes = [1, 2, 3, 4, 5]

# HOME_LANG copies each listed HHL code one-to-one; rows matching none of the
# codes receive data_construct's default value.
acs.data_construct(
    'HOME_LANG',
    [f"HHL == {code}" for code in home_lang_codes],
    home_lang_codes,
)
acs.freq_1way('HOME_LANG')

UndefinedVariableError: name 'HHL' is not defined