In [1]:
import pandas as pd

# Locations of the tract-level and county-level master datasets.
data_dir = 'Exports/Data'
filepath1 = f'{data_dir}/7.TractMasterDataset.csv'
filepath2 = f'{data_dir}/6.CountyMasterDataset.csv'

# Read both CSV files into DataFrames (tract first, then county).
tract_data, county_data = (pd.read_csv(path) for path in (filepath1, filepath2))

***County Data***

In [2]:
# Preview the first five rows of the county-level data.
county_data.head(n=5)

Unnamed: 0,YEAR,SAMPLE,SERIAL,CBSERIAL,HHWT,CLUSTER,STATEFIP,COUNTYFIP,STRATA,GQ,...,STATEFP,COUNTYFP,weighted_ave_HH_conc,weighted_ave_non_HH_conc,weighted_sum_HH_conc,weighted_sum_non_HH_conc,non_weighted_ave_HH_conc,non_weighted_ave_non_HH_conc,non_weighted_sum_HH_conc,non_weighted_sum_non_HH_conc
0,2010,201001,2.0,80.0,97.0,2010000000021,1.0,97.0,220001,1,...,1.0,97.0,0.205318,0.794682,2441.1,9448.275,0.205759,0.794241,343.0,1324.0
1,2010,201001,2.0,80.0,97.0,2010000000021,1.0,97.0,220001,1,...,1.0,97.0,0.205318,0.794682,2441.1,9448.275,0.205759,0.794241,343.0,1324.0
2,2010,201001,2.0,80.0,97.0,2010000000021,1.0,97.0,220001,1,...,1.0,97.0,0.205318,0.794682,2441.1,9448.275,0.205759,0.794241,343.0,1324.0
3,2010,201001,4.0,224.0,82.0,2010000000041,1.0,117.0,130001,1,...,1.0,117.0,,,,,0.075431,0.924569,35.0,429.0
4,2010,201001,4.0,224.0,82.0,2010000000041,1.0,117.0,130001,1,...,1.0,117.0,,,,,0.075431,0.924569,35.0,429.0


In [3]:
# Keep only the rows whose BPL code equals their STATEFP code.
lives_in_birth_state = county_data['BPL'].eq(county_data['STATEFP'])
share_in_birth_state = lives_in_birth_state.mean()
print(f'Share of individuals living in birth state: {share_in_birth_state:.2%}')
print('dropping...')
county_data = county_data.loc[lives_in_birth_state].copy()

Share of individuals living in birth state: 53.34%
dropping...


2010 is 13 years after 1997. Music influences people between the ages of 10 and 17, so I restrict the sample to people who were 10-17 years old in 1997, i.e. aged 23-30 in 2010.

In [4]:
# Keep respondents aged 23 to 30 (both bounds inclusive).
min_age, max_age = 23, 30
county_data = county_data.loc[county_data['AGE'].between(min_age, max_age)].copy()

***Regressing***

In [5]:
# Share of rows where `weighted_ave_HH_conc` is missing.
# Computed outside the f-string: reusing the enclosing quote character inside a
# replacement field is a SyntaxError on Python versions before 3.12.
missing_share = county_data['weighted_ave_HH_conc'].isna().mean()
print(f'{missing_share:.2%} of rows have missing rating data')

# Non-weighted concentration columns. The substring test 'HH_conc' also matches
# the 'non_HH_conc' columns, so both column families end up in this list.
hip_hop_cols = [col for col in county_data.columns if 'HH_conc' in col and 'non_weighted' in col]
print(hip_hop_cols)

84.84% of rows have missing rating data
['non_weighted_ave_HH_conc', 'non_weighted_ave_non_HH_conc', 'non_weighted_sum_HH_conc', 'non_weighted_sum_non_HH_conc']


In [6]:
# Individual-level columns that serve as outcomes and/or controls in the regressions.
outcome_and_control_cols = [
    'SEX',
    'AGE',
    'MARST',
    'RACE',
    'HISPAN',
    'EDUC',
    'EMPSTAT',
    'INCTOT',
]

In [7]:
# Columns needed downstream: outcome/control columns, the hip-hop variable and
# the state/county identifiers.
hip_hop_var = 'non_weighted_ave_HH_conc'
required_cols = [*outcome_and_control_cols, hip_hop_var, 'STATEFP', 'COUNTYFP']

# Report the share of rows with a missing value in any required column, then
# keep only the complete rows.
required_data = county_data[required_cols]
incomplete_share = required_data.isna().any(axis=1).mean()
print(f'rows dropped: {incomplete_share:.2%}')
data_subset = required_data.dropna()

rows dropped: 2.76%


In [8]:
# Work on an independent copy so later edits do not touch `data_subset`.
data_subset1 = data_subset.copy(deep=True)
data_subset1.head(n=5)

Unnamed: 0,SEX,AGE,MARST,RACE,HISPAN,EDUC,EMPSTAT,INCTOT,non_weighted_ave_HH_conc,STATEFP,COUNTYFP
1,2,26.0,1,1.0,0,7,1,13000.0,0.205759,1.0,97.0
4,2,29.0,1,1.0,0,7,1,45000.0,0.075431,1.0,117.0
34,2,27.0,6,1.0,0,6,3,2500.0,0.0,1.0,55.0
35,2,26.0,6,2.0,0,10,1,34000.0,0.086671,1.0,73.0
36,1,27.0,6,2.0,0,10,1,40000.0,0.086671,1.0,73.0


In [9]:
# Min-max scale the hip-hop variable so its values span [0, 1].
min_value = data_subset1[hip_hop_var].min()
max_value = data_subset1[hip_hop_var].max()
value_range = max_value - min_value
data_subset1[hip_hop_var] = data_subset1[hip_hop_var].sub(min_value).div(value_range)

# Cluster identifiers: combine the state and county codes into one number,
# then discard the two source columns.
data_subset1 = data_subset1.assign(
    clusters=data_subset1['STATEFP'] * 1e5 + data_subset1['COUNTYFP']
).drop(columns=['STATEFP', 'COUNTYFP'])


data_subset1.head()

Unnamed: 0,SEX,AGE,MARST,RACE,HISPAN,EDUC,EMPSTAT,INCTOT,non_weighted_ave_HH_conc,clusters
1,2,26.0,1,1.0,0,7,1,13000.0,0.421994,100097.0
4,2,29.0,1,1.0,0,7,1,45000.0,0.154703,100117.0
34,2,27.0,6,1.0,0,6,3,2500.0,0.0,100055.0
35,2,26.0,6,2.0,0,10,1,34000.0,0.177755,100073.0
36,1,27.0,6,2.0,0,10,1,40000.0,0.177755,100073.0


In [None]:
import matplotlib.pyplot as plt

# Number of observations in each county cluster.
cluster_sizes = data_subset1.groupby('clusters').size()

# Histogram the sizes themselves so that x = cluster size and y = number of
# counties, as the axis labels state. Chaining .value_counts() before .hist()
# would instead histogram how often each size occurs.
fig, ax = plt.subplots()
cluster_sizes.hist(ax=ax)
ax.set_xlabel('Cluster Size (Distribution of Observations per County)')
ax.set_ylabel('Frequency (Number of Counties)')
ax.set_title('Number of Observations per County')
plt.show()

In [11]:
# Remove the cluster identifier column from the working frame.
data_subset1 = data_subset1.drop(columns=['clusters'])

In [None]:
import statsmodels.api as sm

# (dependent variable, outcome code) pairs. An outcome code of None keeps the
# variable as-is (cast to int); otherwise the variable is turned into a 0/1 indicator.
# dependent_var_options = [('EDUC', 6), ('INCTOT', None), ('MARST', 4), ('MARST', 6), ('EMPSTAT', 2)]
dependent_var_options = [('INCTOT', None)]

# Fitted models keyed by (dependent variable, race code, hip-hop variable).
# Created once, before the loops, so that every fit is retained; creating it
# inside the loop body would discard all but the last result.
regression_results = {}

# Iterate through dependent variables and outcomes
for dependent_var, dependent_var_outcome in dependent_var_options:
    print(f"Dependent variable: {dependent_var}")

    # One regression per RACE code listed here
    for race_value in [1,2]:
        print(f"RACE: {race_value}")

        # Filter data for the current RACE group.
        # NOTE: this rebinds the notebook-level name `data_subset`.
        data_subset = data_subset1[data_subset1['RACE'] == race_value].copy()

        # Control columns for this outcome (not referenced by the model below,
        # which uses every remaining column of `data_subset`).
        control_cols = [col for col in outcome_and_control_cols if col != dependent_var]

        # Convert dependent variable into a binary variable
        if dependent_var == 'EDUC':
            # EDUC: indicator for a code strictly below the given threshold
            data_subset[dependent_var] = (data_subset[dependent_var] < dependent_var_outcome).astype(int)
        elif dependent_var_outcome is not None:
            # Other coded variables: indicator for an exact code match
            data_subset[dependent_var] = (data_subset[dependent_var] == dependent_var_outcome).astype(int)
        else:
            # No outcome code: keep the values, cast to int
            data_subset[dependent_var] = data_subset[dependent_var].astype(int)

        # One-hot encode categorical regressors. Iterate over a snapshot of the
        # column names because `data_subset` is rebuilt inside the loop.
        for col in list(data_subset.columns):
            if col not in [dependent_var, 'AGE', 'clusters', 'RACE'] and (
                data_subset[col].dtype == 'object' or data_subset[col].nunique() < 10
            ):  # Categorical criteria
                # drop_first=True omits the first category as the reference level
                dummies = pd.get_dummies(data_subset[col], prefix=col, drop_first=True)
                data_subset = pd.concat([data_subset.drop(columns=[col]), dummies], axis=1)

        # Separate dependent and independent variables
        y = data_subset[dependent_var]
        X_all = sm.add_constant(data_subset.drop(columns=[dependent_var, 'RACE']))  # All potential independent variables
        # Sanity checks: aligned row counts and no missing values
        assert X_all.shape[0] == y.shape[0]
        assert not X_all.isna().any().any()
        assert not y.isna().any()
        # Cast everything (including dummy columns) to float for the model
        X_all = X_all.astype(float)

        # Put the hip-hop variable first, followed by every column that is not
        # one of the concentration columns listed in `hip_hop_cols`.
        independent_vars = [hip_hop_var] + [col for col in X_all.columns if col not in hip_hop_cols]
        X = X_all[independent_vars]  # Subset only relevant columns

        # Perform the regression
        model = sm.OLS(y, X)
        results = model.fit()

        # Store the results in the dictionary
        regression_results[(dependent_var, race_value, hip_hop_var)] = results

        # Print summary for each regression
        print(f"Regression for dependent variable {dependent_var}, independent variable {hip_hop_var}, and RACE: {race_value}:")
        print(results.summary())
        print("\n" + "=" * 80 + "\n")

In [None]:
# Extract the coefficients and p-values of the most recent fit (`results`)
# into a DataFrame indexed by regressor name.
coefficients = results.params
p_values = results.pvalues
summary_df = pd.DataFrame({
    'Coefficient': coefficients,
    'P-Value': p_values
})

# Sort or group by variable types
summary_df = summary_df.sort_index()

# Define the mapping dictionaries
mapping = {
    'SEX_1': 'Male',
    'SEX_2': 'Female',
    'EMPSTAT_2.0': 'Unemployed',
    'EMPSTAT_3.0': 'Not in labor force',
    'HISPAN_1.0': 'Mexican',
    'HISPAN_2.0': 'Puerto Rican',
    'HISPAN_3.0': 'Cuban',
    'HISPAN_4.0': 'Other Hispanic',
    'MARST_2.0': 'Married, spouse absent',
    'MARST_3.0': 'Separated',
    'MARST_4.0': 'Divorced',
    'MARST_5.0': 'Widowed',
    'MARST_6.0': 'Never married/single',
    'RACE_2.0': 'Black/African American',
    'RACE_3.0': 'American Indian or Alaska Native',
    'RACE_4.0': 'Chinese',
    'RACE_5.0': 'Japanese',
    'RACE_6.0': 'Other Asian or Pacific Islander',
    'RACE_7.0': 'Other race, nec',
    'RACE_8.0': 'Two major races',
    'const': 'Intercept',
    'INCTOT': 'Total Personal Income',
    'non_weighted_ave_HH_conc': 'Hip Hop Exposure During Adolescence',
    'AGE': 'Age',
    # Add age mappings for clarity (AGE_24.0 -> Age 24, etc.)
    **{f'AGE_{i}.0': f'Age {i}' for i in range(24, 31)}
}

# Dummy column names follow the dtype of the source column ('MARST_2' for
# integer codes, 'MARST_2.0' for float codes), so register both spellings of
# every coded label. Existing entries are never overwritten.
for name, label in list(mapping.items()):
    if name.endswith('.0'):
        mapping.setdefault(name[:-2], label)
    elif name.rpartition('_')[2].isdigit():
        mapping.setdefault(f'{name}.0', label)

# Relabel the regressors, expose them as a 'Variable' column and order the
# rows by p-value. The numeric columns stay numeric so that later cells can
# plot them and compare them with thresholds.
summary_df = (
    summary_df
    .rename(index=mapping)
    .rename_axis('Variable')
    .reset_index()
    .sort_values('P-Value')
)

# Display with four decimals without converting the stored values to strings.
summary_df.round(4)

In [None]:
import matplotlib.pyplot as plt

# Fresh positional index. drop=True matters: `summary_df` already has a
# 'Variable' column, and keeping the old index as another column renamed to
# 'Variable' would create a duplicate column label.
summary_df_reset = summary_df.reset_index(drop=True)

# Coefficients as numbers (they may be stored as formatted strings), labelled
# by variable name.
plot_coefficients = pd.to_numeric(summary_df_reset.set_index('Variable')['Coefficient'])

# Standard errors of the same fit, relabelled with the same mapping and put in
# the same order as the coefficients. They replace the p-values previously
# passed as error bars, which are not a measure of spread.
standard_errors = results.bse.rename(index=mapping).reindex(plot_coefficients.index)

# Plot all coefficients
fig, ax = plt.subplots(figsize=(10, 8))
plot_coefficients.plot(
    kind='bar',
    color='skyblue',
    # Pass a plain array: pandas reindexes a Series `yerr` against the plotted
    # data, and mismatched labels would silently remove the error bars.
    yerr=standard_errors.to_numpy(),
    capsize=4,
    ax=ax
)
# Name the outcome that was actually regressed instead of a fixed label.
outcome_label = mapping.get(dependent_var, dependent_var)
ax.set_title(f'Effect of Independent Variables on {outcome_label}')
ax.set_ylabel('Coefficient')
ax.set_xlabel('Variables')
ax.axhline(0, color='red', linestyle='--', linewidth=1)
ax.tick_params(axis='x', rotation=90)  # Rotate x-axis labels for better visibility
fig.tight_layout()  # Adjust layout to prevent clipping
plt.show()

In [None]:
# Significance threshold applied to the p-values.
alpha = 0.05

# The p-values may be stored as formatted strings; convert them to numbers
# first, because comparing strings with a float raises a TypeError.
p_value_numeric = pd.to_numeric(summary_df['P-Value'])
significant = summary_df[p_value_numeric < alpha]
print("Significant Variables:\n", significant)