In [92]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Raise pandas' display limits so wide and long frames are printed with less truncation.
for option_name, limit in (
    ('display.max_columns', 500),
    ('display.width', 1000),
    ('display.max_rows', 500),
):
    pd.set_option(option_name, limit)

In [93]:
# Load the four input CSV files (paths are relative to the working directory).
birth, edu, spouse_edu, marstat = (
    pd.read_csv(csv_path)
    for csv_path in ('birth.csv', 'edu.csv', 'spouse_edu.csv', 'marstat.csv')
)

In [94]:
# Name the three columns of the birth frame by position.
birth = birth.set_axis(['id', 'birth_year', 'gender'], axis=1)
# Rename the raw column codes in edu to the names used in the rest of the notebook.
edu = edu.rename(columns={'R0000100': 'id', 'T9900000': 'grade'})
# Attach each person's grade; people without a match in edu keep NaN (left join).
birth = birth.merge(edu.loc[:, ['id', 'grade']], how='left', on='id')

In [95]:
# Ages covered by the panel: 21 through 60 inclusive.
age_values = np.arange(21, 61)
num_repeats = age_values.size

# Build one row per (person, age): every birth row is repeated once per age
# with a person's copies kept adjacent, and the age vector is tiled to line
# up with those repeats.
sample = (
    birth.loc[birth.index.repeat(num_repeats)]
    .reset_index(drop=True)
    .assign(age=np.tile(age_values, len(birth)))
)

In [96]:
# Label the columns: 'id', then yearly 1979-1994, then every second year 1996-2018.
survey_years = [*range(1979, 1995), *range(1996, 2019, 2)]
marstat.columns = ['id'] + survey_years

# Negative values: -4 is recoded to 0 first, then every remaining negative
# entry is turned into NaN. The order matters, so -4 never becomes NaN.
marstat = marstat.replace({-4: 0})
negative_cells = marstat.lt(0)
marstat[negative_cells] = np.nan

In [97]:
# Collapse the marital-status codes into a 0/1 dummy:
#   0 (never married), 3 (divorced), 6 (widowed) -> 0
#   1, 2, 5                                      -> 1
# Any other value, including NaN, is left as it is.
columns_to_replace = marstat.columns[1:]  # every column except the first
status_codes = marstat[columns_to_replace].to_numpy()
marstat[columns_to_replace] = np.select(
    [np.isin(status_codes, [0, 3, 6]), np.isin(status_codes, [1, 2, 5])],
    [0, 1],
    default=status_codes,
)

In [98]:
# Years 1995, 1997, ..., 2017 get an all-NaN placeholder column appended to marstat.
new_cols = [*range(1995, 2018, 2)]
marstat = marstat.reindex(columns=[*marstat.columns, *new_cols])

In [99]:
# Put the columns in chronological order: 'id' followed by every year 1979-2018.
new_column_order = ['id', *range(1979, 2019)]
marstat = marstat.loc[:, new_column_order]

In [100]:
# Fill gaps along each row (i.e. across the year columns): carry the last
# known value forward, then back-fill whatever is still missing at the start
# of the row. DataFrame.ffill/bfill are used because pandas has deprecated
# passing fill methods ('ffill'/'bfill') to DataFrame.interpolate.
col_interpolate = marstat.columns[1:]  # every column except the first
marstat[col_interpolate] = marstat[col_interpolate].ffill(axis=1).bfill(axis=1)

In [101]:
# Reshape from one column per year to one row per (id, year) pair.
marstat_melted = marstat.melt(id_vars=['id'], var_name='year', value_name='marstat')

In [102]:
# Label the columns: 'id', then yearly 1979-1994, then every second year 1996-2018.
spouse_edu.columns = ['id', *range(1979, 1995), *range(1996, 2019, 2)]
# Append an all-NaN placeholder column for each year listed in new_cols.
spouse_edu = spouse_edu.reindex(columns=[*spouse_edu.columns, *new_cols])

In [103]:
# Reorder the columns to follow new_column_order.
spouse_edu = spouse_edu.loc[:, new_column_order]

In [104]:
# Treat every negative entry as missing.
negative_cells = spouse_edu.lt(0)
spouse_edu[negative_cells] = np.nan

In [105]:
# Fill gaps along each row (i.e. across the year columns): carry the last
# known value forward, then back-fill whatever is still missing at the start
# of the row. DataFrame.ffill/bfill are used because pandas has deprecated
# passing fill methods ('ffill'/'bfill') to DataFrame.interpolate.
col_interpolate = spouse_edu.columns[1:]  # every column except the first
spouse_edu[col_interpolate] = spouse_edu[col_interpolate].ffill(axis=1).bfill(axis=1)

In [106]:
# Reshape from one column per year to one row per (id, year) pair.
spouse_edu_melted = spouse_edu.melt(id_vars=['id'], var_name='year', value_name='spouse_grade')

In [107]:
# Attach birth_year, make 'year' an integer, and derive the age in each year
# as year - 1900 - birth_year.
spouse_edu_melted = (
    spouse_edu_melted
    .merge(birth[['id', 'birth_year']], how='left', on='id')
    .assign(year=lambda frame: frame['year'].astype(int))
    .assign(age=lambda frame: frame['year'] - 1900 - frame['birth_year'])
)

In [108]:
# Bring spouse_grade into the panel, matched on person and age; panel rows
# without a match keep NaN (left join).
spouse_by_age = spouse_edu_melted.loc[:, ['id', 'age', 'spouse_grade']]
sample = sample.merge(spouse_by_age, how='left', on=['id', 'age'])

In [109]:
# Attach birth_year so each survey year can be converted to the person's age.
marstat_melted = pd.merge(marstat_melted, birth[['id', 'birth_year']], on='id', how='left')
marstat_melted['year'] = marstat_melted['year'].astype(int)
# Age in a given year is year - 1900 - birth_year. The birth_year must come
# from marstat_melted itself: reading it from spouse_edu_melted only gives the
# right answer when both frames happen to share the same row order and index.
marstat_melted['age'] = marstat_melted['year'] - 1900 - marstat_melted['birth_year']

In [110]:
# Bring the marital-status dummy into the panel, matched on person and age;
# panel rows without a match keep NaN (left join).
marstat_by_age = marstat_melted.loc[:, ['id', 'age', 'marstat']]
sample = sample.merge(marstat_by_age, how='left', on=['id', 'age'])

In [111]:
# Within each id, fill a missing value from the next row of the same id that
# has one. The built-in GroupBy.bfill gives the same result as
# transform(lambda x: x.bfill()) without calling a Python function per group.
sample['marstat'] = sample.groupby('id')['marstat'].bfill()
sample['spouse_grade'] = sample.groupby('id')['spouse_grade'].bfill()

In [112]:
# Blank out spouse_grade on rows where marstat is 0; all other rows keep their value.
sample['spouse_grade'] = sample['spouse_grade'].mask(sample['marstat'].eq(0))

In [113]:
# Rows with marstat == 1 that still have no spouse_grade.
df = sample.loc[sample['marstat'].eq(1) & sample['spouse_grade'].isna()]

In [114]:
# Drop every row belonging to a person whose id appears in df.
# A direct membership test replaces the left-merge-with-indicator anti-join:
# df['id'] can hold the same id many times, so the merge duplicated each
# matching row once per occurrence before throwing all of them away.
# Row order is unchanged; .copy() gives an independent frame so that later
# column assignments do not act on a slice.
sample = sample[~sample['id'].isin(df['id'])].copy()

In [115]:
# Rows at age 21 where marstat is already 1.
df = sample.loc[sample['age'].eq(21) & sample['marstat'].eq(1)]

In [116]:
# Drop every row belonging to a person whose id appears in df.
# A direct membership test expresses this anti-join without building the
# intermediate merged frame and its '_merge' indicator column.
# Row order is unchanged; .copy() gives an independent frame so that later
# column assignments do not act on a slice.
sample = sample[~sample['id'].isin(df['id'])].copy()

In [117]:
# Add a 'mardur' column set to 0 on every row.
sample = sample.assign(mardur=0)

In [118]:
# Keep rows with a non-negative grade; a NaN grade fails the comparison and is dropped too.
sample = sample.loc[sample['grade'].ge(0)]

In [119]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the sample.
sample.describe()

Unnamed: 0,id,birth_year,gender,grade,age,spouse_grade,marstat,mardur
count,381920.0,381920.0,381920.0,381920.0,381920.0,177469.0,357524.0,381920.0
mean,6232.138877,60.437683,1.444491,13.422392,40.5,13.552389,0.496383,0.0
std,3649.654275,2.235038,0.49691,2.591834,11.543411,3.314985,0.499988,0.0
min,1.0,57.0,1.0,0.0,21.0,0.0,0.0,0.0
25%,3093.75,58.0,1.0,12.0,30.75,12.0,0.0,0.0
50%,6241.5,60.0,1.0,12.0,40.5,13.0,0.0,0.0
75%,9339.25,62.0,2.0,15.0,50.25,16.0,1.0,0.0
max,12685.0,64.0,2.0,20.0,60.0,99.0,1.0,0.0


In [120]:
# Create a new column 'edu' based on 'grade'
def assign_edu(grade):
    """Map a grade value to a five-level education code.

    Parameters
    ----------
    grade : number
        The grade value to classify.

    Returns
    -------
    int or float
        0 if grade < 12, 1 if grade == 12, 2 if 12 < grade < 16,
        3 if grade == 16, 4 if grade > 16. A missing grade (NaN/None)
        returns NaN.
    """
    # A NaN compares False against every threshold below, so without this
    # guard a missing grade would fall through to the top category (4).
    if pd.isna(grade):
        return np.nan
    if grade < 12:
        return 0
    if grade == 12:
        return 1
    if grade < 16:
        return 2
    if grade == 16:
        return 3
    return 4

# Classify every row's grade into its education code.
sample['edu'] = sample['grade'].map(assign_edu)

In [121]:
# Split the panel by the gender code: 2 goes to `female`, 1 goes to `male`.
is_female = sample['gender'].eq(2)
is_male = sample['gender'].eq(1)
female, male = sample.loc[is_female], sample.loc[is_male]

In [122]:
# For the female rows, count how often each marstat value occurs in every
# (age, edu) cell, one column per marstat value (missing combinations -> 0).
# The two resulting columns are then relabelled by position, which assumes
# they are the marstat values 0 and 1 in that order.
equilibrium_f = (
    female.groupby(['age', 'edu'])['marstat']
    .value_counts()
    .unstack(fill_value=0)
    .set_axis(['single_f', 'married_f'], axis=1)
    .reset_index()
)

In [129]:
# For the male rows, count how often each marstat value occurs in every
# (age, edu) cell, one column per marstat value (missing combinations -> 0).
# The two resulting columns are then relabelled by position, which assumes
# they are the marstat values 0 and 1 in that order.
equilibrium_m = (
    male.groupby(['age', 'edu'])['marstat']
    .value_counts()
    .unstack(fill_value=0)
    .set_axis(['single_m', 'married_m'], axis=1)
    .reset_index()
)

In [133]:
# Put the female and male counts side by side per (age, edu) cell. This is a
# left join, so only cells present in the female table are kept.
equilibrium = equilibrium_f.merge(equilibrium_m, how='left', on=['age', 'edu'])

In [134]:
# Display the combined female/male counts per (age, edu) cell.
equilibrium

Unnamed: 0,age,edu,single_f,married_f,single_m,married_m
0,21,0,322,0,649,0
1,21,1,1547,0,2390,0
2,21,2,1124,0,1138,0
3,21,3,675,0,616,0
4,21,4,576,0,511,0
5,22,0,296,26,590,59
6,22,1,1344,203,2198,192
7,22,2,993,131,1069,69
8,22,3,638,37,593,23
9,22,4,548,28,495,16


In [135]:
# Write the person-age panel and the aggregated counts to CSV, without the index column.
for frame, csv_path in ((sample, 'sample79.csv'), (equilibrium, 'equilibrium.csv')):
    frame.to_csv(csv_path, index=False)