In [0]:
# Evaluate `sc` so the notebook renders it.
# NOTE(review): presumably the SparkContext pre-defined by the Databricks runtime — confirm.
sc

In [0]:
# Location of the Adult data file under the Databricks sample datasets.
path_adult = "/databricks-datasets/adult/adult.data"

# The file is read without a header option, so column names are supplied
# here, in file order, and attached after loading.
field_names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "label",
]

# Load with the default CSV options (no schema is given, so every column is
# read as a string) and rename the positional columns in one chained step.
df_adult = spark.read.csv(path_adult).toDF(*field_names)

In [0]:
# Render the freshly loaded DataFrame (columns named via field_names) as a visual sanity check.
display(df_adult)

age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [0]:
# Total number of rows in df_adult, duplicates included.
df_adult.count()

In [0]:
# Number of rows left once fully identical rows are collapsed; compare with
# the plain row count to see how many exact duplicates the data contains.
df_adult.distinct().count()

In [0]:
import pyspark.sql.functions as F

# Respondents strictly younger than this age are flagged as "young".
YOUNG_AGE_LIMIT = 35
# Respondents working strictly more than this many hours a week (i.e. 40 or
# more) are labelled "Full-time"; everyone else is "Part-time".
FULLTIME_HOURS_THRESHOLD = 39

# The CSV was loaded without a schema, so both columns hold strings. Trim
# them as before, then cast explicitly so the comparisons below are clearly
# numeric instead of relying on Spark's implicit string-to-number coercion.
age_years = F.trim(F.col("age")).cast("int")
weekly_hours = F.trim(F.col("hours-per-week")).cast("int")

# A missing or unparseable value makes the comparison NULL, which falls
# through to the `otherwise` branch (False / "Part-time").
df_adult = df_adult.withColumn(
    "is young",
    F.when(age_years < YOUNG_AGE_LIMIT, True).otherwise(False),
).withColumn(
    "fulltime_parttime",
    F.when(weekly_hours > FULLTIME_HOURS_THRESHOLD, "Full-time").otherwise("Part-time"),
)

In [0]:
# Re-display df_adult to check the two derived columns ("is young", "fulltime_parttime").
display(df_adult)

age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label,is young,fulltime_parttime
39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,False,Full-time
50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,False,Part-time
38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,False,Full-time
53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,False,Full-time
28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,True,Full-time
37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K,False,Full-time
49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K,False,Part-time
52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K,False,Full-time
31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K,True,Full-time
42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K,False,Full-time


In [0]:
# Keep only the three columns used by the marital-status / working-hours tables.
analysis_columns = ["is young", "marital-status", "fulltime_parttime"]
df_adult_select = df_adult.select(*analysis_columns)
display(df_adult_select)

is young,marital-status,fulltime_parttime
False,Never-married,Full-time
False,Married-civ-spouse,Part-time
False,Divorced,Full-time
False,Married-civ-spouse,Full-time
True,Married-civ-spouse,Full-time
False,Married-civ-spouse,Full-time
False,Married-spouse-absent,Part-time
False,Married-civ-spouse,Full-time
True,Never-married,Full-time
False,Married-civ-spouse,Full-time


# Calculation portion for people UNDER the age of 35

In [0]:
# Rows flagged as young, i.e. age < 35 according to the "is young" column.
is_young_df = df_adult_select.where(
    df_adult_select["is young"] == True  # Column comparison, not a Python truth test
)
display(is_young_df)

is young,marital-status,fulltime_parttime
True,Married-civ-spouse,Full-time
True,Never-married,Full-time
True,Married-civ-spouse,Full-time
True,Never-married,Part-time
True,Never-married,Full-time
True,Married-civ-spouse,Full-time
True,Never-married,Part-time
True,Never-married,Full-time
True,Never-married,Full-time
True,Never-married,Full-time


# Table 1 - listed below

Table 1 suggests that people under 35 are predominantly never married or married (Married-civ-spouse).

In [0]:
# Table 1: number of under-35 respondents per marital status.
# Ordered by count (largest first, ties broken by status name) so the dominant
# categories appear at the top and the row order is stable between runs; an
# unsorted groupBy gives no ordering guarantee.
(
    is_young_df
    .groupBy("marital-status")
    .count()
    .orderBy(["count", "marital-status"], ascending=[False, True])
    .show(truncate=False)
)

# Table 2

Table 2 suggests that people under 35 mostly work full-time.

In [0]:
# Table 2: number of under-35 respondents working full-time vs. part-time.
young_worktype_counts = is_young_df.groupBy("fulltime_parttime").count()
young_worktype_counts.show(truncate=False)

# Table 3

Table 3 suggests that people under 35 work full-time more often than part-time, regardless of marital status.

In [0]:
# Table 3: under-35 respondents broken down by marital status and by
# full-time / part-time, listed in (status, work type) order.
(
    is_young_df
    .groupBy("marital-status", "fulltime_parttime")
    .agg(F.count("marital-status"))
    .orderBy("marital-status", "fulltime_parttime")
    .show(truncate=False)
)

# Calculation portion for people aged 35 and OVER

In [0]:
# Rows not flagged as young. The flag is True only for age < 35, so this
# group is age 35 and over, plus any rows whose age could not be compared.
is_not_young_df = df_adult_select.where(
    df_adult_select["is young"] == False  # Column comparison, not a Python truth test
)
display(is_not_young_df)

is young,marital-status,fulltime_parttime
False,Never-married,Full-time
False,Married-civ-spouse,Part-time
False,Divorced,Full-time
False,Married-civ-spouse,Full-time
False,Married-civ-spouse,Full-time
False,Married-spouse-absent,Part-time
False,Married-civ-spouse,Full-time
False,Married-civ-spouse,Full-time
False,Married-civ-spouse,Full-time
False,Married-civ-spouse,Full-time


# Table 4 - listed below

Table 4 suggests that people aged 35 and over are predominantly married (Married-civ-spouse) or divorced.

In [0]:
# Table 4: number of respondents aged 35 and over per marital status.
# Ordered by count (largest first, ties broken by status name) so the dominant
# categories appear at the top and the row order is stable between runs; an
# unsorted groupBy gives no ordering guarantee.
(
    is_not_young_df
    .groupBy("marital-status")
    .count()
    .orderBy(["count", "marital-status"], ascending=[False, True])
    .show(truncate=False)
)

# Table 5

Table 5 suggests that people aged 35 and over mostly work full-time.

In [0]:
# Table 5: number of respondents aged 35 and over working full-time vs. part-time.
older_worktype_counts = is_not_young_df.groupBy("fulltime_parttime").count()
older_worktype_counts.show(truncate=False)

# Table 6

Table 6 suggests that people aged 35 and over work full-time more often than part-time, regardless of marital status. The exception is widows and widowers, who work part-time about as often as full-time.

In [0]:
# Table 6: respondents aged 35 and over broken down by marital status and by
# full-time / part-time, listed in (status, work type) order.
(
    is_not_young_df
    .groupBy("marital-status", "fulltime_parttime")
    .agg(F.count("marital-status"))
    .orderBy("marital-status", "fulltime_parttime")
    .show(truncate=False)
)

# Conclusions

From the tables above we can say that, in general, most people work full-time, regardless of age and marital status. The exception is people aged 35 and over who have been widowed: they are split nearly 50/50 between full-time and part-time (delta = ±2 people).

Also, people aged 35 and over seem more likely to have married and, as a consequence, more likely to have been separated, divorced, or widowed. This may imply that the longer you live, the more likely you are to run into relationship trouble or worse.