In [1]:
# Import dependencies
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from google.colab import drive

In [2]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# `drive` is imported from google.colab, so this cell is expected to run inside Colab;
# mounting may prompt for authorization.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Find the latest version of spark 3.x  from https://downloads.apache.org/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.5.1'
spark_version = 'spark-3.4.4'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [Connecting to archive.ubuntu.com] [1 InRelease 12.7 kB/129 kB 10%] [Connected to cloud.r-project                                                                                                    Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com] [1 InRelease 110 kB/129 kB 85%] [2 InRelease 3,626 B/3,626 B 10% [Connecting to archive.ubuntu.com] [1 InRelease 129 kB/129 kB 100%] [Connected to r2u.stat.illino0% [Connecting to archive.ubuntu.com (91.189.91.82)] [Connected to r2u.stat.illinois.edu (192.17.190                                                                                                    Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Connecting to ppa.launchpadcon                                                 

In [4]:
# Start Spark session
from pyspark.sql import SparkSession

# getOrCreate() hands back an already-running session if there is one,
# otherwise it builds a new session with the app name given here.
spark = (
    SparkSession.builder
    .appName("DataFrame Basics")
    .getOrCreate()
)

In [5]:
# Load the cleaned CSV from the mounted Drive into a Spark dataframe.
# header=True takes column names from the first row; inferSchema=True lets Spark infer column types.
cleaned_csv_path = "/content/drive/My Drive/Project_4/cleaned_df.csv"
cleaned_df = spark.read.csv(cleaned_csv_path, header=True, inferSchema=True)

# Preview the first rows.
cleaned_df.show()

+---+---+-----------------+------+-------------+--------------------+------------------+--------------+-------------------+-------+------------+------------+--------------+--------------+------+
|_c0|age|        workclass|fnlwgt|education-num|      marital-status|        occupation|  relationship|               race|    sex|capital-gain|capital-loss|hours-per-week|native-country|income|
+---+---+-----------------+------+-------------+--------------------+------------------+--------------+-------------------+-------+------------+------------+--------------+--------------+------+
|  0| 39|        State-gov| 77516|           13|       Never-married|      Adm-clerical| Not-in-family|              White|   Male|        2174|           0|            40| United-States|     0|
|  1| 50| Self-emp-not-inc| 83311|           13|  Married-civ-spouse|   Exec-managerial|       Husband|              White|   Male|           0|           0|            13| United-States|     0|
|  2| 38|          Privat

In [6]:
# Convert the Spark dataframe to pandas and discard '_c0'
# (the unnamed first CSV column, which the Spark preview shows as a 0, 1, 2, ... row counter).
cleaned_df_pd = cleaned_df.toPandas().drop(columns=['_c0'])
cleaned_df_pd.head()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [7]:
# Strip leading/trailing whitespace so the comparison below matches exactly
# (no case conversion is applied; the match is case-sensitive).
cleaned_df_pd['native-country'] = cleaned_df_pd['native-country'].str.strip()

# Bin 'native-country' into two groups: 'United States' vs. everything else.
# Anything that is not exactly 'United-States' (including missing values) lands in 'Not United States'.
is_united_states = cleaned_df_pd['native-country'].eq('United-States')
cleaned_df_pd['native-country-binned'] = is_united_states.map(
    {True: 'United States', False: 'Not United States'}
)

# Show how many rows fall into each bin
print(cleaned_df_pd['native-country-binned'].value_counts())

native-country-binned
United States        29170
Not United States     3391
Name: count, dtype: int64


In [8]:
# Drop the raw 'native-country' column; 'native-country-binned' replaces it as a feature.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
cleaned_df_pd = cleaned_df_pd.drop(columns=['native-country'], errors='ignore')
cleaned_df_pd.head()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,income,native-country-binned
0,39,State-gov,77516,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,0,United States
1,50,Self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,0,United States
2,38,Private,215646,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,0,United States
3,53,Private,234721,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,0,United States
4,28,Private,338409,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,0,Not United States


In [9]:
# Columns holding string categories that must become numeric before modelling
categorical_columns = [
    'workclass',
    'marital-status',
    'occupation',
    'race',
    'relationship',
    'sex',
    'native-country-binned',
]

# One-hot encode those columns. drop_first=True omits the first level of each
# category, so a k-level column yields k-1 indicator columns.
# NOTE(review): the displayed column names (e.g. 'workclass_ Federal-gov') suggest the raw
# category values carry a leading space — consider stripping them upstream; confirm first.
df_encoded = pd.get_dummies(
    data=cleaned_df_pd,
    columns=categorical_columns,
    drop_first=True,
)
df_encoded.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,income,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,...,race_ Black,race_ Other,race_ White,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife,sex_ Male,native-country-binned_United States
0,39,77516,13,2174,0,40,0,False,False,False,...,False,False,True,True,False,False,False,False,True,True
1,50,83311,13,0,0,13,0,False,False,False,...,False,False,True,False,False,False,False,False,True,True
2,38,215646,9,0,0,40,0,False,False,False,...,False,False,True,True,False,False,False,False,True,True
3,53,234721,7,0,0,40,0,False,False,False,...,True,False,False,False,False,False,False,False,True,True
4,28,338409,13,0,0,40,0,False,False,False,...,True,False,False,False,False,False,False,True,False,False


In [10]:
# Separate the target column ('income') from the feature columns
y = df_encoded['income']
X = df_encoded.drop(columns=['income'])

# Hold out 20% of the rows for testing; random_state=1 makes the shuffle repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [11]:
# Standardize the features. The scaler is fitted on the training split only,
# so no statistics from the test split leak into preprocessing.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)  # fit() returns the fitted scaler

# Apply the training-set scaling to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [12]:
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

In [13]:
# Define the model - a feed-forward binary classifier with two hidden layers.
# The input width is taken from the scaled training matrix, so it always matches
# the number of encoded feature columns.
n_features = X_train_scaled.shape[1]

nn = tf.keras.models.Sequential([
    # Explicit Input layer: passing input_dim/input_shape to the first Dense layer
    # is deprecated in current Keras and emits a UserWarning.
    tf.keras.Input(shape=(n_features,)),

    # First hidden layer (dropout after it regularizes training)
    tf.keras.layers.Dense(units=160, activation='relu'),
    tf.keras.layers.Dropout(0.4),

    # Second hidden layer
    tf.keras.layers.Dense(units=100, activation='relu'),
    tf.keras.layers.Dropout(0.4),

    # Output layer: a single sigmoid unit giving the probability of the positive class
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [14]:
from tensorflow.keras.optimizers import Adam

# Adam optimizer with the learning rate stated explicitly
# (0.001 is also Adam's default in Keras; lower it here to experiment).
learning_rate = 0.001
optimizer = Adam(learning_rate=learning_rate)

In [15]:
# Compile the model for binary classification.
# Pass the Adam instance configured in the previous cell; the string "adam" would build a
# fresh default optimizer and silently ignore any learning-rate change made there.
# If the model is rebuilt, re-run the optimizer cell too so a fresh optimizer is used.
nn.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

In [None]:
# Train the model on the scaled training features for 100 epochs;
# the object returned by fit() is kept in fit_model.
fit_model = nn.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=100,
)

Epoch 1/100
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.8017 - loss: 0.4075
Epoch 2/100
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8488 - loss: 0.3315
Epoch 3/100
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8517 - loss: 0.3211
Epoch 4/100
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8532 - loss: 0.3113
Epoch 5/100
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8557 - loss: 0.3125
Epoch 6/100
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.8556 - loss: 0.3177
Epoch 7/100
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.8574 - loss: 0.3094
Epoch 8/100
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8539 - loss: 0.3107
Epoch 9/100
[1m814/814[0m [32

In [None]:
# Score the trained model on the held-out test split.
# evaluate() returns the loss followed by the compiled metrics (here: accuracy).
test_metrics = nn.evaluate(x=X_test_scaled, y=y_test, verbose=2)
model_loss, model_accuracy = test_metrics
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

204/204 - 0s - 2ms/step - accuracy: 0.8561 - loss: 0.3161
Loss: 0.3160630464553833, Accuracy: 0.8561338782310486
