In [1]:
!pip install pyspark



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import count
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, FloatType, DoubleType, LongType
from pyspark.sql import functions as F
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd

In [3]:
spark = SparkSession.builder.appName("Predictive Analytics with Spark MLlib").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/28 13:41:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Data loading

In [4]:
df = spark.read.format("csv").load("adult.csv", header=True, inferSchema=True)

In [5]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)



In [6]:
df.show()

+---+----------------+------+------------+---------------+------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+------+
|age|       workclass|fnlwgt|   education|educational-num|    marital-status|       occupation| relationship|              race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|
+---+----------------+------+------------+---------------+------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+------+
| 25|         Private|226802|        11th|              7|     Never-married|Machine-op-inspct|    Own-child|             Black|  Male|           0|           0|            40| United-States| <=50K|
| 38|         Private| 89814|     HS-grad|              9|Married-civ-spouse|  Farming-fishing|      Husband|             White|  Male|           0|           0|            50| United-States| <=50K|
| 28|

In [7]:
# Set an id column to dataframe
df = df.withColumn('id', monotonically_increasing_id())

df = df[['id'] + df.columns[:-1]]

df.show(3)

+---+---+---------+------+----------+---------------+------------------+-----------------+------------+-----+------+------------+------------+--------------+--------------+------+
| id|age|workclass|fnlwgt| education|educational-num|    marital-status|       occupation|relationship| race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|
+---+---+---------+------+----------+---------------+------------------+-----------------+------------+-----+------+------------+------------+--------------+--------------+------+
|  0| 25|  Private|226802|      11th|              7|     Never-married|Machine-op-inspct|   Own-child|Black|  Male|           0|           0|            40| United-States| <=50K|
|  1| 38|  Private| 89814|   HS-grad|              9|Married-civ-spouse|  Farming-fishing|     Husband|White|  Male|           0|           0|            50| United-States| <=50K|
|  2| 28|Local-gov|336951|Assoc-acdm|             12|Married-civ-spouse|  Protective-serv|     Husba

In [8]:
df.count()

48842

# Data cleaning

In [9]:
# Check for null values in each column
df.select([col(c).isNull().cast("int").alias(c) for c in df.columns]).show()

+---+---+---------+------+---------+---------------+--------------+----------+------------+----+------+------------+------------+--------------+--------------+------+
| id|age|workclass|fnlwgt|education|educational-num|marital-status|occupation|relationship|race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|
+---+---+---------+------+---------+---------------+--------------+----------+------------+----+------+------------+------------+--------------+--------------+------+
|  0|  0|        0|     0|        0|              0|             0|         0|           0|   0|     0|           0|           0|             0|             0|     0|
|  0|  0|        0|     0|        0|              0|             0|         0|           0|   0|     0|           0|           0|             0|             0|     0|
|  0|  0|        0|     0|        0|              0|             0|         0|           0|   0|     0|           0|           0|             0|             0|     0

In [10]:
# Check unique values for non-numeric columns
for column in df.columns:
    column_type = df.schema[column].dataType
    if not isinstance(column_type, (IntegerType, FloatType, DoubleType, LongType)):
        print(f'{column}:')
        df.select(column).distinct().show()
        print('-' * 50)

workclass:
+----------------+
|       workclass|
+----------------+
|Self-emp-not-inc|
|       Local-gov|
|       State-gov|
|         Private|
|     Without-pay|
|     Federal-gov|
|    Never-worked|
|               ?|
|    Self-emp-inc|
+----------------+

--------------------------------------------------
education:
+------------+
|   education|
+------------+
|        10th|
|     Masters|
|     5th-6th|
|  Assoc-acdm|
|   Assoc-voc|
|     7th-8th|
|         9th|
|     HS-grad|
|   Bachelors|
|        11th|
|     1st-4th|
|   Preschool|
|        12th|
|   Doctorate|
|Some-college|
| Prof-school|
+------------+

--------------------------------------------------
marital-status:
+--------------------+
|      marital-status|
+--------------------+
|           Separated|
|       Never-married|
|Married-spouse-ab...|
|            Divorced|
|             Widowed|
|   Married-AF-spouse|
|  Married-civ-spouse|
+--------------------+

--------------------------------------------------
occupa

In [11]:
# Check for invalid values represented by '?' in each column
for column in df.columns:
    invalid_values_df = df.filter(df[column] == '?')
    invalid_values_count = invalid_values_df.count()
    if invalid_values_count > 0:
        print(f"Column '{column}' has {invalid_values_count} invalid values.")
        invalid_values_df.show(5) # Show a few examples of invalid values

# Alternatively, to check if there are any invalid values across the entire dataset
invalid_values_count = df.select(
    [F.when(F.col(c) == '?', 1).otherwise(0).alias(c) for c in df.columns]
).agg(*[F.sum(c).alias(c) for c in df.columns])

invalid_values_count.show()

Column 'workclass' has 2799 invalid values.
+---+---+---------+------+------------+---------------+------------------+----------+-------------+-----+------+------------+------------+--------------+--------------+------+
| id|age|workclass|fnlwgt|   education|educational-num|    marital-status|occupation| relationship| race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|
+---+---+---------+------+------------+---------------+------------------+----------+-------------+-----+------+------------+------------+--------------+--------------+------+
|  4| 18|        ?|103497|Some-college|             10|     Never-married|         ?|    Own-child|White|Female|           0|           0|            30| United-States| <=50K|
|  6| 29|        ?|227026|     HS-grad|              9|     Never-married|         ?|    Unmarried|Black|  Male|           0|           0|            40| United-States| <=50K|
| 13| 58|        ?|299831|     HS-grad|              9|Married-civ-spouse|  

In [12]:
# 'workclass', 'occupation', and 'native-country' are the columns with potential invalid values
invalid_columns = ['workclass', 'occupation', 'native-country']

# Create a condition to check for '?' in any of the specified columns
condition = F.lit(False)  # Initialize with False
for column in invalid_columns:
    condition = condition | (F.col(column) == '?')  # Add condition for each column

# Filter the DataFrame to exclude rows matching the condition (rows with '?' in any of the columns)
df = df.filter(~condition)

# Show the cleaned DataFrame
df.show()

+---+---+----------------+------+------------+---------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+
| id|age|       workclass|fnlwgt|   education|educational-num|    marital-status|       occupation| relationship| race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|
+---+---+----------------+------+------------+---------------+------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+
|  0| 25|         Private|226802|        11th|              7|     Never-married|Machine-op-inspct|    Own-child|Black|  Male|           0|           0|            40| United-States| <=50K|
|  1| 38|         Private| 89814|     HS-grad|              9|Married-civ-spouse|  Farming-fishing|      Husband|White|  Male|           0|           0|            50| United-States| <=50K|
|  2| 28|       Local-gov|336951|  Assoc-acdm|    

In [13]:
df.count()

45222

# Feature engineering

In [14]:
#  Replacing education categories into new 4 categories
df = df.withColumn(
    'education',
    F.when(df['education'].isin(['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th']), 'School')
    .when(df['education'].isin(['HS-grad', 'Some-college']), 'High School')
    .when(df['education'].isin(['Assoc-voc', 'Assoc-acdm']), 'Associates')
    .when(df['education'].isin(['Prof-school', 'Bachelors', 'Masters', 'Doctorate']), 'Higher Education')
    .otherwise(df['education'])
)

In [15]:
# Replacing marital-status categories into 2 new categories
df = df.withColumn(
    'marital-status',
    F.when(df['marital-status'].isin(['Married-civ-spouse', 'Married-AF-spouse', 'Married-spouse-absent']), 'Married')
    .when(df['marital-status'].isin(['Never-married', 'Divorced', 'Separated', 'Widowed']), 'Single')
    .otherwise(df['marital-status'])
)

In [16]:
# Create 'capital_income' (capital-gain - capital-loss) column
df = df.withColumn('capital_income', df['capital-gain'] - df['capital-loss'])

In [17]:
# Drop the 'educational-num' and 'fnlwgt' columns because they don't give any value
df = df.drop('educational-num', 'fnlwgt')

In [18]:
#  Convert the 'income' column to numerical values (0 for <=50K, 1 for >50K)
df = df.withColumn('income', F.when(df['income'] == '<=50K', 0).otherwise(1))

In [19]:
# Show the modified DataFrame
df.show(5)

+---+---+---------+-----------+--------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+--------------+
| id|age|workclass|  education|marital-status|       occupation| relationship| race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|capital_income|
+---+---+---------+-----------+--------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+--------------+
|  0| 25|  Private|     School|        Single|Machine-op-inspct|    Own-child|Black|  Male|           0|           0|            40| United-States|     0|             0|
|  1| 38|  Private|High School|       Married|  Farming-fishing|      Husband|White|  Male|           0|           0|            50| United-States|     0|             0|
|  2| 28|Local-gov| Associates|       Married|  Protective-serv|      Husband|White|  Male|           0|           0|            40| United-States|   

In [20]:
df.dtypes

[('id', 'bigint'),
 ('age', 'int'),
 ('workclass', 'string'),
 ('education', 'string'),
 ('marital-status', 'string'),
 ('occupation', 'string'),
 ('relationship', 'string'),
 ('race', 'string'),
 ('gender', 'string'),
 ('capital-gain', 'int'),
 ('capital-loss', 'int'),
 ('hours-per-week', 'int'),
 ('native-country', 'string'),
 ('income', 'int'),
 ('capital_income', 'int')]

# Data transformation

In [21]:
df.show()

+---+---+----------------+----------------+--------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+--------------+
| id|age|       workclass|       education|marital-status|       occupation| relationship| race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|capital_income|
+---+---+----------------+----------------+--------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+--------------+
|  0| 25|         Private|          School|        Single|Machine-op-inspct|    Own-child|Black|  Male|           0|           0|            40| United-States|     0|             0|
|  1| 38|         Private|     High School|       Married|  Farming-fishing|      Husband|White|  Male|           0|           0|            50| United-States|     0|             0|
|  2| 28|       Local-gov|      Associates|       Married|  Protective-serv|      Husband|

In [22]:
# Divide all columns into categorical and numerical for ease
numerical_features = ['age', 'capital-gain', 'capital-loss', 'hours-per-week', 'capital_income']
categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']

In [23]:
# Put all the numerical column values into a one vector
numerical_vector_assembler = VectorAssembler(inputCols=numerical_features, outputCol='numerical_features_vector')
df = numerical_vector_assembler.transform(df)

In [24]:
df.show(3)

+---+---+---------+-----------+--------------+-----------------+------------+-----+------+------------+------------+--------------+--------------+------+--------------+-------------------------+
| id|age|workclass|  education|marital-status|       occupation|relationship| race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|capital_income|numerical_features_vector|
+---+---+---------+-----------+--------------+-----------------+------------+-----+------+------------+------------+--------------+--------------+------+--------------+-------------------------+
|  0| 25|  Private|     School|        Single|Machine-op-inspct|   Own-child|Black|  Male|           0|           0|            40| United-States|     0|             0|     (5,[0,3],[25.0,40...|
|  1| 38|  Private|High School|       Married|  Farming-fishing|     Husband|White|  Male|           0|           0|            50| United-States|     0|             0|     (5,[0,3],[38.0,50...|
|  2| 28|Local-gov| Assoc

In [25]:
df.select('numerical_features_vector').take(5)

[Row(numerical_features_vector=SparseVector(5, {0: 25.0, 3: 40.0})),
 Row(numerical_features_vector=SparseVector(5, {0: 38.0, 3: 50.0})),
 Row(numerical_features_vector=SparseVector(5, {0: 28.0, 3: 40.0})),
 Row(numerical_features_vector=DenseVector([44.0, 7688.0, 0.0, 40.0, 7688.0])),
 Row(numerical_features_vector=SparseVector(5, {0: 34.0, 3: 30.0}))]

In [26]:
# Scaling numerical features
scaler = StandardScaler(inputCol='numerical_features_vector',
                        outputCol='scaled_numerical_feature_vector',withStd=True,withMean=True)
scaler_model = scaler.fit(df)
df = scaler_model.transform(df)

In [27]:
df.select('scaled_numerical_feature_vector').take(5)

[Row(scaled_numerical_feature_vector=DenseVector([-1.025, -0.1467, -0.2188, -0.0781, -0.1345])),
 Row(scaled_numerical_feature_vector=DenseVector([-0.0415, -0.1467, -0.2188, 0.7547, -0.1345])),
 Row(scaled_numerical_feature_vector=DenseVector([-0.798, -0.1467, -0.2188, -0.0781, -0.1345])),
 Row(scaled_numerical_feature_vector=DenseVector([0.4125, 0.8775, -0.2188, -0.0781, 0.8864])),
 Row(scaled_numerical_feature_vector=DenseVector([-0.3441, -0.1467, -0.2188, -0.9109, -0.1345]))]

In [28]:
 # Encoding categorical columns
indexer = StringIndexer(inputCols=categorical_features,
                        outputCols=[f'{col}_indexed' for col in categorical_features])
df = indexer.fit(df).transform(df)
df.show()

+---+---+----------------+----------------+--------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+--------------+-------------------------+-------------------------------+-----------------+-----------------+----------------------+------------------+--------------------+------------+--------------+----------------------+
| id|age|       workclass|       education|marital-status|       occupation| relationship| race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|capital_income|numerical_features_vector|scaled_numerical_feature_vector|workclass_indexed|education_indexed|marital-status_indexed|occupation_indexed|relationship_indexed|race_indexed|gender_indexed|native-country_indexed|
+---+---+----------------+----------------+--------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+--------------+-------------------------+-------------

In [29]:
# After encoding categorical columns, add those encoded column values to one last final feature vector
feature_assembler = VectorAssembler(inputCols=['workclass_indexed','education_indexed','marital-status_indexed','occupation_indexed','relationship_indexed','race_indexed','gender_indexed','native-country_indexed','scaled_numerical_feature_vector'],
                                    outputCol='final_feature_vector')
df = feature_assembler.transform(df)

In [30]:
df.select('final_feature_vector').take(5)

[Row(final_feature_vector=DenseVector([0.0, 2.0, 0.0, 6.0, 2.0, 1.0, 0.0, 0.0, -1.025, -0.1467, -0.2188, -0.0781, -0.1345])),
 Row(final_feature_vector=SparseVector(13, {2: 1.0, 3: 9.0, 8: -0.0415, 9: -0.1467, 10: -0.2188, 11: 0.7547, 12: -0.1345})),
 Row(final_feature_vector=DenseVector([2.0, 3.0, 1.0, 11.0, 0.0, 0.0, 0.0, 0.0, -0.798, -0.1467, -0.2188, -0.0781, -0.1345])),
 Row(final_feature_vector=DenseVector([0.0, 0.0, 1.0, 6.0, 0.0, 1.0, 0.0, 0.0, 0.4125, 0.8775, -0.2188, -0.0781, 0.8864])),
 Row(final_feature_vector=DenseVector([0.0, 2.0, 0.0, 5.0, 1.0, 0.0, 0.0, 0.0, -0.3441, -0.1467, -0.2188, -0.9109, -0.1345]))]

In [31]:
# We need only final feature vector and output column
finalized_data = df.select('final_feature_vector','income')

In [32]:
finalized_data.show()

+--------------------+------+
|final_feature_vector|income|
+--------------------+------+
|[0.0,2.0,0.0,6.0,...|     0|
|(13,[2,3,8,9,10,1...|     0|
|[2.0,3.0,1.0,11.0...|     1|
|[0.0,0.0,1.0,6.0,...|     1|
|[0.0,2.0,0.0,5.0,...|     0|
|[1.0,1.0,1.0,1.0,...|     1|
|[0.0,0.0,0.0,5.0,...|     0|
|(13,[1,2,8,9,10,1...|     0|
|(13,[2,3,8,9,10,1...|     1|
|[5.0,1.0,1.0,3.0,...|     0|
|[0.0,0.0,0.0,3.0,...|     0|
|(13,[2,3,8,9,10,1...|     1|
|[0.0,1.0,1.0,2.0,...|     1|
|[3.0,0.0,0.0,5.0,...|     0|
|[0.0,0.0,1.0,3.0,...|     0|
|[0.0,0.0,0.0,6.0,...|     0|
|[0.0,1.0,1.0,10.0...|     1|
|[0.0,0.0,0.0,5.0,...|     0|
|[0.0,1.0,0.0,1.0,...|     0|
|[0.0,1.0,1.0,1.0,...|     0|
+--------------------+------+
only showing top 20 rows



# Model training

In [33]:
# train and test split
train, test = finalized_data.randomSplit([0.8, 0.2],seed=42)

In [34]:
print(f"Train set count: {train.count()}")
print(f"Test set count: {test.count()}")

Train set count: 36229
Test set count: 8993


## 1st model - Random forest

In [35]:
# Define the Random Forest model
rf = RandomForestClassifier(featuresCol='final_feature_vector', labelCol='income', maxBins=45)

# Train the Random Forest model
rf_model = rf.fit(train)

# Make predictions
rf_predictions = rf_model.transform(test)

In [36]:
# Evaluate the model
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol='income',metricName="accuracy")
f1_evaluator = MulticlassClassificationEvaluator(labelCol='income',metricName='f1')
rf_accuracy = accuracy_evaluator.evaluate(rf_predictions)
rf_f1_score = f1_evaluator.evaluate(rf_predictions)

print(f'Random Forest Model Accuracy: {rf_accuracy}')
print(f'Random Forest Model F1 Score: {rf_f1_score}')

Random Forest Model Accuracy: 0.8449905482041588
Random Forest Model F1 Score: 0.8316527173567849


In [37]:
# To view predictions
rf_predictions.show()

+--------------------+------+--------------------+--------------------+----------+
|final_feature_vector|income|       rawPrediction|         probability|prediction|
+--------------------+------+--------------------+--------------------+----------+
|(13,[0,2,8,9,10,1...|     1|[14.6175011604774...|[0.73087505802386...|       0.0|
|(13,[0,2,8,9,10,1...|     0|[14.1748248766682...|[0.70874124383341...|       0.0|
|(13,[0,2,8,9,10,1...|     0|[14.1748248766682...|[0.70874124383341...|       0.0|
|(13,[0,2,8,9,10,1...|     1|[14.4662660484259...|[0.72331330242129...|       0.0|
|(13,[0,2,8,9,10,1...|     0|[14.4662660484259...|[0.72331330242129...|       0.0|
|(13,[0,2,8,9,10,1...|     0|[13.9162029566263...|[0.69581014783131...|       0.0|
|(13,[0,2,8,9,10,1...|     0|[13.7371595649231...|[0.68685797824615...|       0.0|
|(13,[0,2,8,9,10,1...|     0|[13.7371595649231...|[0.68685797824615...|       0.0|
|(13,[0,2,8,9,10,1...|     0|[14.1743309810954...|[0.70871654905477...|       0.0|
|(13

## 2nd model - Logistic regression

In [38]:
# Define the Logistic Regression model
lr = LogisticRegression(featuresCol='final_feature_vector', labelCol='income')

# Train the Logistic Regression model
lr_model = lr.fit(train)

# Make predictions
lr_predictions = lr_model.transform(test)

25/03/28 13:42:15 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


In [39]:
lr_model

LogisticRegressionModel: uid=LogisticRegression_443baeae0cee, numClasses=2, numFeatures=13

In [40]:
# Evaluate the model
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol='income',metricName='accuracy')
f1_evaluator = MulticlassClassificationEvaluator(labelCol='income',metricName='f1')
lr_accuracy = accuracy_evaluator.evaluate(lr_predictions)
lr_f1_score = f1_evaluator.evaluate(lr_predictions)

print(f'Logistic regression Model Accuracy: {lr_accuracy}')
print(f'Logistic regression Model F1 Score: {lr_f1_score}')

Logistic regression Model Accuracy: 0.8143000111197598
Logistic regression Model F1 Score: 0.7970916663241656


## 3rd model - Gradient boosting model

In [41]:
# Define GBTClassifier model
gbt_classifier = GBTClassifier(
    featuresCol="final_feature_vector",  # The column containing your features
    labelCol="income",  # The column containing your target variable
    maxDepth=5, # Maximum depth of each individual tree
    maxIter=20, # Number of trees need to be created
    stepSize=0.1, # Learning Rate
    maxBins=45 # More bins, more detailed feature splitting
)

# Train the Gradient Boosting model
gbt_model = gbt_classifier.fit(train)

# Make predictions on the test set
gbt_predictions = gbt_model.transform(test)

In [42]:
gbt_model

GBTClassificationModel: uid = GBTClassifier_6d3959270c8a, numTrees=20, numClasses=2, numFeatures=13

In [43]:
# Evaluate the model
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol='income',metricName='accuracy')
f1_evaluator = MulticlassClassificationEvaluator(labelCol='income',metricName='f1')
gbt_accuracy = accuracy_evaluator.evaluate(gbt_predictions)
gbt_f1_score = f1_evaluator.evaluate(gbt_predictions)

print(f'Gradient boosting model Accuracy: {gbt_accuracy}')
print(f'Gradient boosting model F1 Score: {gbt_f1_score}')

Gradient boosting model Accuracy: 0.8607806071388858
Gradient boosting model F1 Score: 0.8540239408733326


## 4th model - Linear support vector machine

In [44]:
# Define the SVM model using LinearSVC
svm = LinearSVC(featuresCol='final_feature_vector', labelCol='income')

# Train the SVM model
svm_model = svm.fit(train)

# Make predictions on the test set
svm_predictions = svm_model.transform(test)

In [45]:
svm_model

LinearSVCModel: uid=LinearSVC_6e4cd52666a4, numClasses=2, numFeatures=13

In [46]:
# Evaluate the model
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol='income',metricName='accuracy')
f1_evaluator = MulticlassClassificationEvaluator(labelCol='income',metricName='f1')
svm_accuracy = accuracy_evaluator.evaluate(svm_predictions)
svm_f1_score = f1_evaluator.evaluate(svm_predictions)

print(f'Linear support vector machine model Accuracy: {svm_accuracy}')
print(f'Linear support vector machine F1 Score: {svm_f1_score}')

Linear support vector machine model Accuracy: 0.8036250416990993
Linear support vector machine F1 Score: 0.7681654769276633


# Compare all 4 models

In [47]:
# Collect metrics for all models
results = [
    ("Random Forest", rf_accuracy, rf_f1_score),
    ("Logistic Regression", lr_accuracy, lr_f1_score),
    ("Gradient Boosting", gbt_accuracy, gbt_f1_score),
    ("Linear SVM", svm_accuracy, svm_f1_score)
]

# Create a pandas DataFrame for visualization
metrics_df = pd.DataFrame(results, columns=["Model", "Accuracy", "F1 Score"])

# Format for better readability
metrics_df["Accuracy"] = metrics_df["Accuracy"].map("{:.2%}".format)
metrics_df["F1 Score"] = metrics_df["F1 Score"].map("{:.2%}".format)

# Display
print("\nModel Performance Comparison:")
print(metrics_df.to_string(index=False))


Model Performance Comparison:
              Model Accuracy F1 Score
      Random Forest   84.50%   83.17%
Logistic Regression   81.43%   79.71%
  Gradient Boosting   86.08%   85.40%
         Linear SVM   80.36%   76.82%
