I use Python and Spark to work on this dataset.
Link to the dataset: https://www.kaggle.com/datasets/humairmunir/lung-cancer-risk-dataset?resource=download

---------------------------------------------

- Initialize the SparkContext

In [None]:
import pyspark

from pyspark import SparkContext
from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession first and take the SparkContext from it.
# Constructing SparkContext(master='local') directly raises
# "Cannot run multiple SparkContexts at once" whenever this cell is re-run
# or a context already exists; getOrCreate() makes the cell safe to re-run.
# NOTE(review): "spark.some.config.option" looks like the placeholder from
# the Spark quick-start example and nothing visible here reads it -- confirm
# and remove if unused.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Lung Cancer Prediction")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Keep the `sc` name available for any cell that uses the context directly.
sc = spark.sparkContext


- Load and read dataset

In [None]:
# Read the CSV with its header row as column names and let Spark infer
# the column types, then preview the first five rows.
df = spark.read.csv(
    'lungcancer.csv',
    header=True,
    inferSchema=True,
)
df.show(n=5)

- We will use LUNG_CANCER column as target variable in model prediction.
- The remaining columns will be used as features in model prediction.

- Feature Engineering: In this section, I go through the following tasks.
    - Define the numerical/categorical columns. Since the label column is also categorical, we include it in the categorical columns so that it is indexed in the next task.
    - StringIndexing all categorical columns.
    - Remove the label column from the categorical column list so that it is not one-hot encoded.
    - OneHotEncoding all indexed feature columns.
    - Remove unnecessary columns and re-arrange the dataframe.
    - VectorAssembling to combine all features into the FEATURES column.


In [None]:
# Split the columns into the single numerical column and the categorical
# ones. The label column stays in the categorical list at this point.
numerical_cols = 'AGE'
categorical_cols = list(df.columns)
categorical_cols.remove(numerical_cols)  # everything except AGE is categorical

# Report the resulting split.
print("Numerical column: " + numerical_cols)
print("Categorical columns: ")
print(", ".join(categorical_cols))

In [None]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

In [None]:
# StringIndexing: one StringIndexer per categorical column, each writing
# its result to a new "indexed_<column>" column.
indexers = [
    StringIndexer(inputCol=name, outputCol="indexed_" + name)
    for name in categorical_cols
]
pipeline = Pipeline(stages=indexers)

# Fit the indexers on the raw data, then apply them to the same data.
indexer_model = pipeline.fit(df)
indexed_df = indexer_model.transform(df)
indexed_df.show(5)

In [None]:
# Remove the target variable so it is not one-hot encoded with the features.
# Guarded so that re-running this cell does not raise ValueError once the
# label has already been removed.
if 'LUNG_CANCER' in categorical_cols:
    categorical_cols.remove('LUNG_CANCER')

In [None]:
# OneHotEncoding
encoders = [OneHotEncoder(inputCol="indexed_"+column, outputCol="encoded_"+column) for column in categorical_cols]
pipeline = Pipeline(stages=encoders)

# fit and transform the pipeline
encoded_df = pipeline.fit(indexed_df).transform(indexed_df)
encoded_df.show(5)

In [None]:
# Collect the names of the intermediate "indexed_*" feature columns.
import re

# Matches column names that start with the "indexed_" prefix.
pattern = re.compile(r'^indexed_')

# Keep every matching column, then take the indexed label back out of the
# list: it is still needed later as the prediction target.
indexed_columns = list(filter(pattern.match, indexed_df.columns))
indexed_columns.remove('indexed_LUNG_CANCER')

- Re-structure dataframe

In [None]:
# Drop unnecessary columns: the raw categorical feature columns, their
# intermediate "indexed_*" versions, and the raw label column. AGE, the
# "encoded_*" columns, and the indexed label remain.
encoded_df = encoded_df.drop(*categorical_cols).drop(*indexed_columns).drop('LUNG_CANCER')

# Re-arrange dataframe: AGE first, the encoded features next, the indexed
# label last. The selection is assigned back to encoded_df; previously the
# result was only shown and then discarded, so the dataframe was never
# actually re-arranged.
ordered_columns = [
    'AGE',
    'encoded_GENDER',
    'encoded_SMOKING',
    'encoded_YELLOW_FINGERS',
    'encoded_ANXIETY',
    'encoded_PEER_PRESSURE',
    'encoded_CHRONIC_DISEASE',
    'encoded_FATIGUE',
    'encoded_ALLERGY',
    'encoded_WHEEZING',
    'encoded_ALCOHOL_CONSUMING',
    'encoded_COUGHING',
    'encoded_SHORTNESS_OF_BREATH',
    'encoded_SWALLOWING_DIFFICULTY',
    'encoded_CHEST_PAIN',
    'indexed_LUNG_CANCER',
]
encoded_df = encoded_df.select(*ordered_columns)
encoded_df.show(5)

In [None]:
# Collect the names of the one-hot "encoded_*" feature columns.
import re

# Matches column names that start with the "encoded_" prefix.
pattern = re.compile(r'^encoded_')

# Keep every column of encoded_df whose name matches the pattern.
encoded_columns = list(filter(pattern.match, encoded_df.columns))

In [None]:
# Vector assembling
# Feature list: the numerical AGE column followed by every encoded column.
features = ['AGE', *encoded_columns]

# Combine all feature columns into a single FEATURES vector column.
assembler = VectorAssembler(inputCols=features, outputCol="FEATURES")
assembled_df = assembler.transform(encoded_df)

# Keep only the feature vector and the indexed label, renamed to LABEL.
df2 = assembled_df.withColumnRenamed('indexed_LUNG_CANCER', 'LABEL').select('FEATURES', 'LABEL')
df2.show(5)

- Train/test split

In [None]:
# 80/20 random split into training and test sets, with a fixed seed.
train_data, test_data = df2.randomSplit(weights=[0.8, 0.2], seed=100)

- Initialize models

In [None]:
from pyspark.ml.classification import (
    DecisionTreeClassifier,
    GBTClassifier,
    LogisticRegression,
    RandomForestClassifier,
)

# Every model reads the assembled FEATURES vector and the LABEL column.
column_args = {'featuresCol': 'FEATURES', 'labelCol': 'LABEL'}

dt = DecisionTreeClassifier(**column_args)
rf = RandomForestClassifier(**column_args)
lr = LogisticRegression(**column_args)
gbt = GBTClassifier(maxIter=100, **column_args)

- Initialize the evaluator

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# One accuracy evaluator shared by all models: it compares the "prediction"
# column against the LABEL column.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="LABEL",
    predictionCol="prediction",
)


- Param grids

In [None]:
# ParamGridBuilder (used here) and CrossValidator (used by the
# cross-validation cells that follow) were never imported, so those cells
# failed with NameError. Import both tuning classes once, before first use.
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Decision tree: 4 candidate depths.
dt_param_grid = (
    ParamGridBuilder()
    .addGrid(dt.maxDepth, [2, 3, 4, 5])
    .build()
)

# Random forest: 3 tree counts x 3 depths = 9 combinations.
rf_param_grid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 20, 50])
    .addGrid(rf.maxDepth, [5, 10, 15])
    .build()
)

# Logistic regression: 3 regularization strengths x 3 elastic-net mixing
# values = 9 combinations.
lr_param_grid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.01, 0.1, 1.0])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# Gradient-boosted trees: 3 candidate depths.
gbt_param_grid = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [2, 5, 10])
    .build()
)


- Decision Tree Cross-Validation with Accuracy Score

In [None]:
# 6-fold cross-validation of the decision tree over its parameter grid,
# fitted on the training split only.
dt_cv = CrossValidator(
    estimator=dt,
    estimatorParamMaps=dt_param_grid,
    evaluator=evaluator,
    numFolds=6,
)
dt_cv_model = dt_cv.fit(train_data)

# Score the fitted model on the held-out test split.
dt_predictions = dt_cv_model.transform(test_data)
dt_accuracy = evaluator.evaluate(dt_predictions)
print(f"Decision Tree Test Accuracy: {dt_accuracy}")

- Random Forest Cross-Validation with Accuracy Score

In [None]:
# 6-fold cross-validation of the random forest over its parameter grid,
# fitted on the training split only.
rf_cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=rf_param_grid,
    evaluator=evaluator,
    numFolds=6,
)
rf_cv_model = rf_cv.fit(train_data)

# Score the fitted model on the held-out test split.
rf_predictions = rf_cv_model.transform(test_data)
rf_accuracy = evaluator.evaluate(rf_predictions)
print(f"Random Forest Test Accuracy: {rf_accuracy}")

- Logistic Regression Cross-Validation with Accuracy Score

In [None]:
# 6-fold cross-validation of logistic regression over its parameter grid,
# fitted on the training split only.
lr_cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=lr_param_grid,
    evaluator=evaluator,
    numFolds=6,
)
lr_cv_model = lr_cv.fit(train_data)

# Score the fitted model on the held-out test split.
lr_predictions = lr_cv_model.transform(test_data)
lr_accuracy = evaluator.evaluate(lr_predictions)
print(f"Logistic Regression Test Accuracy: {lr_accuracy}")

- Gradient-Boosted Trees Cross-Validation with Accuracy Score

In [None]:
# 6-fold cross-validation of the gradient-boosted trees over their
# parameter grid, fitted on the training split only.
gbt_cv = CrossValidator(
    estimator=gbt,
    estimatorParamMaps=gbt_param_grid,
    evaluator=evaluator,
    numFolds=6,
)
gbt_cv_model = gbt_cv.fit(train_data)

# Score the fitted model on the held-out test split.
gbt_predictions = gbt_cv_model.transform(test_data)
gbt_accuracy = evaluator.evaluate(gbt_predictions)
print(f"GBT Test Accuracy: {gbt_accuracy}")