# SparkML Models Notebook

Responsibility: Feras Elkharrat - 7000454

## Imports

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, DecisionTreeClassifier
from pyspark.ml import Pipeline
from pyspark.sql.types import DoubleType
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## Spark Session Initialization

In [None]:
# Create (or reuse) the Spark session used by every cell below.
# Driver and executor memory are both requested as 24g.
spark = (
    SparkSession.builder
    .appName("ProjectSuccessPrediction")
    .config("spark.driver.memory", "24g")
    .config("spark.executor.memory", "24g")
    .getOrCreate()
)

## Loading the Data Using PySpark and General Preparation

In [None]:
# Columns removed right after loading; they are not used by the models.
columns_to_drop = ["category", "currency", "goal", "deadline", "launched", "pledged", "backers"]

# Build the modelling frame in one pass:
#   1. read the CSV (first row is the header, column types are inferred),
#   2. remove the unused columns,
#   3. derive a binary `label`: 1 when state == "successful", 0 for any other state,
#   4. discard rows whose state is 'live' (per the hint in the PDF),
#   5. remove `state`, which is redundant once `label` exists.
data = (
    spark.read.csv("/kick_starter_cleaned.csv", header=True, inferSchema=True)
    .drop(*columns_to_drop)
    .withColumn("label", when(col("state") == "successful", 1).otherwise(0))
    .filter(col("state") != "live")
    .drop("state")
)

## Indexing and One-Hot Encoding Categorical Columns

In [None]:
# Categorical columns that are string-indexed and then one-hot encoded.
# NOTE(review): `name` is presumably a near-unique project title; one-hot
# encoding it may produce a very wide feature vector - confirm it belongs here.
categorical_columns = ["name", "main_category", "country"]

# One StringIndexer and one OneHotEncoder per categorical column.
# The loop variable is deliberately NOT called `col`: a module-level
# `for col in ...` loop would rebind the imported pyspark.sql.functions.col
# to a string and break every later (or re-run) `col(...)` call.
# handleInvalid="skip" filters out rows with a NULL category instead of
# raising; with the default ("error") a NULL makes the indexer fail, and the
# notebook's null-dropping step only runs after this cell.
indexers = [
    StringIndexer(
        inputCol=column_name,
        outputCol=f"{column_name}_indexed",
        handleInvalid="skip",
    )
    for column_name in categorical_columns
]
encoders = [
    OneHotEncoder(inputCol=f"{column_name}_indexed", outputCol=f"{column_name}_encoded")
    for column_name in categorical_columns
]

# Fit all indexers first, then all encoders, and apply them to the data.
pipeline = Pipeline(stages=indexers + encoders)
data = pipeline.fit(data).transform(data)

# Keep only the encoded columns: drop the raw and the intermediate indexed
# columns in a single call.
indexed_columns = [f"{column_name}_indexed" for column_name in categorical_columns]
data = data.drop(*categorical_columns, *indexed_columns)

## Fixing Numerical Columns (Conversion to Double)

In [None]:
# Cast both USD amount columns to double, replacing each column in place.
for numeric_column in ("usd_pledged_real", "usd_goal_real"):
    data = data.withColumn(numeric_column, data[numeric_column].cast(DoubleType()))

## Dropping Null Values

In [None]:
# Remove every row that contains a null in any column.
data = data.na.drop(how="any")

## Defining Feature Columns and Assembling the Feature Vector (Label Excluded)

In [None]:
# Every remaining column except the target becomes a model input.
# NOTE(review): this includes usd_pledged_real, which presumably reflects the
# campaign outcome - confirm it does not leak the label into the features.
feature_columns = [name for name in data.columns if name != "label"]

# Combine the input columns into a single vector column named "features".
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
)
data = assembler.transform(data)

## Selecting the Relevant Columns for Modeling

In [None]:
# Keep only the assembled feature vector and the target column.
data = data.select(["features", "label"])

## Splitting Data into Training and Testing Sets

In [None]:
# 80/20 train/test split with a fixed seed.
train_data, test_data = data.randomSplit(weights=[0.8, 0.2], seed=1234)

## Model 1: Logistic Regression

In [None]:
# Logistic regression: fit on the training split, then score the test split.
lr = LogisticRegression(
    labelCol="label",
    featuresCol="features",
)
lr_model = lr.fit(train_data)
lr_predictions = lr_model.transform(test_data)

## Model 2: Decision Tree Classifier

In [None]:
# Decision tree: fit on the training split, then score the test split.
dt = DecisionTreeClassifier(
    labelCol="label",
    featuresCol="features",
)
dt_model = dt.fit(train_data)
dt_predictions = dt_model.transform(test_data)

## Model 3: Random Forest Classifier

In [None]:
# Random forest: fit on the training split, then score the test split.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
)
rf_model = rf.fit(train_data)
rf_predictions = rf_model.transform(test_data)

## Initialising an Evaluator and Setting Accuracy as the Main Evaluation Metric

In [None]:
# Shared evaluator: compares the "prediction" column against "label" and
# reports accuracy.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)

## Calculating Accuracies for the 3 Models Above

In [None]:
# Score each model's test-set predictions with the shared evaluator, in the
# order: logistic regression, decision tree, random forest.
lr_accuracy, dt_accuracy, rf_accuracy = [
    evaluator.evaluate(predictions)
    for predictions in (lr_predictions, dt_predictions, rf_predictions)
]

# Report the three accuracies.
print(f"Logistic Regression Accuracy: {lr_accuracy}")
print(f"Decision Tree Accuracy: {dt_accuracy}")
print(f"Random Forest Accuracy: {rf_accuracy}")

# As the output above shows, the decision tree model achieved the highest accuracy; the logistic regression and random forest models reached 63.8% and 64.4%, respectively.