In [None]:
 # Install PySpark into the environment of the running kernel. `%pip` is used
 # instead of `!pip` so the package lands where this notebook's interpreter
 # imports from, rather than wherever the subshell's `pip` points.
 %pip install pyspark

In [23]:
from pyspark.sql import SparkSession

In [24]:
# Obtain the Spark session for this notebook under the Colab application name.
spark = (
    SparkSession.builder
    .appName("PySpark in Google Colab")
    .getOrCreate()
)

In [25]:
from google.colab import files

In [None]:
# Prompt the user to upload a file.
# The result is kept in `uploaded`; the following cell iterates over its keys
# to report the name of each uploaded file.
uploaded = files.upload()

In [None]:
# Report the name of every file received by the upload step.
for uploaded_name in uploaded:
    print(f"Uploaded file: {uploaded_name}")

In [52]:
from pyspark.sql import SparkSession

# Obtain the Spark session for the car-insurance analysis.
spark = (
    SparkSession.builder
    .appName("CarInsurancePrediction")
    .getOrCreate()
)

# Read the car insurance CSV: the first row supplies the column names and the
# column types are inferred from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
final_numeric_car_df = csv_reader.csv("final_numeric_car_df.csv")


In [None]:
import time
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Train a multilayer perceptron on the car insurance data and report training
# time, prediction time, accuracy, AUC, the confusion matrix, sensitivity and
# specificity on a held-out test split.

# Initialize Spark session
spark = (
    SparkSession.builder
    .appName("CarInsurancePrediction")
    .getOrCreate()
)

# Load the car insurance dataset
final_numeric_car_df = spark.read.csv("final_numeric_car_df.csv", header=True, inferSchema=True)

# Set seed for reproducibility
random_seed = 1999

# Name of the binary target column used for training and for every metric
label_col = "Claim_Flag"

# Count the rows once so an empty input fails fast with a clear message
# instead of surfacing later as an obscure training error.
n_rows = final_numeric_car_df.count()
if n_rows == 0:
    raise ValueError("final_numeric_car_df.csv contains no rows.")

# Split roughly 70% for training and 30% for testing.
# The previous sample() + subtract() approach assigned the 70% sample to the
# test set and the remainder to training (the reverse of the intent), and
# subtract() is a distinct set difference, so duplicated rows were silently
# dropped from the training data. randomSplit yields disjoint partitions.
training_car_insurance, test_car_insurance = final_numeric_car_df.randomSplit(
    [0.7, 0.3], seed=random_seed
)

# Define the features vector
feature_cols = ['Kids_Drive', 'Age', 'Kids_Home', 'Year_at_Job', 'Income', 'Single_Parent', 'Home_Val', 'Marital_Status', 'Gender', 'Education', 'Occupation', 'Travel_Time', 'Car_Use', 'Car_Value', 'Time_in_Force', 'Car_Type', 'Old_Claim', 'Claim_Freq', 'License_Revoked', 'Vehicle_Record_Points', 'Car_Age', 'Urbanicity']

# Create the VectorAssembler
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Transform both splits to include the features vector
training_car_insurance = assembler.transform(training_car_insurance)
test_car_insurance = assembler.transform(test_car_insurance)

# Define the layers for the neural network
input_layer_size = len(feature_cols)
output_layer_size = 2  # Assuming binary classification
layers = [input_layer_size, 10, output_layer_size]  # input size, one hidden layer, output size

# Create the MultilayerPerceptronClassifier model.
# The label column is passed explicitly so the training and test frames keep
# the same schema (no one-sided rename of the label column).
nn_model = MultilayerPerceptronClassifier(layers=layers, labelCol=label_col, seed=1234)

# Train the model and measure training time
start_time = time.time()
trained_model = nn_model.fit(training_car_insurance)
training_time = time.time() - start_time
print("Training Time:", training_time)

# Make predictions on the test dataset and measure prediction time.
# transform() is lazy, so an action (count) is required for the timer to
# include the actual scoring; caching lets the metrics below reuse the result
# instead of re-scoring the test set for every evaluation.
start_prediction_time = time.time()
predictions = trained_model.transform(test_car_insurance).cache()
n_predictions = predictions.count()
prediction_time = time.time() - start_prediction_time
print("Prediction Time:", prediction_time)

# Compute Accuracy
evaluator_acc = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol="prediction", metricName="accuracy")
accuracy = evaluator_acc.evaluate(predictions)
print("Accuracy:", accuracy)

# Compute AUC
evaluator_auc = BinaryClassificationEvaluator(labelCol=label_col, rawPredictionCol="rawPrediction", metricName="areaUnderROC")
auc = evaluator_auc.evaluate(predictions)
print("AUC:", auc)

# Compute the confusion matrix in a single aggregation, keyed by
# (actual label, predicted label). Rows with a missing label are ignored.
confusion_counts = {
    (int(row[label_col]), int(row["prediction"])): row["count"]
    for row in predictions.groupBy(label_col, "prediction").count().collect()
    if row[label_col] is not None
}
TP = confusion_counts.get((1, 1), 0)
FP = confusion_counts.get((0, 1), 0)
TN = confusion_counts.get((0, 0), 0)
FN = confusion_counts.get((1, 0), 0)
print("Confusion Matrix (TP, FP, TN, FN):", TP, FP, TN, FN)

# Compute Sensitivity (true positive rate); NaN when the test set has no positives
sensitivity = TP / (TP + FN) if (TP + FN) else float("nan")

# Compute Specificity (true negative rate); NaN when the test set has no negatives
specificity = TN / (TN + FP) if (TN + FP) else float("nan")

print("Sensitivity:", sensitivity)
print("Specificity:", specificity)

# Stop Spark session
spark.stop()




In [None]:
import time
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import SparkSession

# Tune a multilayer perceptron with 3-fold cross-validation over maxIter and
# stepSize, then report the tuning time, the test accuracy of the best model
# and the selected hyperparameters.

# Initialize Spark session
spark = (
    SparkSession.builder
    .appName("Neural Network Evaluation Example")
    .getOrCreate()
)

# Load the car insurance dataset
final_numeric_car_df = spark.read.csv("final_numeric_car_df.csv", header=True, inferSchema=True)

# Check if the label column exists in the dataset
if "Claim_Flag" not in final_numeric_car_df.columns:
    raise ValueError("Label column 'Claim_Flag' not found in the dataset.")

# Every column other than the label is used as a feature
feature_cols = [column_name for column_name in final_numeric_car_df.columns if column_name != "Claim_Flag"]
if not feature_cols:
    raise ValueError("No feature columns found in the dataset.")

# Define the feature assembler
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
final_numeric_car_df = assembler.transform(final_numeric_car_df)

# Split the dataset into training and test sets
(training_data, test_data) = final_numeric_car_df.randomSplit([0.7, 0.3], seed=42)

# Define the neural network classifier.
# solver="gd" is required for the stepSize grid below to have any effect:
# stepSize is only used by the gradient-descent solver, so with the default
# "l-bfgs" solver every stepSize value would train an identical model and the
# reported "best" step size would be meaningless.
nn_classifier = MultilayerPerceptronClassifier(
    layers=[len(feature_cols), 5, 2],
    seed=42,
    labelCol="Claim_Flag",
    solver="gd",
)

# Define the evaluator for accuracy calculation
evaluator = MulticlassClassificationEvaluator(labelCol="Claim_Flag", predictionCol="prediction", metricName="accuracy")

# Define the parameter grid for tuning
paramGrid = (
    ParamGridBuilder()
    .addGrid(nn_classifier.maxIter, [10, 50])
    .addGrid(nn_classifier.stepSize, [0.01, 0.1])
    .build()
)

# Define cross-validation; the explicit seed fixes the fold assignment so the
# selected hyperparameters are reproducible between runs.
crossval = CrossValidator(estimator=nn_classifier,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3,
                          seed=42)

# Train and tune the model
start_time = time.time()
cv_model = crossval.fit(training_data)
end_time = time.time()

# Calculate training time (covers all grid points x folds plus the final refit)
training_time = end_time - start_time
print("Training Time:", training_time)

# Get the best model from cross-validation
best_model = cv_model.bestModel

# Make predictions on the test data
predictions = best_model.transform(test_data)

# Calculate accuracy
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

# Get the best model's parameters
best_max_iter = best_model.getMaxIter()
best_step_size = best_model.getStepSize()
print("Best Max Iteration:", best_max_iter)
print("Best Step Size:", best_step_size)

# Stop Spark session
spark.stop()


In [None]:
import time
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.mllib.evaluation import MulticlassMetrics

# Train a multilayer perceptron on the car insurance data and report training
# time, prediction time, AUC, sensitivity and specificity on a held-out split.

# Initialize Spark session
spark = (
    SparkSession.builder
    .appName("Neural Network Evaluation Example")
    .getOrCreate()
)

# Load the car insurance dataset
final_numeric_car_df = spark.read.csv("final_numeric_car_df.csv", header=True, inferSchema=True)

# Check if the label column exists in the dataset
if "Claim_Flag" not in final_numeric_car_df.columns:
    raise ValueError("Label column 'Claim_Flag' not found in the dataset.")

# Every column other than the label is used as a feature
feature_cols = [column_name for column_name in final_numeric_car_df.columns if column_name != "Claim_Flag"]
if not feature_cols:
    raise ValueError("No feature columns found in the dataset.")

# Define the feature assembler
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
final_numeric_car_df = assembler.transform(final_numeric_car_df)

# Split the dataset into training and test sets
(training_data, test_data) = final_numeric_car_df.randomSplit([0.7, 0.3], seed=42)

# Define the neural network classifier
nn_classifier = MultilayerPerceptronClassifier(layers=[len(feature_cols), 5, 2], seed=42, labelCol="Claim_Flag")

# Train the neural network model
start_time = time.time()
nn_model = nn_classifier.fit(training_data)
end_time = time.time()

# Calculate training time
training_time = end_time - start_time
print("Training Time:", training_time)

# Make predictions on the test data.
# transform() is lazy, so count() is used as an action to make the timer
# include the actual scoring; caching lets the metrics below reuse the scored
# rows instead of recomputing them.
start_prediction_time = time.time()
predictions = nn_model.transform(test_data).cache()
n_predictions = predictions.count()
end_prediction_time = time.time()

# Calculate prediction time
prediction_time = end_prediction_time - start_prediction_time
print("Prediction Time:", prediction_time)

# Calculate AUC
binary_evaluator = BinaryClassificationEvaluator(labelCol="Claim_Flag")
auc = binary_evaluator.evaluate(predictions, {binary_evaluator.metricName: "areaUnderROC"})
print("AUC:", auc)

# Build the confusion-matrix counts, keyed by (actual label, predicted label).
# These were previously referenced without ever being defined, which made the
# cell fail with a NameError. Rows with a missing label are ignored.
confusion_counts = {
    (int(row["Claim_Flag"]), int(row["prediction"])): row["count"]
    for row in predictions.groupBy("Claim_Flag", "prediction").count().collect()
    if row["Claim_Flag"] is not None
}
tp = confusion_counts.get((1, 1), 0)
fp = confusion_counts.get((0, 1), 0)
tn = confusion_counts.get((0, 0), 0)
fn = confusion_counts.get((1, 0), 0)

# Calculate sensitivity and specificity; NaN when the test split contains no
# positives (sensitivity) or no negatives (specificity).
sensitivity = tp / (tp + fn) if (tp + fn) else float("nan")
specificity = tn / (tn + fp) if (tn + fp) else float("nan")
print("Sensitivity:", sensitivity)
print("Specificity:", specificity)

# Stop Spark session
spark.stop()
