In [33]:
!pip install pyspark



In [34]:
!pip install pyarrow ## pandas dataframe



In [35]:
# Import libraries

from pyspark.sql import SparkSession

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

# Mount Google Drive at /content/drive so that files stored on Drive
# (such as the dataset read below) are reachable through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
# Create the SparkSession used by every later cell (or reuse one that already
# exists), with Spark running in local mode inside this machine.
local_builder = SparkSession.builder.master('local')
spark = local_builder.getOrCreate()

In [37]:
# Location of the marketing CSV on the mounted Google Drive.
path = '/content/drive/MyDrive/Marketing-Analytics/Data/MarketingData.csv'

In [38]:
# Load the CSV into a Spark DataFrame: the first row provides the column
# names and Spark infers each column's type from the data.
csv_reader = spark.read
df1 = csv_reader.csv(path, header=True, inferSchema=True)

In [39]:
# Preview the first six rows to check that the file was parsed as expected.
df1.show(n=6)

+-----------+-----------+-----------+-----------+----------------+------------+-------+
|      Fresh|       Milk|    Grocery|     Frozen|Detergents_Paper|Delicatessen|Channel|
+-----------+-----------+-----------+-----------+----------------+------------+-------+
|1258.486572|882.0949184|1264.002044|963.8265613|     1238.718727| 984.0546399|      2|
|1072.083074|932.7786504| 831.671388|725.9034932|     734.7518351| 822.2104209|      2|
|1005.494853|1061.499289| 933.310379|831.0086235|      685.732184| 928.6364361|      2|
|1063.093306|760.7777117|625.3498674|588.6234627|     757.9455575| 811.0418684|      0|
|974.0717208|961.5438527|800.4296881| 806.499716|     804.1985346| 968.5234557|      2|
|905.8346328|868.4050084|675.6687672|956.3530948|     1259.050295| 1102.942172|      1|
+-----------+-----------+-----------+-----------+----------------+------------+-------+
only showing top 6 rows



In [40]:
# Names of the columns fed to the model as predictors.
# NOTE(review): 'Delicatessen' is present in the data but not listed here — confirm this is intentional.
Pred_corr = [
    'Fresh',
    'Milk',
    'Grocery',
    'Frozen',
    'Detergents_Paper',
]

In [41]:
# Build the transformer that packs the predictor columns into one vector
# column; that column is what the classifier later reads as its features.
vector_col = "Predictors"
assembler = VectorAssembler(outputCol=vector_col, inputCols=Pred_corr)

In [42]:
# Apply the assembler to the raw data; the result carries the "Predictors"
# vector column alongside the original columns.
output = assembler.transform(df1)

In [43]:
# Keep only what the model needs: the feature vector (X) and the
# target/response column (Y).
model_columns = ["Predictors", "Channel"]
vector = output.select(*model_columns)

In [44]:
# Preview the modelling frame (20 rows, the same count show() prints by default).
vector.show(n=20)

+--------------------+-------+
|          Predictors|Channel|
+--------------------+-------+
|[1258.486572,882....|      2|
|[1072.083074,932....|      2|
|[1005.494853,1061...|      2|
|[1063.093306,760....|      0|
|[974.0717208,961....|      2|
|[905.8346328,868....|      1|
|[1152.04602,973.0...|      2|
|[725.6510501,1058...|      1|
|[1190.77797,548.7...|      0|
|[1199.191923,1003...|      3|
|[891.8380754,1021...|      1|
|[1222.821566,677....|      3|
|[712.9776237,1132...|      2|
|[1151.591037,907....|      1|
|[1256.165799,840....|      2|
|[726.2007856,1114...|      2|
|[994.0482883,1030...|      2|
|[921.6674254,717....|      0|
|[1295.051531,898....|      2|
|[1082.276965,661....|      0|
+--------------------+-------+
only showing top 20 rows



In [45]:
### Split the data into training and testing sets (weights 0.7 / 0.3, i.e. ~70% for training).
# A fixed seed makes the split reproducible, so the fitted model and the reported
# accuracy do not change from one run of the notebook to the next.
train_data, test_data = vector.randomSplit([0.7, 0.3], seed=42)

In [46]:
from pyspark.ml.classification import RandomForestClassifier

In [47]:
# Random forest classifier: features come from the "Predictors" vector (x),
# the label is the "Channel" column (y).
# NOTE: despite its name, `regressor` holds a *classifier*; the name is kept
# because the fitting cell refers to it.
# A fixed seed makes the trained forest reproducible across runs.
regressor = RandomForestClassifier(featuresCol="Predictors", labelCol="Channel", seed=42)

In [48]:
## Fit (train) the random forest on the training split; predictions are made later with transform()
rf_model = regressor.fit(train_data) ## fit() returns the trained model

In [49]:
## Score the held-out test split with the fitted model
predictions = rf_model.transform(test_data)

# Show the true label next to the predicted class and the class probabilities.
shown_columns = ["Channel", "prediction", "probability"]
rfPredictions = predictions.select(*shown_columns)
rfPredictions.show(n=10)

+-------+----------+--------------------+
|Channel|prediction|         probability|
+-------+----------+--------------------+
|      3|       3.0|[0.04980445643784...|
|      3|       3.0|[0.04980445643784...|
|      3|       3.0|[0.04980445643784...|
|      3|       3.0|[0.04980445643784...|
|      3|       3.0|[0.03107504984803...|
|      3|       3.0|[0.04980445643784...|
|      3|       3.0|[0.10252445978863...|
|      3|       3.0|[0.04980445643784...|
|      3|       3.0|[0.04980445643784...|
|      3|       3.0|[0.04980445643784...|
+-------+----------+--------------------+
only showing top 10 rows



In [50]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [51]:
# Measure classification accuracy on the test-set predictions by comparing
# the "prediction" column with the true "Channel" label.
evaluator = MulticlassClassificationEvaluator(
    labelCol="Channel",
    predictionCol="prediction",
    metricName="accuracy",
)

accuracy = evaluator.evaluate(predictions)

# Report both figures as percentages.
error_rate = 1.0 - accuracy
print("Test Error : {}".format(100*error_rate))
print("Accuracy of the model : {}".format(100*accuracy))

Test Error : 25.702341137123742
Accuracy of the model : 74.29765886287626
