# Basic Text classification using PySpark ML Pipeline
1. Imports
2. Load Data<br>&emsp;2.1 Split data<br>
3. Test Pipeline components <br>&emsp;3.1 Tokenize <br>&emsp;3.2 Count Vectorizer <br>&emsp;3.3 Logistic Regression <br>
4. Testing Pipeline<br>
5. Pipeline + Cross-validation + Tuning  **<======== Final pipeline**<br>&emsp;5.1 Define pipeline <br>&emsp;5.2 Define Hyper-parameter grid <br>&emsp;5.3 Define Cross validation params <br>&emsp;5.4 Get best CV model <br>&emsp;5.5 Predictions from best CV model <br>&emsp;5.6 Compute Eval metrics

## 1. Imports

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import CountVectorizer, Tokenizer, StringIndexer
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark import pandas as ps
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from sklearn.metrics import classification_report

In [None]:
# Obtain the SparkSession used by every cell below (getOrCreate reuses an
# already-running session when there is one).
spark = SparkSession.builder.getOrCreate()

## 2. Load Data

In [None]:
# Load the e-commerce dataset from DBFS as a Spark DataFrame, using the first
# CSV row as the column names.
# NOTE: replace {user-directory} in the path with your own upload folder.
csv_path = "dbfs:/FileStore/shared_uploads/{user-directory}/ecommerceDataset_v2.csv"
csv_reader = spark.read.format("csv").option("header", "true")
data_sparkdf = df1 = csv_reader.load(csv_path)

In [None]:
# Print the column names and types of the loaded DataFrame.
data_sparkdf.printSchema()

In [None]:
# Preview the first two rows.
data_sparkdf.show(2)

In [None]:
# Display the (column name, type) pairs of the DataFrame.
data_sparkdf.dtypes

In [None]:
# Collect the distinct values of the prod_category column to the driver.
data_sparkdf.select('prod_category').distinct().collect()

In [None]:
# Keep only rows that belong to one of the four target categories AND have a
# non-null text description to classify.
kept_categories = ["Household", "Books", "Clothing & Accessories", "Electronics"]
has_kept_category = data_sparkdf.prod_category.isin(kept_categories)
has_description = data_sparkdf.prod_title_desc.isNotNull()
data_sparkdf = data_sparkdf.filter(has_kept_category & has_description)

In [None]:
# Re-list the distinct prod_category values of the current data_sparkdf.
data_sparkdf.select('prod_category').distinct().collect()

### 2.1 Split data

In [None]:
### Randomly partition the rows into Train & Test with weights [90%,10%];
### the fixed seed makes the split repeatable.
split_weights = [0.9, 0.1]
train_sparkdf, test_sparkdf = data_sparkdf.randomSplit(split_weights, seed=123)

In [None]:
# Report the training split's shape as (row count, column count).
train_row_count = train_sparkdf.count()
train_col_count = len(train_sparkdf.columns)
print((train_row_count, train_col_count))

## 3. Test Pipeline components

### 3.1 Tokenize

In [None]:
### Tokenize: turn the prod_title_desc text into a "words" token column
tokenizer = Tokenizer(
    inputCol="prod_title_desc",
    outputCol="words",
)
data_token_sparkdf = tokenizer.transform(data_sparkdf)

In [None]:
# Preview the first two rows, now including the "words" column.
data_token_sparkdf.show(2)

In [None]:
# Display the (column name, type) pairs after tokenization.
data_token_sparkdf.dtypes

### 3.2 Count Vectorizer

In [None]:
# Bag-of-words encoder: learns a vocabulary from the "words" column and writes
# term-count vectors to "features". minDF and vocabSize bound the vocabulary.
countVect = CountVectorizer(
    inputCol="words",
    outputCol="features",
    minDF=2.0,
    vocabSize=10000,
)
cv_model = countVect.fit(data_token_sparkdf)

In [None]:
# Apply the fitted CountVectorizer model to add the "features" column.
data_cv_sparkdf = cv_model.transform(data_token_sparkdf)

### 3.3 Logistic Regression

In [None]:
### Convert the string prod_category column into a numerical 'label' variable
label_encoder = StringIndexer(inputCol='prod_category', outputCol='label')
indexer = label_encoder.fit(data_cv_sparkdf)
data_label_sparkdf = indexer.transform(data_cv_sparkdf)

In [None]:
# Collect the distinct numeric label values produced by the indexer.
data_label_sparkdf.select('label').distinct().collect()

In [None]:
# Fit a logistic regression on the vectorized, labelled data.
# fit() returns a new fitted model and leaves the estimator `lr` untouched,
# so the result must be captured; displaying `lr` would only show the
# unfitted estimator.
lr = LogisticRegression(maxIter=10, regParam=0.001)
lr_model = lr.fit(data_label_sparkdf)
lr_model

## 4. Testing Pipeline

In [None]:
### Stage 1 - tokenize the raw text
tokenizer = Tokenizer(
    inputCol="prod_title_desc",
    outputCol="words",
)
### Stage 2 - Count Vectorizer (bag of words)
countVect = CountVectorizer(
    inputCol="words",
    outputCol="features",
    minDF=2.0,
    vocabSize=10000,
)
### Stage 3 - encode the category column as a numeric label
indexer = StringIndexer(inputCol='prod_category', outputCol='label')
### Stage 4 - logistic regression classifier
lr = LogisticRegression(maxIter=10, regParam=0.001)

### Chain the stages, in execution order, into a single pipeline
pipeline_stages = [tokenizer, countVect, indexer, lr]
pipeline = Pipeline(stages=pipeline_stages)


In [None]:
# Fit every pipeline stage on the training split only; the test split is
# held back for evaluation.
model = pipeline.fit(train_sparkdf)

In [None]:
# Score the held-out test split and keep only the columns needed for review.
output_columns = ["features", "prod_category", "label", "prediction", "probability"]
scored_test_sparkdf = model.transform(test_sparkdf)
result_sparkdf = scored_test_sparkdf.select(*output_columns)
result_sparkdf.show(10)
# Bring the predictions to the driver as a pandas DataFrame.
result_df = result_sparkdf.toPandas()

In [None]:
# Print scikit-learn's classification report for true vs. predicted labels.
true_labels = result_df.label
predicted_labels = result_df.prediction
print(classification_report(true_labels, predicted_labels))
# Uncomment to inspect the raw predictions:
# result_df

## 5. Pipeline + Cross-validation + Tuning

### 5.1 Define Pipeline

In [None]:
### Stage 1 - tokenize the raw text
tokenizer = Tokenizer(
    inputCol="prod_title_desc",
    outputCol="words",
)
### Stage 2 - Count Vectorizer (bag of words); minDF / vocabSize are not set here
countVect = CountVectorizer(
    inputCol="words",
    outputCol="features",
)
### Stage 3 - encode the category column as a numeric label
indexer = StringIndexer(inputCol='prod_category', outputCol='label')
### Stage 4 - logistic regression classifier; regParam is not set here
lr = LogisticRegression(maxIter=10)

### Chain the stages, in execution order, into a single pipeline
pipeline_stages = [tokenizer, countVect, indexer, lr]
pipeline = Pipeline(stages=pipeline_stages)


### 5.2 Define Hyper-parameter grid

In [None]:
### Hyper-parameter search grid: 2 x 2 x 2 = 8 candidate combinations
paramGrid = (
    ParamGridBuilder()
    .addGrid(countVect.minDF, [2.0, 5.0])
    .addGrid(countVect.vocabSize, [10000, 20000])
    .addGrid(lr.regParam, [0.1, 0.01])
    .build()
)

### 5.3 Define Cross validation params

In [None]:
### Cross validation
### 5-fold CV over the whole pipeline and the parameter grid.
### - metricName is spelled out (f1 is already the evaluator's default) so the
###   model-selection criterion is visible.
### - seed pins the fold assignment so the search is repeatable, in line with
###   the seeded train/test split.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(metricName="f1"),
                          numFolds=5,
                          seed=123)

### 5.4 Get best CV model

In [None]:
### Run cross-validation on the training split and choose the best set of
### parameters according to the evaluator configured on crossval.
cvModel = crossval.fit(train_sparkdf)

### 5.5 Predictions from best CV model

In [None]:
### Score the test documents with cvModel (which uses the best model found)
### and keep only the columns needed for review.
output_columns = ["features", "prod_category", "label", "prediction", "probability"]
scored_test_sparkdf = cvModel.transform(test_sparkdf)
prediction = scored_test_sparkdf.select(*output_columns)
prediction.show(10)

### 5.6 Compute Eval metrics

In [None]:
### Bring the predictions to the driver as a pandas DataFrame, then print
### scikit-learn's classification report for true vs. predicted labels.
result_df = prediction.toPandas()
true_labels = result_df.label
predicted_labels = result_df.prediction
print(classification_report(true_labels, predicted_labels))