# Text Classification using MLlib

In [1]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('../data/SMSSpamCollection.csv')

In [3]:
data.columns

['category', 'text']

In [4]:
data.show()

+--------+--------------------+
|category|                text|
+--------+--------------------+
|     ham|Go until jurong p...|
|     ham|Ok lar... Joking ...|
|    spam|Free entry in 2 a...|
|     ham|U dun say so earl...|
|     ham|Nah I don't think...|
|    spam|FreeMsg Hey there...|
|     ham|Even my brother i...|
|     ham|As per your reque...|
|    spam|WINNER!! As a val...|
|    spam|Had your mobile 1...|
|     ham|I'm gonna be home...|
|    spam|SIX chances to wi...|
|    spam|URGENT! You have ...|
|     ham|I've been searchi...|
|     ham|I HAVE A DATE ON ...|
|    spam|XXXMobileMovieClu...|
|     ham|Oh k...i'm watchi...|
|     ham|Eh u remember how...|
|     ham|Fine if that¬ís t...|
|    spam|England v Macedon...|
+--------+--------------------+
only showing top 20 rows



In [5]:
data.printSchema()

root
 |-- category: string (nullable = true)
 |-- text: string (nullable = true)



In [6]:
from pyspark.sql.functions import col

# by top 20 categories
data.groupBy("category") \
    .count() \
    .show()

+--------+-----+
|category|count|
+--------+-----+
|     ham| 4827|
|    spam|  747|
+--------+-----+



# Cross checking with Pandas

In [6]:
import pandas as pd

df = pd.read_csv('../data/SMSSpamCollection.csv')

In [7]:
df['category'].value_counts()

ham     4827
spam     747
Name: category, dtype: int64

In [32]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from nltk.corpus import stopwords

# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")

# stop words
add_stopwords = stopwords.words('english')

stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

# bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

In [33]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")

In [34]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])

# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)

In [35]:
dataset.show()

+--------+--------------------+--------------------+--------------------+--------------------+-----+
|category|                text|               words|            filtered|            features|label|
+--------+--------------------+--------------------+--------------------+--------------------+-----+
|     ham|Go until jurong p...|[go, until, juron...|[go, jurong, poin...|(1714,[10,14,35,5...|  0.0|
|     ham|Ok lar... Joking ...|[ok, lar, joking,...|[ok, lar, joking,...|(1714,[0,8,247,36...|  0.0|
|    spam|Free entry in 2 a...|[free, entry, in,...|[free, entry, 2, ...|(1714,[2,9,21,22,...|  1.0|
|     ham|U dun say so earl...|[u, dun, say, so,...|[u, dun, say, ear...|(1714,[0,53,78,79...|  0.0|
|     ham|Nah I don't think...|[nah, i, don, t, ...|[nah, think, goes...|(1714,[49,131,361...|  0.0|
|    spam|FreeMsg Hey there...|[freemsg, hey, th...|[freemsg, hey, da...|(1714,[8,13,19,24...|  1.0|
|     ham|Even my brother i...|[even, my, brothe...|[even, brother, l...|(1714,[13,123,279.

In [36]:
### Randomly split data into training and test sets. set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 3964
Test Dataset Count: 1610


# Logistic Regression using Count Vector Features

In [37]:
# Build the model
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

# Train model with Training Data
lrModel = lr.fit(trainingData)

In [38]:
predictions = lrModel.transform(testData)

predictions.filter(predictions['prediction'] == 0) \
    .select("text","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+--------+------------------------------+-----+----------+
|                          text|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|The last thing i ever wante...|     ham|[0.9936283579085395,0.00637...|  0.0|       0.0|
|He neva grumble but i sad l...|     ham|[0.9903902270119437,0.00960...|  0.0|       0.0|
|Oh... Haha... Den we shld h...|     ham|[0.9893446221699386,0.01065...|  0.0|       0.0|
|Heart is empty without love...|     ham|[0.9877406605014499,0.01225...|  0.0|       0.0|
|Havent planning to buy late...|     ham|[0.9872145503164064,0.01278...|  0.0|       0.0|
|Wen ur lovable bcums angry ...|     ham|[0.9870260688139731,0.01297...|  0.0|       0.0|
|THING R GOOD THANX GOT EXAM...|     ham|[0.9868148796206359,0.01318...|  0.0|       0.0|
|Sounds like there could be ...|     ham|[0.9861182389194111,0.01388...|  0.0|       0.0|
|Although 

In [39]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.9645509258772988

# Logistic Regression using TF-IDF Features

In [40]:
from pyspark.ml.feature import HashingTF, IDF

# Add HashingTF and IDF to transformation
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms

# Redo Pipeline
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])

In [41]:
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)

### Randomly split data into training and test sets. set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)

# Build the model
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

# Train model with Training Data
lrModel = lr.fit(trainingData)

In [42]:
predictions = lrModel.transform(testData)

predictions.filter(predictions['prediction'] == 0) \
    .select("text","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+--------+------------------------------+-----+----------+
|                          text|category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|THING R GOOD THANX GOT EXAM...|     ham|[0.9894476412953456,0.01055...|  0.0|       0.0|
|He neva grumble but i sad l...|     ham|[0.9887095930738091,0.01129...|  0.0|       0.0|
|Although i told u dat i'm i...|     ham|[0.9877546446366153,0.01224...|  0.0|       0.0|
|Wen ur lovable bcums angry ...|     ham|[0.9873871799731352,0.01261...|  0.0|       0.0|
|"And that is the problem. Y...|     ham|[0.9870440089341924,0.01295...|  0.0|       0.0|
|U say leh... Of course noth...|     ham|[0.9870247728278815,0.01297...|  0.0|       0.0|
|Honeybee Said: *I'm d Sweet...|     ham|[0.9865047862576719,0.01349...|  0.0|       0.0|
|Cos i was out shopping wif ...|     ham|[0.986161453417046,0.013838...|  0.0|       0.0|
|Hi neva w

In [43]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.9645509258772988

# Custom text

#### Example of how to create a spark dataframe

```python
from pyspark.sql import Row
l = [('Ankit',25),('Jalfaizy',22),('saurabh',20),('Bala',26)]
rdd = sc.parallelize(l)
people = rdd.map(lambda x: Row(name=x[0], age=int(x[1])))
schemaPeople = sqlContext.createDataFrame(people)

print(type(schemaPeople))
#  pyspark.sql.dataframe.DataFrame
```


In [97]:
from pyspark.sql import Row
ll = [('Hurry up! Answer simple questions and WINNER will get $900 prize reward! To claim call us. Valid 12 hours only.'),('Hey, How are you? Long time no see')]
rdds = sc.parallelize(ll)
tx = rdds.map(lambda x: Row(text=x))
schematxt = sqlContext.createDataFrame(tx)

In [98]:
schematxt.show()

+--------------------+
|                text|
+--------------------+
|Hurry up! Answer ...|
|Hey, How are you?...|
+--------------------+



## Calculating features for test sample data

In [99]:
test_new_dataset = pipelineFit.transform(schematxt)
test_new_dataset.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|               words|            filtered|         rawFeatures|            features|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Hurry up! Answer ...|[hurry, up, answe...|[hurry, answer, s...|(10000,[1,721,727...|(10000,[1,721,727...|
|Hey, How are you?...|[hey, how, are, y...|[hey, long, time,...|(10000,[7515,8157...|(10000,[7515,8157...|
+--------------------+--------------------+--------------------+--------------------+--------------------+



## Predicting on calculated features of test sample data

In [100]:
test_pred = lrModel.transform(test_new_dataset)
# test_pred.filter(test_pred['prediction'] == 0) \
#     .select("text","probability","prediction") \
#     .orderBy("probability", ascending=False)

In [101]:
test_pred.select("text","probability","prediction").show()

+--------------------+--------------------+----------+
|                text|         probability|prediction|
+--------------------+--------------------+----------+
|Hurry up! Answer ...|[0.40530136808716...|       1.0|
|Hey, How are you?...|[0.96833775344934...|       0.0|
+--------------------+--------------------+----------+



# Exercise

# Build Model using Naive Bayes algorithm

### and predict on sample text

In [102]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
predictions = model.transform(testData)

In [103]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.9731847068395234

## prediction on sample dataset

In [104]:
test_pred.select("text","probability","prediction").show()

+--------------------+--------------------+----------+
|                text|         probability|prediction|
+--------------------+--------------------+----------+
|Hurry up! Answer ...|[0.40530136808716...|       1.0|
|Hey, How are you?...|[0.96833775344934...|       0.0|
+--------------------+--------------------+----------+



As you can see Naive Bayes performed well and it's able to identify message as spam! Congrats!

### Also you could try Random Forest. See how well it's performing?

# Cross Validation

In [20]:
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])

pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)

# Build the model
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)
#            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
#            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)

# Run cross validations
cvModel = cv.fit(trainingData)
# this will likely take a fair amount of time because of the amount of models that we're creating and testing

# Use test set here so we can measure the accuracy of our model on new data
predictions = cvModel.transform(testData)

# cvModel uses the best model found from the Cross Validation
# Evaluate best model
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.9749834606590078