### Objective: 

#### Starting pyspark with spark nlp

In [1]:
#Import packages
from pyspark.sql import SQLContext
from pyspark import SparkContext, SparkConf

sc =SparkContext()
sqlContext = SQLContext(sc)

### Data

Copying the data to Hadoop

### Loading the data

In [2]:
# File location and type
file_location = r'Attachment_1635667433.csv'
file_type = "csv"# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","
df = sqlContext.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

df.count()

68046

### Data Transformation

In [3]:
df.printSchema()

root
 |-- UserName: string (nullable = true)
 |-- ScreenName: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- TweetAt: string (nullable = true)
 |-- OriginalTweet: string (nullable = true)
 |-- Sentiment: string (nullable = true)



In [4]:
# Sample data
df.show(5)

+--------+------------+--------------------+----------+--------------------+---------+
|UserName|  ScreenName|            Location|   TweetAt|       OriginalTweet|Sentiment|
+--------+------------+--------------------+----------+--------------------+---------+
|    3799|       48751|              London|16-03-2020|@MeNyrbie @Phil_G...|  Neutral|
|    3800|       48752|                  UK|16-03-2020|advice Talk to yo...| Positive|
|    3801|       48753|           Vagabonds|16-03-2020|Coronavirus Austr...| Positive|
|    3802|       48754|                null|16-03-2020|My food stock is ...|     null|
|  PLEASE| don't panic| THERE WILL BE EN...|      null|                null|     null|
+--------+------------+--------------------+----------+--------------------+---------+
only showing top 5 rows



In [5]:
# Removing unwanted columns
unwanted_cols = ['UserName', 'ScreenName', 'Location', 'TweetAt']
df1 = df.select([column for column in df.columns if column not in unwanted_cols])
df1.show(5)

+--------------------+---------+
|       OriginalTweet|Sentiment|
+--------------------+---------+
|@MeNyrbie @Phil_G...|  Neutral|
|advice Talk to yo...| Positive|
|Coronavirus Austr...| Positive|
|My food stock is ...|     null|
|                null|     null|
+--------------------+---------+
only showing top 5 rows



In [6]:
# Dropping nulls from the data
df1 = df1.dropna()
df1.count()

28617

In [7]:
# Count of sentiment classes
from pyspark.sql.functions import col
df1.groupBy("Sentiment") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------------------+-----+
|           Sentiment|count|
+--------------------+-----+
|            Positive| 7718|
|            Negative| 6857|
|             Neutral| 5224|
|  Extremely Positive| 4412|
|  Extremely Negative| 3751|
|   social distancing|    5|
|    N. Y. - April 10|    3|
| state governors ...|    2|
|             however|    2|
| supermarket workers|    2|
|        Stay with us|    2|
| but we also need...|    2|
| or click the lin...|    2|
| just ""selfish p...|    2|
|           of course|    2|
| not going to the...|    2|
| ecological collapse|    2|
|        Corona Virus|    2|
|            delivery|    2|
| Big Tech could b...|    1|
+--------------------+-----+
only showing top 20 rows



In [8]:
# Filtering sentiment classes
import pyspark.sql.functions as f
df2 = df1.where(f.col("Sentiment").isin(["Positive", "Negative", "Neutral","Extremely Positive","Extremely Negative"]))
df2.groupBy("Sentiment") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+------------------+-----+
|         Sentiment|count|
+------------------+-----+
|          Positive| 7718|
|          Negative| 6857|
|           Neutral| 5224|
|Extremely Positive| 4412|
|Extremely Negative| 3751|
+------------------+-----+



### Data splitting into training and testing data sets 

In [9]:
# 70% for training, 30% for testing
(trainingData, testData) = df2.randomSplit([0.7, 0.3], seed = 2021)

### Spark NLP model pipleine creation 

In [11]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, OneHotEncoder, StringIndexer, VectorAssembler, IndexToString
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator# convert text column to spark nlp document


regexTokenizer = RegexTokenizer(inputCol="OriginalTweet", outputCol="words", pattern="\\W")
add_stopwords = ["http","https","amp","rt","t","c","the"] 
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)
label_stringIdx = StringIndexer(inputCol = "Sentiment", outputCol = "label")
# Model definiton
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.0)# To convert index(integer) to corresponding class labels
label_to_stringIdx = IndexToString(inputCol="label", outputCol="sentiment_class")

# NLP pipeline definiton
nlp_pipeline = Pipeline(
    stages=[regexTokenizer, 
            stopwordsRemover, 
            countVectors, 
            label_stringIdx,
            lr,
            label_to_stringIdx])

### Model Creation using spark NLP pipeline 

In [12]:
# fit the pipeline on training data
pipeline_model = nlp_pipeline.fit(trainingData)

### Generate predictions on the test data 

In [13]:
# perform predictions on test data
predictions =  pipeline_model.transform(testData)

### Model Performance measurement 

In [14]:
# import evaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % (accuracy))
print("Test Error = %g " % (1.0 - accuracy))

Accuracy = 0.493003
Test Error = 0.506997 


In [15]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % (accuracy))
print("Test Error = %g " % (1.0 - accuracy))

Accuracy = 0.524037
Test Error = 0.475963 


In [16]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedRecall")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % (accuracy))
print("Test Error = %g " % (1.0 - accuracy))

Accuracy = 0.493003
Test Error = 0.506997 


### Conclusion: 