In [38]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('CoronavirusNLP').getOrCreate()

In [39]:
df=spark.read.csv('Corona_NLP_train.csv',header=True,inferSchema=True)

In [40]:
df.show(5)

+--------+------------+--------------------+----------+--------------------+---------+
|UserName|  ScreenName|            Location|   TweetAt|       OriginalTweet|Sentiment|
+--------+------------+--------------------+----------+--------------------+---------+
|    3799|       48751|              London|16-03-2020|@MeNyrbie @Phil_G...|  Neutral|
|    3800|       48752|                  UK|16-03-2020|advice Talk to yo...| Positive|
|    3801|       48753|           Vagabonds|16-03-2020|Coronavirus Austr...| Positive|
|    3802|       48754|                null|16-03-2020|My food stock is ...|     null|
|  PLEASE| don't panic| THERE WILL BE EN...|      null|                null|     null|
+--------+------------+--------------------+----------+--------------------+---------+
only showing top 5 rows



In [41]:
df.columns

['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet', 'Sentiment']

In [42]:
print((df.count(),len(df.columns)))

(68046, 6)


In [43]:
#df.shape.show()

# Data Preparation

In [44]:
from pyspark.sql.functions import length

In [45]:
df=df.withColumn('Tweet_length', length(df['OriginalTweet']))

In [46]:
df.show(5)

+--------+------------+--------------------+----------+--------------------+---------+------------+
|UserName|  ScreenName|            Location|   TweetAt|       OriginalTweet|Sentiment|Tweet_length|
+--------+------------+--------------------+----------+--------------------+---------+------------+
|    3799|       48751|              London|16-03-2020|@MeNyrbie @Phil_G...|  Neutral|         111|
|    3800|       48752|                  UK|16-03-2020|advice Talk to yo...| Positive|         237|
|    3801|       48753|           Vagabonds|16-03-2020|Coronavirus Austr...| Positive|         131|
|    3802|       48754|                null|16-03-2020|My food stock is ...|     null|          51|
|  PLEASE| don't panic| THERE WILL BE EN...|      null|                null|     null|        null|
+--------+------------+--------------------+----------+--------------------+---------+------------+
only showing top 5 rows



In [47]:
sentiments=['Positive','Negative','Neutral','Extremely Positive','Extremely Negative']

In [48]:
data=df.filter(df.Sentiment.isin(sentiments))

In [49]:
data.select('Sentiment').distinct().show()

+------------------+
|         Sentiment|
+------------------+
|Extremely Negative|
|           Neutral|
|          Positive|
|          Negative|
|Extremely Positive|
+------------------+



In [50]:
data.select('Sentiment').distinct().count()

5

In [51]:
data.groupby('Sentiment').count().show()

+------------------+-----+
|         Sentiment|count|
+------------------+-----+
|Extremely Negative| 3751|
|           Neutral| 5224|
|          Positive| 7718|
|          Negative| 6857|
|Extremely Positive| 4412|
+------------------+-----+



In [52]:
data.show(5)

+--------+----------+--------------------+----------+--------------------+---------+------------+
|UserName|ScreenName|            Location|   TweetAt|       OriginalTweet|Sentiment|Tweet_length|
+--------+----------+--------------------+----------+--------------------+---------+------------+
|    3799|     48751|              London|16-03-2020|@MeNyrbie @Phil_G...|  Neutral|         111|
|    3800|     48752|                  UK|16-03-2020|advice Talk to yo...| Positive|         237|
|    3801|     48753|           Vagabonds|16-03-2020|Coronavirus Austr...| Positive|         131|
|    3804|     48756|ÜT: 36.319708,-82...|16-03-2020|As news of the re...| Positive|         249|
|    3805|     48757|35.926541,-78.753267|16-03-2020|"Cashier at groce...| Positive|         184|
+--------+----------+--------------------+----------+--------------------+---------+------------+
only showing top 5 rows



In [53]:
print((data.count(),len(data.columns)))

(27962, 7)


In [58]:
from pyspark.sql.functions import isnan,when,count,col
data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in data.columns]
   ).show()

+--------+----------+--------+-------+-------------+---------+------------+
|UserName|ScreenName|Location|TweetAt|OriginalTweet|Sentiment|Tweet_length|
+--------+----------+--------+-------+-------------+---------+------------+
|       0|         0|    6152|      0|            0|        0|           0|
+--------+----------+--------+-------+-------------+---------+------------+



In [23]:
#tempdata=df.select(['UserName', 'Sentiment'])

In [24]:
#tempdata.show(5)

In [25]:
#df.groupby('Sentiment').count().show()

In [59]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer,RegexTokenizer

In [71]:
from pyspark.sql.functions import col,udf
from pyspark.sql.types import IntegerType
#regextokenizer=RegexTokenizer(inputCol="OriginalTweet", outputCol="words",pattern="\\W")
tokenizer=Tokenizer(inputCol="OriginalTweet", outputCol="token_text")
stopremove=StopWordsRemover(inputCol="token_text", outputCol="stop_tokens")
countvec=CountVectorizer(inputCol="stop_tokens", outputCol="c_vec")
idf=IDF(inputCol="c_vec", outputCol="tf_idf")
#counttokens=udf(lambda words:len(words),IntegerType())

#Convert labels to numeric
labeltonum=StringIndexer(inputCol="Sentiment", outputCol="label")

In [72]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

In [75]:
cleaned=VectorAssembler(inputCols=["tf_idf","Tweet_length"], outputCol="cleaned_features")

In [28]:
#tokenizer=Tokenizer(inputCol="OriginalTweet", outputCol="tokenized_text")
#counttokens=udf(lambda tokenized_text:len(tokenized_text),IntegerType())

In [66]:
#tokenized.select("OriginalTweet","words").withColumn("NoOfWords",counttokens(col("words"))).show(10)

+--------------------+--------------------+---------+
|       OriginalTweet|               words|NoOfWords|
+--------------------+--------------------+---------+
|@MeNyrbie @Phil_G...|[menyrbie, phil_g...|       17|
|advice Talk to yo...|[advice, talk, to...|       38|
|Coronavirus Austr...|[coronavirus, aus...|       18|
|As news of the re...|[as, news, of, th...|       41|
|"Cashier at groce...|[cashier, at, gro...|       33|
|Due to COVID-19 o...|[due, to, covid, ...|       50|
|For corona preven...|[for, corona, pre...|       44|
|All month there h...|[all, month, ther...|       43|
|#horningsea is a ...|[horningsea, is, ...|       48|
|ADARA Releases CO...|[adara, releases,...|       30|
+--------------------+--------------------+---------+
only showing top 10 rows



In [69]:
#data.show(5)

+--------+----------+--------------------+----------+--------------------+---------+------------+
|UserName|ScreenName|            Location|   TweetAt|       OriginalTweet|Sentiment|Tweet_length|
+--------+----------+--------------------+----------+--------------------+---------+------------+
|    3799|     48751|              London|16-03-2020|@MeNyrbie @Phil_G...|  Neutral|         111|
|    3800|     48752|                  UK|16-03-2020|advice Talk to yo...| Positive|         237|
|    3801|     48753|           Vagabonds|16-03-2020|Coronavirus Austr...| Positive|         131|
|    3804|     48756|ÜT: 36.319708,-82...|16-03-2020|As news of the re...| Positive|         249|
|    3805|     48757|35.926541,-78.753267|16-03-2020|"Cashier at groce...| Positive|         184|
+--------+----------+--------------------+----------+--------------------+---------+------------+
only showing top 5 rows



In [64]:
#tokenized.select("OriginalTweet","words").withColumn("NoOfWords",counttokens(col("words"))).show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+
|OriginalTweet                                                                                                                                                                                                                                                                                   |words                                                                        