In [21]:
!pip install textblob
!pip install nltk
!pip install flair

<console>: 2: error: ';' expected but '.' found.

In [25]:
// Setting up spark session
import org.apache.spark.sql.SparkSession

val spark = SparkSession
  .builder()
  .appName("Twitter Sentiment Analysis")
  .getOrCreate()

import spark.implicits._


import org.apache.spark.sql.SparkSession
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@7080c32b
import spark.implicits._


In [32]:
// Loading the dataset
val datasetPath = "./temp/twitter/data.csv" 
val df = spark.read
  .csv(datasetPath)
  .toDF("target","ids","date","flag","user","text")
 

df.show() // Show the first few rows of the DataFrame



+------+----------+--------------------+--------+---------------+--------------------+
|target|       ids|                date|    flag|           user|                text|
+------+----------+--------------------+--------+---------------+--------------------+
|     0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|     0|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|     0|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|     0|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|     0|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
|     0|1467811372|Mon Apr 06 22:20:...|NO_QUERY|       joy_wolf|@Kwesidei not the...|
|     0|1467811592|Mon Apr 06 22:20:...|NO_QUERY|        mybirch|         Need a hug |
|     0|1467811594|Mon Apr 06 22:20:...|NO_QUERY|           coZZ|@LOLTrish hey  lo...|
|     0|1467811795|Mon Apr 06 22:20:...|NO_

datasetPath: String = ./temp/twitter/data.csv
df: org.apache.spark.sql.DataFrame = [target: string, ids: string ... 4 more fields]


In [45]:
// Cleaning the data
import org.apache.spark.sql.functions.udf
import scala.util.matching.Regex

// Define a function to remove URLs, Twitter handles, and hashtags
def cleanText: (String => String) = { text =>
    // define a pattern to remove the urls
    val urlPattern: Regex = new Regex("http\\S+|www\\S+|https\\S+")
    // define a pattern to remove the user names ( tags )
    val userPattern: Regex = new Regex("@\\w+")
    // define a pattern to remove the hashtags
    val hashtagPattern: Regex = new Regex("#\\w+")
    // clear urls
    val result = urlPattern.replaceAllIn(text, "")
    // clear tags
    val result2 = userPattern.replaceAllIn(result, "")
    // hashtags
    hashtagPattern.replaceAllIn(result2, "")
}

// Register the UserDefinedFunction
val cleanTextUDF = udf(cleanText)

// drop the flag column 

// Apply the UDF to clean the text column 
// and removing the flag,date,user columns and ids because we'll not use them
val cleanedDf = df.withColumn("cleanedText", cleanTextUDF($"text")).drop("flag","date","user","ids")


cleanedDf.show()


+------+--------------------+--------------------+
|target|                text|         cleanedText|
+------+--------------------+--------------------+
|     0|@switchfoot http:...|  - A that's a bu...|
|     0|is upset that he ...|is upset that he ...|
|     0|@Kenichan I dived...| I dived many tim...|
|     0|my whole body fee...|my whole body fee...|
|     0|@nationwideclass ...| no, it's not beh...|
|     0|@Kwesidei not the...| not the whole crew |
|     0|         Need a hug |         Need a hug |
|     0|@LOLTrish hey  lo...| hey  long time n...|
|     0|@Tatiana_K nope t...| nope they didn't...|
|     0|@twittera que me ...|     que me muera ? |
|     0|spring break in p...|spring break in p...|
|     0|I just re-pierced...|I just re-pierced...|
|     0|@caregiving I cou...| I couldn't bear ...|
|     0|@octolinz16 It it...| It it counts, id...|
|     0|@smarrison i woul...| i would've been ...|
|     0|@iamjazzyfizzle I...| I wish I got to ...|
|     0|Hollis' death sce...|Ho

import org.apache.spark.sql.functions.udf
import scala.util.matching.Regex
cleanText: String => String
cleanTextUDF: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda/0x000001c7842d0218@34dd8cd5,StringType,List(Some(class[value[0]: string])),Some(class[value[0]: string]),None,true,true)
cleanedDf: org.apache.spark.sql.DataFrame = [target: string, text: string ... 1 more field]


In [47]:
// Tokenization

import org.apache.spark.ml.feature.Tokenizer

// Instantiate the tokenizer
val tokenizer = new Tokenizer().setInputCol("cleanedText").setOutputCol("words")

// Transform the dataset
val tokenizedDf = tokenizer.transform(cleanedDf)

tokenizedDf.show(true)


+------+--------------------+--------------------+--------------------+
|target|                text|         cleanedText|               words|
+------+--------------------+--------------------+--------------------+
|     0|@switchfoot http:...|  - A that's a bu...|[, , -, a, that's...|
|     0|is upset that he ...|is upset that he ...|[is, upset, that,...|
|     0|@Kenichan I dived...| I dived many tim...|[, i, dived, many...|
|     0|my whole body fee...|my whole body fee...|[my, whole, body,...|
|     0|@nationwideclass ...| no, it's not beh...|[, no,, it's, not...|
|     0|@Kwesidei not the...| not the whole crew |[, not, the, whol...|
|     0|         Need a hug |         Need a hug |      [need, a, hug]|
|     0|@LOLTrish hey  lo...| hey  long time n...|[, hey, , long, t...|
|     0|@Tatiana_K nope t...| nope they didn't...|[, nope, they, di...|
|     0|@twittera que me ...|     que me muera ? |[, que, me, muera...|
|     0|spring break in p...|spring break in p...|[spring, break

import org.apache.spark.ml.feature.Tokenizer
tokenizer: org.apache.spark.ml.feature.Tokenizer = tok_914285766876
tokenizedDf: org.apache.spark.sql.DataFrame = [target: string, text: string ... 2 more fields]


In [49]:
// Removing Stop Words
import org.apache.spark.ml.feature.StopWordsRemover

// Instantiate the StopWordsRemover
val remover = new StopWordsRemover()
  .setInputCol("words")
  .setOutputCol("filteredWords")

// Transform the dataset
val filteredDf = remover.transform(tokenizedDf)

filteredDf.show(true)



+------+--------------------+--------------------+--------------------+--------------------+
|target|                text|         cleanedText|               words|       filteredWords|
+------+--------------------+--------------------+--------------------+--------------------+
|     0|@switchfoot http:...|  - A that's a bu...|[, , -, a, that's...|[, , -, bummer., ...|
|     0|is upset that he ...|is upset that he ...|[is, upset, that,...|[upset, update, f...|
|     0|@Kenichan I dived...| I dived many tim...|[, i, dived, many...|[, dived, many, t...|
|     0|my whole body fee...|my whole body fee...|[my, whole, body,...|[whole, body, fee...|
|     0|@nationwideclass ...| no, it's not beh...|[, no,, it's, not...|[, no,, behaving,...|
|     0|@Kwesidei not the...| not the whole crew |[, not, the, whol...|     [, whole, crew]|
|     0|         Need a hug |         Need a hug |      [need, a, hug]|         [need, hug]|
|     0|@LOLTrish hey  lo...| hey  long time n...|[, hey, , long, t...

import org.apache.spark.ml.feature.StopWordsRemover
remover: org.apache.spark.ml.feature.StopWordsRemover = StopWordsRemover: uid=stopWords_57fcf0d1a32a, numStopWords=181, locale=fr_FR, caseSensitive=false
filteredDf: org.apache.spark.sql.DataFrame = [target: string, text: string ... 3 more fields]
