# Sentiment Analysis with PySpark

## Naive Bayes

All the libraries and classes needed to perform data processing and text classification tasks using PySpark and other related libraries are imported.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import Tokenizer,StopWordsRemover
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder
import numpy as np
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import NaiveBayes
from pyspark.sql import SQLContext
import pyspark as ps
from pyspark.ml.classification import LinearSVC
import warnings
from nltk.stem.snowball import SnowballStemmer
import matplotlib.pyplot as plt
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
import pandas as pd

The following function is used to initialize a Spark session in local mode with certain configurations (4 threads on my PC) and returns the Spark session so that it can be used in the main code.

In [None]:
def init_spark(master='local[4]'):
    """Create (or reuse) a Spark session and return it.

    Args:
        master: Spark master URL used when a new SparkContext has to be
            created. Defaults to ``'local[4]'`` (local mode, 4 threads).

    Returns:
        The active ``SparkSession``.
    """
    print("Initializing Spark...")
    try:
        # The context registers itself as the active one, so it does not need
        # to be kept in a local variable (and no deprecated SQLContext wrapper
        # is required to obtain a session).
        ps.SparkContext(master)
        print("Just created a SparkContext")
    except ValueError:
        # A context is already running in this process (e.g. the cell was
        # re-run); warn and reuse it instead of failing.
        warnings.warn("SparkContext already exists in this scope")
    # getOrCreate() attaches to whichever SparkContext is currently active.
    return SparkSession.builder.getOrCreate()

This function is used to read a CSV file into Spark and load it into a DataFrame. The resulting DataFrame can then be used to perform data processing and analysis operations in Spark.

In [None]:
def read_file(fileUrl, spark):
    """Load a header-less, comma-separated file into a Spark DataFrame.

    Args:
        fileUrl: Location of the CSV data, passed straight to the reader.
        spark: Spark session whose ``read`` interface is used.

    Returns:
        The DataFrame returned by ``spark.read.csv``. Schema inference is
        requested and no header row is consumed.
    """
    print("Reading CSV file...")
    csv_options = {"sep": ",", "inferSchema": True, "header": False}
    return spark.read.csv(fileUrl, **csv_options)

The following function performs a series of text preprocessing operations on a PySpark DataFrame, including character cleansing, tokenization, stopword removal, stemming, and numeric feature extraction, thereby preparing the data for further analysis or machine learning modeling.

In [None]:
def pre_process(df):
    """Clean, tokenize, stem and vectorize the raw tweets.

    Steps, in order: rename the positional CSV columns, strip URLs and every
    character that is not an ASCII letter or a space, tokenize, remove stop
    words, stem each token, and compute TF-IDF features.

    Args:
        df: DataFrame with the columns ``_c0``, ``_c1`` and ``_c2``; they are
            renamed to ``id``, ``label`` and ``tweet``.

    Returns:
        The input DataFrame extended with the columns ``words``,
        ``word_clean``, ``word_stem``, ``rawFeatures`` and ``features``.
    """
    print("Preprocessing...")

    df = (
        df.withColumnRenamed('_c0', 'id')
        .withColumnRenamed('_c1', 'label')
        .withColumnRenamed('_c2', 'tweet')
    )

    # URLs are removed before punctuation is stripped: the strip deletes ':'
    # and '/', which would otherwise leave the URL text behind as a word.
    # \S+ stops at the first whitespace, so text between two URLs is kept.
    df = df.withColumn('tweet', regexp_replace('tweet', r'(?i)https?://\S+|www\.\S+', ''))
    # Keep only ASCII letters and spaces (digits and all punctuation go).
    df = df.withColumn('tweet', regexp_replace('tweet', r'[^a-zA-Z ]', ''))

    tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
    wordData = tokenizer.transform(df)

    remover = StopWordsRemover(inputCol="words", outputCol="word_clean")
    word_clean_data = remover.transform(wordData)

    stemmer = SnowballStemmer(language='english')
    # The return type must be declared as an array of strings; the default
    # (string) would hand CountVectorizer a stringified list.
    stemmer_udf = udf(
        lambda tokens: [stemmer.stem(token) for token in (tokens or [])],
        'array<string>',
    )
    stemmed_data = word_clean_data.withColumn('word_stem', stemmer_udf('word_clean'))

    # Term counts over the stemmed tokens, then IDF re-weighting.
    count = CountVectorizer(inputCol="word_stem", outputCol="rawFeatures")
    model = count.fit(stemmed_data)

    featurizedData = model.transform(stemmed_data)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    return rescaledData

This function is used to split a PySpark DataFrame into training and test sets, which is a common operation in machine learning to evaluate model performance.

In [None]:
def train_test_split(df, weights=(0.7, 0.3), seed=0):
    """Randomly split a DataFrame into a training part and a test part.

    Args:
        df: DataFrame to split.
        weights: Two relative weights, training first and test second.
            Defaults to a 70/30 split.
        seed: Seed passed to ``randomSplit`` so the split is reproducible.

    Returns:
        A ``(trainDf, testDf)`` tuple.
    """
    print("Splitting dataset...")
    # No count() calls here: their results were never used and each one would
    # trigger a full Spark job over the upstream pipeline.
    trainDf, testDf = df.randomSplit(list(weights), seed)
    return trainDf, testDf

The following function takes the predictions for the training set and the test set and, for each set, displays a table counting the instances per (actual label, predicted label) pair, showing how many were predicted correctly (the labels match) and how many were predicted incorrectly (the labels differ).

In [None]:
def details_table(train_predictions, test_predictions):
    """Show label-versus-prediction counts for both prediction sets.

    For each DataFrame a heading is printed, then the rows are grouped by the
    ``label`` and ``prediction`` columns, counted, and displayed.

    Args:
        train_predictions: Predictions made on the training data.
        test_predictions: Predictions made on the test data.
    """
    tables = (
        ("Training Table...", train_predictions),
        ("Test Table...", test_predictions),
    )
    for heading, predictions in tables:
        print(heading)
        pair_counts = predictions.groupBy('label', 'prediction').count()
        pair_counts.show()

This function is used to calculate and summarize various common evaluation metrics for a classification model from a set of predictions.
The metrics included are the area under the ROC curve, the F1 score, and accuracy.

In [None]:
def evaluate_model(predictions, labelCol="label", predictionCol="prediction",
                   rawPredictionCol="rawPrediction"):
    """Summarize a set of predictions with ROC AUC, F1 and accuracy.

    Args:
        predictions: DataFrame holding the label and prediction columns.
        labelCol: Name of the true-label column.
        predictionCol: Name of the predicted-label column.
        rawPredictionCol: Name of the score column used for the ROC curve.
            When the DataFrame has no such column, ``predictionCol`` is used
            instead.

    Returns:
        A dict with the keys ``"ROC"``, ``"F1"`` and ``"Accuracy"``.
    """
    # The ROC curve is built by thresholding a score. Feeding it the hard 0/1
    # prediction yields a single threshold and a degenerate curve, so prefer
    # the classifier's raw score column whenever it is available.
    if rawPredictionCol in predictions.columns:
        scoreCol = rawPredictionCol
    else:
        scoreCol = predictionCol

    evaluator = BinaryClassificationEvaluator(rawPredictionCol=scoreCol, labelCol=labelCol, metricName="areaUnderROC")
    roc = evaluator.evaluate(predictions)

    evaluator = MulticlassClassificationEvaluator(predictionCol=predictionCol, labelCol=labelCol, metricName="f1")
    f1 = evaluator.evaluate(predictions)

    evaluator = MulticlassClassificationEvaluator(predictionCol=predictionCol, labelCol=labelCol, metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)

    return {"ROC": roc, "F1": f1, "Accuracy": accuracy}

The following function implements the training and evaluation of a Naive Bayes classification model with hyperparameter search using cross-validation in Spark MLlib.

In [None]:
def naive_Bayes(train_data, test_data):
    """Cross-validate a Naive Bayes classifier and report its metrics.

    The smoothing parameter is searched over ten evenly spaced values between
    0.3 and 10 using 5-fold cross-validation on ``train_data``. The fitted
    model is then applied to both data sets, each set of predictions is
    summarized with ``evaluate_model``, and the label/prediction count tables
    are shown with ``details_table``.

    Args:
        train_data: DataFrame used for fitting and for the training metrics.
        test_data: DataFrame used for the test metrics.

    Returns:
        A ``(train_summary, test_summary)`` tuple of ``evaluate_model``
        results.
    """
    print("Using Naive Bayes model with train and test data...")

    classifier = NaiveBayes()
    smoothing_candidates = np.linspace(0.3, 10, 10)
    grid_builder = ParamGridBuilder()
    grid_builder = grid_builder.addGrid(classifier.smoothing, smoothing_candidates)
    smoothing_grid = grid_builder.build()

    validator = CrossValidator(
        estimator=classifier,
        estimatorParamMaps=smoothing_grid,
        evaluator=BinaryClassificationEvaluator(),
        numFolds=5,
    )
    fitted = validator.fit(train_data)

    # Training metrics first, then test metrics.
    train_predictions = fitted.transform(train_data)
    train_summary = evaluate_model(train_predictions)

    test_predictions = fitted.transform(test_data)
    test_summary = evaluate_model(test_predictions)

    details_table(train_predictions, test_predictions)

    return train_summary, test_summary

The next cell initializes a Spark session, which is required to work with Spark and run its data processing tasks (here in local mode).
The CSV file is then loaded, the data is preprocessed, and the result is split into training and test sets.

In [None]:
# Start (or reuse) the Spark session used by every step below.
spark = init_spark()

# CSV export of a published Google Sheets document (note the `output=csv` query).
url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSoAtoF35YFTr-hDjec2LdghHmryR7nSpmolXIuhNh-CCTTY9oCQq4gUlBLCu86oWslt6nqV_Z1jGcJ/pub?output=csv"

# Load the raw rows, then replace `df` with its preprocessed version.
df = read_file(url, spark)
df = pre_process(df)

# `train_data` and `test_data` are consumed by the model cell(s) that follow.
train_data, test_data = train_test_split(df)

Finally, the Naive Bayes model is trained and evaluated on the training and test sets.
The training and test summaries are then printed, allowing you to assess the model's performance on the different evaluation metrics.

In [None]:
# Train, evaluate and keep both metric summaries for later inspection.
train_summary, test_summary = naive_Bayes(train_data, test_data)

# A single print call: sep="\n" puts every item on its own line, and the empty
# string produces the blank line between the two summaries.
print(
    "Naive Bayes Train Summary:",
    train_summary,
    "",
    "Naive Bayes Test Summary:",
    test_summary,
    sep="\n",
)