<a href="https://colab.research.google.com/github/vaanchhitbaranwal-ux/vaanchhit/blob/main/IT_Support_Analyst_cloud_sentiment_pipeline_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys

# Flag for later cells: True when the `google.colab` module is already loaded
# into this interpreter, False otherwise.
IN_COLAB = 'google.colab' in sys.modules.keys()
print("Running in Colab:", IN_COLAB)

In [None]:
!pip install pyspark gcsfs

In [None]:
# Authenticate the Colab user (presumably so the GCS download in a later cell
# can use these credentials -- confirm). The `google.colab` package can only be
# imported inside Colab, so the step is skipped when IN_COLAB is False instead
# of failing with an ImportError.
if IN_COLAB:
    from google.colab import auth
    auth.authenticate_user()
else:
    print("Not running in Colab; skipping google.colab authentication.")

In [None]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session under the application name
# "Sentiment Analysis" (getOrCreate hands back an existing session if there is one).
spark = (
    SparkSession.builder
    .appName("Sentiment Analysis")
    .getOrCreate()
)

In [None]:
import shutil

import gcsfs

bucket_name = "cis-415-project-jedwardr"
file_name = "sentiment_small_dataset.csv"

gcs_path = f"gs://{bucket_name}/{file_name}"
local_path = f"/content/{file_name}"

# Download file from GCS to local Colab environment.
# copyfileobj copies in chunks, so the object is streamed to disk instead of
# being read fully into memory before the write.
fs = gcsfs.GCSFileSystem()
with fs.open(gcs_path, 'rb') as gcs_file, open(local_path, 'wb') as out_file:
    shutil.copyfileobj(gcs_file, out_file)

# Load the local CSV file into Spark (first row is the header; column types are inferred)
df = spark.read.csv(local_path, header=True, inferSchema=True)
df.show(5)
df.printSchema()

In [None]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, CountVectorizer, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col

In [None]:
# Exploratory Data Analysis (EDA)
df.describe().show()

# Check for missing values: report the number of nulls in every column.
# Each column is turned into a 0/1 "is null" flag and the flags are summed over
# the whole DataFrame, giving one row of counts (columns are named `sum(<col>)`).
# Showing the raw isNull() flags would only display booleans for the first 20 rows.
null_flags = df.select([col(c).isNull().cast('int').alias(c) for c in df.columns])
null_flags.groupBy().sum().show()

In [None]:
# Data Preprocessing
# Cast the target to int BEFORE dropping nulls: when Spark yields null for a
# value it cannot cast (its non-ANSI behaviour), the drop on the next line
# removes that row too, so no null label reaches the StringIndexer.
df = df.withColumn('Sentiment_Score', col('Sentiment_Score').cast('int'))  # Ensure target is int
df = df.na.drop()  # Drop rows with a null in any column (including failed casts)
# Tokenizing the 'Feedback_Text' column
tokenizer = Tokenizer(inputCol='Feedback_Text', outputCol='words')
# Vectorizing the words column
vectorizer = CountVectorizer(inputCol='words', outputCol='features')
# Indexing the target variable 'Sentiment_Score'
indexer = StringIndexer(inputCol='Sentiment_Score', outputCol='label')

In [None]:
# Train/Test Split
# Randomly split the rows with weights 0.8 (train) / 0.2 (test), seeded with 42.
train_df, test_df = df.randomSplit(
    weights=[0.8, 0.2],
    seed=42,
)
train_df.show(5)

In [None]:
# Train Logistic Regression Model
# The column names are spelled out (they are the estimator's defaults) so this
# cell reads the same way as the Naive Bayes cell.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='label',
    maxIter=10,
    regParam=0.01,
)
# Full pipeline: tokenize -> count-vectorize -> index the label -> classify
lr_pipeline = Pipeline(stages=[tokenizer, vectorizer, indexer, lr])

# Fit on the training split, then score the held-out split
lr_model = lr_pipeline.fit(train_df)
lr_predictions = lr_model.transform(test_df)

# Preview a few predictions next to the original text and score
lr_predictions.select(
    'Feedback_Text',
    'Sentiment_Score',
    'prediction',
).show(5)

In [None]:
# Train Naive Bayes Model
nb = NaiveBayes(
    featuresCol='features',
    labelCol='label',
    modelType='multinomial',
)
# Same preprocessing stages as the logistic-regression pipeline, with Naive Bayes as the classifier
nb_pipeline = Pipeline(stages=[tokenizer, vectorizer, indexer, nb])

# Fit on the training split, then score the held-out split
nb_model = nb_pipeline.fit(train_df)
nb_predictions = nb_model.transform(test_df)

In [None]:
# Model Evaluation (Accuracy + F1)
# Both evaluators compare the `label` column against the `prediction` column;
# only the metric differs.
evaluator_columns = dict(labelCol='label', predictionCol='prediction')

# Accuracy
evaluator = MulticlassClassificationEvaluator(metricName='accuracy', **evaluator_columns)
lr_accuracy, nb_accuracy = (
    evaluator.evaluate(lr_predictions),
    evaluator.evaluate(nb_predictions),
)

print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")
print(f"Naive Bayes Accuracy: {nb_accuracy:.4f}")

# F1 Score
f1_eval = MulticlassClassificationEvaluator(metricName='f1', **evaluator_columns)
lr_f1, nb_f1 = (
    f1_eval.evaluate(lr_predictions),
    f1_eval.evaluate(nb_predictions),
)

print(f"Logistic Regression F1 Score: {lr_f1:.4f}")
print(f"Naive Bayes F1 Score: {nb_f1:.4f}")