### Import the necessaries libraries

In [0]:
from pyspark.ml.feature import StringIndexer, Word2Vec
import numpy as np
from pyspark.ml import Pipeline
from pyspark.sql import functions as F
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import OneVsRest
from pyspark.ml.classification import LinearSVC

### Read the dataset

In [0]:
df = spark.read.parquet('/mnt/2024-team20/labelled_datasets_parquet')

### Create a Pipeline

This pipeline comprises several stages for processing text data:
- **Word2Vec**: Converts lemmatized tokens into vectors of numerical features, facilitating machine learning on textual input.
- **StringIndexer**: Encodes string sentiment labels into numerical indices, a necessary step for model training.
- **OneVsRest (OVR)**: Applies a one-vs-rest strategy using an SVM (Support Vector Machine) classifier to handle multi-class classification tasks.


In [0]:
word2Vec = Word2Vec(vectorSize = 100, minCount = 0, inputCol = 'lemmatized_tokens', outputCol = 'features')

stringIndexer = StringIndexer(inputCol = 'sentiment', outputCol = 'label')

lsvc = LinearSVC(maxIter = 10, regParam = 0.1)

ovr = OneVsRest(classifier = lsvc, labelCol = 'label', featuresCol = 'features')

pipeline = Pipeline(stages = [word2Vec, stringIndexer, ovr])

### Split the data into Train and Test

In [0]:
train_df, test_df = df.randomSplit([0.7, 0.3], seed = 0)

### Fit the Model on the Training Data

In [0]:
model = pipeline.fit(train_df)

### Save the Model to Storage

In [0]:
model.save('/mnt/2024-team20/model')

### Make Predictions on Test Data

The trained model is applied to the test dataset (`test_df`) to generate predictions.

In [0]:
predictions = model.transform(test_df)

### Estimate Model Accuracy on Test Data

In [0]:
evaluator = MulticlassClassificationEvaluator(labelCol = 'label', predictionCol = 'prediction', metricName = 'accuracy')

accuracy = evaluator.evaluate(predictions)

print('Test Accuracy:', accuracy)

Test Accuracy: 0.6497926694698848
