### Import the necessaries libraries

In [0]:
from pyspark.ml.feature import StringIndexer, Word2Vec
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import functions as F, Row
import numpy as np
from sklearn import svm

### Read the dataset

In [0]:
df = spark.read.parquet('/mnt/2024-team20/labelled_datasets_parquet')

### Create a pipeline

This pipeline comprises several stages for processing text data:
- **Word2Vec**: Converts lemmatized tokens into vectors of numerical features, facilitating machine learning on textual input.
- **StringIndexer**: Encodes string sentiment labels into numerical indices, a necessary step for model training.

In [0]:
word2Vec = Word2Vec(vectorSize=100, minCount=0, inputCol='lemmatized_tokens', outputCol='features')

stringIndexer = StringIndexer(inputCol='sentiment', outputCol = 'label')

encoding_pipeline = Pipeline(stages=[word2Vec, stringIndexer])

### Split the data into Train and Test

In [0]:
train_df, test_df = df.randomSplit([0.7, 0.3], seed = 0)

### Fit the pipeline on the Training Data

In [0]:
pipeline = encoding_pipeline.fit(train_df)


### Transform Train and Test Data for Machine Learning

In [0]:
model_data_train = pipeline.transform(train_df).select(F.col('features'), F.col('label'))

model_data_test = pipeline.transform(test_df).select(F.col('features'), F.col('label'))

### Optimize Data Distribution by Partitioning the Train Dataset

In [0]:
model_data_train_rdd = model_data_train.rdd.repartition(18)

print(model_data_train_rdd.getNumPartitions())

18


### Define a Model Training Function

In [0]:
def build_model(partition_iter):
    partition_data = list(partition_iter)
    X_train = np.array([row.features.toArray() for row in partition_data])
    Y_train = np.array([row.label for row in partition_data])
    svm_classifier = svm.LinearSVC(random_state=0)
    model = svm_classifier.fit(X_train, Y_train)
    return [model]

### Train the Model on Each Partition and Collect the Results

In [0]:
models = model_data_train_rdd.mapPartitions(build_model).collect()

### Define a Prediction Function

In [0]:
def predict(instance):
    inst_features = np.array(instance[:-1]).reshape(1, -1)
    predictions = [model.predict(inst_features)[0] for model in models]
    return predictions

### Define a Prediction Aggregation Function

In [0]:
def transform(instance):
    predictions = predict(instance)
    prediction_mode = max(set(predictions), key=predictions.count)
    return Row(**instance.asDict(), raw_prediction=float(prediction_mode))

### Estimate Model Accuracy on Test Data

In [0]:
pred_df = model_data_test.rdd.map(transform).toDF()

evaluator = MulticlassClassificationEvaluator(labelCol = 'label', predictionCol = 'raw_prediction', metricName = 'accuracy')

accuracy = evaluator.evaluate(pred_df)

print(f'Accuracy = {accuracy}')

Accuracy = 0.676339445195674
