In [1]:
# !pip install sparktorch
# !pip install pyspark
# !pip install demoji

In [2]:
from typing import Text
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from sparktorch import SparkTorch, serialize_torch_obj
from pyspark.sql.functions import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.pipeline import Pipeline, PipelineModel
from sparktorch import PysparkPipelineWrapper
from pyspark.sql.types import *
import torch
import torch.nn as nn

import pickle


from Classifier import lstmClassifier
from TextTransformer import TextTransformer


Downloading emoji data ...
... OK (Got response in 0.82 seconds)
Writing emoji data to C:\Users\ASUS\.demoji\codes.json ...
... OK


In [3]:
spark = SparkSession.builder \
    .appName("examples") \
    .master('local[3]')\
    .getOrCreate()



In [5]:
# Read in mnist_train.csv dataset
schema = StructType([StructField('free_text', StringType(), True), StructField('label_id', DoubleType(), True)])
# df = spark.read.schema(schema).option('header', 'true').csv('uit-hsd/train.csv').repartition(3)


train = spark.read.schema(schema)\
.option('header', True)\
.csv('uit-hsd/train.csv')

test = spark.read.schema(schema)\
.option('header', True)\
.csv('uit-hsd/test.csv')


train = train.where((train.label_id == 0) | (train.label_id == 1) | (train.label_id  == 2))

test = test.where((test.label_id == 0) | (test.label_id == 1) | (test.label_id  == 2))

train = train.withColumn('label_id', when(train.label_id == 2, 1).otherwise(train.label_id))
test = test.withColumn('label_id', when(test.label_id == 2, 1).otherwise(test.label_id))

In [4]:
# over sampling the train set to deal with imbalance class
new_train = train.withColumn('n',when(train.label_id == 1, 4).otherwise(1))
new_train = new_train.withColumn('n', expr('explode(array_repeat(n,int(n)))')).select(['free_text', 'label_id'])
train = new_train

In [5]:
vocab = pickle.load(open('/content/vocab.pkl', 'rb'))
for key in vocab.keys():
    vocab[key] = int(vocab[key])
network = lstmClassifier(vocab_size = len(vocab))

# Build the pytorch object
torch_obj = serialize_torch_obj(
    model=network,
    criterion=nn.BCELoss(),
    optimizer=torch.optim.Adam,
    lr=0.005
)

# Setup features
spark_model = SparkTorch(
    inputCol='features',
    labelCol='label_id',

    predictionCol='predictions',
    torchObj=torch_obj,
    iters=500,
    verbose=1,
    validationPct=0.2,
    miniBatch=64
)

In [6]:
my_transformer = TextTransformer(input_col = 'free_text', output_col= 'features')

assembler_exploded = VectorAssembler(
    inputCols=["features[{}]".format(i) for i in range(50)], 
    outputCol="features"
)
# Demonstration of some options. Not all are required
# Note: This uses the barrier execution mode, which is sensitive to the number of partitions



In [7]:
pip = Pipeline(stages = [my_transformer, assembler_exploded, spark_model])
pip = pip.fit(train)

In [8]:
pip.write().overwrite().save('/content/drive/MyDrive/bigdata/lstm_2classes_2')

# Example of loading the pipeline
loaded_pipeline = PysparkPipelineWrapper.unwrap(PipelineModel.load('/content/drive/MyDrive/bigdata/lstm_2classes_2'))

# Run predictions and evaluation
predictions = loaded_pipeline.transform(train).persist()

evaluator = MulticlassClassificationEvaluator(
    labelCol="label_id", predictionCol="predictions", metricName="accuracy")



In [11]:
test.numPartitions

AttributeError: 'DataFrame' object has no attribute 'numPartitions'

In [6]:
loaded_pipeline = PysparkPipelineWrapper.unwrap(PipelineModel.load('lstm_2classes_2'))


Py4JJavaError: An error occurred while calling o103.partitions.
: java.lang.RuntimeException: Error while running command to get file permissions : java.io.IOException: (null) entry in command string: null ls -F C:\Users\ASUS\Desktop\DA\DA_2classes\lstm_2classes_2\metadata\part-00000
	at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:773)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:869)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:852)
	at org.apache.hadoop.fs.FileUtil.execCommand(FileUtil.java:1097)
	at org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.loadPermissionInfo(RawLocalFileSystem.java:659)
	at org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.getPermission(RawLocalFileSystem.java:634)
	at org.apache.hadoop.fs.LocatedFileStatus.<init>(LocatedFileStatus.java:49)
	at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1733)
	at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1713)
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:270)
	at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:229)
	at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:315)
	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.api.java.JavaRDDLike.partitions(JavaRDDLike.scala:61)
	at org.apache.spark.api.java.JavaRDDLike.partitions$(JavaRDDLike.scala:61)
	at org.apache.spark.api.java.AbstractJavaRDDLike.partitions(JavaRDDLike.scala:45)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:567)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:835)

	at org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.loadPermissionInfo(RawLocalFileSystem.java:699)
	at org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.getPermission(RawLocalFileSystem.java:634)
	at org.apache.hadoop.fs.LocatedFileStatus.<init>(LocatedFileStatus.java:49)
	at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1733)
	at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1713)
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:270)
	at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:229)
	at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:315)
	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.api.java.JavaRDDLike.partitions(JavaRDDLike.scala:61)
	at org.apache.spark.api.java.JavaRDDLike.partitions$(JavaRDDLike.scala:61)
	at org.apache.spark.api.java.AbstractJavaRDDLike.partitions(JavaRDDLike.scala:45)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:567)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:835)


In [23]:
predictions1 = predictions.withColumn('predictions', when(predictions.predictions > 0.5,1 )\
                                      .otherwise(0))
predictions1 = predictions1.withColumn('predictions', predictions1.predictions.cast(DoubleType()))

In [24]:
accuracy = evaluator.evaluate(predictions1)
print("Train accuracy = %g" % accuracy)

Train accuracy = 0.899992


In [25]:
predictions1.groupBy('predictions').count().show()

+-----------+-----+
|predictions|count|
+-----------+-----+
|        0.0|20389|
|        1.0|16008|
+-----------+-----+



In [26]:
preds = predictions1.toPandas()

In [27]:
import pandas as pd


In [28]:
from sklearn.metrics import classification_report
print(classification_report(preds['label_id'].values, preds['predictions'].values))

              precision    recall  f1-score   support

         0.0       0.90      0.92      0.91     19861
         1.0       0.90      0.87      0.89     16536

    accuracy                           0.90     36397
   macro avg       0.90      0.90      0.90     36397
weighted avg       0.90      0.90      0.90     36397



In [33]:

test_preds = loaded_pipeline.transform(test)

test_preds = test_preds.withColumn('predictions', when(test_preds.predictions > 0.5,1 )\
                                      .otherwise(0))
test_preds = test_preds.withColumn('predictions', test_preds.predictions.cast(DoubleType()))

In [34]:
test_preds = test_preds.toPandas()

In [35]:
print(classification_report(test_preds['label_id'].values, test_preds['predictions'].values))

              precision    recall  f1-score   support

         0.0       0.93      0.88      0.91      5536
         1.0       0.54      0.68      0.60      1123

    accuracy                           0.85      6659
   macro avg       0.74      0.78      0.76      6659
weighted avg       0.87      0.85      0.86      6659



In [None]:
test_preds['predictions'].value_counts()

In [None]:

preds['predictions'].value_counts()