Load labeled data

In [1]:
# Load the input data to be annotated.
# `os` is imported explicitly so this cell does not rely on the name already
# being present in the kernel namespace.
# NOTE(review): `spark` is not defined anywhere in this notebook; it is
# presumably the SparkSession supplied by the environment -- confirm.
import os

# Repository root, expressed relative to the current working directory.
home_path = "file:///" + os.getcwd() + "/../../../../"
file_path = home_path + \
            "spark-nlp-models/src/main/resources/datasets/training.parquet"
data = spark.read.parquet(file_path)
# Keep the dataset cached: it is reused by the actions below and by later cells.
data.cache()
data.show()
data.count()

+---+--------------------+---------+
| id|                text|sentiment|
+---+--------------------+---------+
|  0|The Da Vinci Code...|        1|
|  1|this was the firs...|        1|
|  2|i liked the Da Vi...|        1|
|  3|i liked the Da Vi...|        1|
|  4|I liked the Da Vi...|        1|
|  5|that's not even a...|        1|
|  6|I loved the Da Vi...|        1|
|  7|i thought da vinc...|        1|
|  8|The Da Vinci Code...|        1|
|  9|I thought the Da ...|        1|
| 10|The Da Vinci Code...|        1|
| 11|The Da Vinci Code...|        1|
| 12|then I turn on th...|        1|
| 13|The Da Vinci Code...|        1|
| 14|i love da vinci c...|        1|
| 15|i loved da vinci ...|        1|
| 16|TO NIGHT:: THE DA...|        1|
| 17|THE DA VINCI CODE...|        1|
| 18|Thing is, I enjoy...|        1|
| 19|very da vinci cod...|        1|
+---+--------------------+---------+
only showing top 20 rows



7086

In [2]:
from pyspark.sql import functions as F


class SplitData:
    """Static helpers to split, filter and export a Spark DataFrame."""

    @staticmethod
    def randomSplit(per_train, per_test, data):
        """
        Randomly split the data into a training and a testing set.

        The split always uses the fixed seed 1234, and the size of each
        resulting set is printed.
        :param per_train: percentage of data to be used as training
        :param per_test: percentage of data to be used as testing
        :param data: dataset to split
        :return: tuple ``(train_data, test_data)``
        """
        train_data, test_data = data.randomSplit([per_train, per_test],
                                                 seed=1234)
        print(f"Training data. Total {train_data.count()}")
        print(f"Testing data. Total {test_data.count()}")
        return train_data, test_data

    @staticmethod
    def filterData(data, label, dataset_name="training"):
        """
        Keep only the rows whose ``sentiment`` column equals ``label``.

        The number of matching rows is printed and the first 5 rows shown.
        :param data: dataset to filter
        :param label: sentiment label, "1" for positive and "0" for
                      negative; the integer 1 is also described as positive
        :param dataset_name: word used in the printed message to describe
                             the dataset (e.g. "training" or "testing");
                             defaults to "training", the previously
                             hard-coded wording
        :return: the filtered dataset
        """
        # Compare on the string form so that 1 and "1" are both reported as
        # "Positive" (previously the integer 1 was described as "Negative").
        label_desc = "Positive" if str(label) == "1" else "Negative"
        data = data.filter(F.col("sentiment") == label)
        print(f"{label_desc} {dataset_name} data. Total {data.count()}")
        data.show(5)
        return data

    @staticmethod
    def exportData(data, path, name, header="false"):
        """
        Export the data to disk, overwriting any existing output.

        :param data: dataset to export
        :param path: path where the file is being exported
        :param name: name of the file; appended directly to ``path``
        :param header: value passed to the writer's "header" option
                       ("true" or "false")
        """
        data.write \
            .format("com.databricks.spark.csv") \
            .option("header", header) \
            .mode("overwrite") \
            .save(path+name)
        print(f"Data {name} exported")

Split the data into training and test sets.

In [3]:
# Hold out 20% of the rows for testing; the remaining 80% are used for training.
train_data, test_data = SplitData.randomSplit(per_train=.8, per_test=.2,
                                              data=data)
# Preview the first rows of each split, training first.
for split in (train_data, test_data):
    split.show(5)

Training data. Total 5678
Testing data. Total 1408
+---+--------------------+---------+
| id|                text|sentiment|
+---+--------------------+---------+
|  3|i liked the Da Vi...|        1|
|  4|I liked the Da Vi...|        1|
|  5|that's not even a...|        1|
|  6|I loved the Da Vi...|        1|
|  7|i thought da vinc...|        1|
+---+--------------------+---------+
only showing top 5 rows

+---+--------------------+---------+
| id|                text|sentiment|
+---+--------------------+---------+
|  0|The Da Vinci Code...|        1|
|  1|this was the firs...|        1|
|  2|i liked the Da Vi...|        1|
| 13|The Da Vinci Code...|        1|
| 26|I really like The...|        1|
+---+--------------------+---------+
only showing top 5 rows



Checking the balance of the datasets

In [4]:
# Split the training set by sentiment label ("1" = positive, "0" = negative)
# to see how many rows each class contains.
train_positive, train_negative = (
    SplitData.filterData(train_data, label) for label in ("1", "0")
)

Positive training data. Total 3218
+---+--------------------+---------+
| id|                text|sentiment|
+---+--------------------+---------+
|  3|i liked the Da Vi...|        1|
|  4|I liked the Da Vi...|        1|
|  5|that's not even a...|        1|
|  6|I loved the Da Vi...|        1|
|  7|i thought da vinc...|        1|
+---+--------------------+---------+
only showing top 5 rows

Negative training data. Total 2460
+----+--------------------+---------+
|  id|                text|sentiment|
+----+--------------------+---------+
|3995|da vinci code was...|        0|
|3996|Then again, the D...|        0|
|3999|God, Yahoo Games ...|        0|
|4000|Da Vinci Code doe...|        0|
|4001|And better...-We ...|        0|
+----+--------------------+---------+
only showing top 5 rows



In [5]:
# Same per-label breakdown for the test set ("1" = positive, "0" = negative).
# NOTE: filterData words its printed counts as "training data" by default,
# even though the test set is passed in here.
test_positive, test_negative = (
    SplitData.filterData(test_data, label) for label in ("1", "0")
)

Positive training data. Total 777
+---+--------------------+---------+
| id|                text|sentiment|
+---+--------------------+---------+
|  0|The Da Vinci Code...|        1|
|  1|this was the firs...|        1|
|  2|i liked the Da Vi...|        1|
| 13|The Da Vinci Code...|        1|
| 26|I really like The...|        1|
+---+--------------------+---------+
only showing top 5 rows

Negative training data. Total 631
+----+--------------------+---------+
|  id|                text|sentiment|
+----+--------------------+---------+
|3997|The Da Vinci Code...|        0|
|3998|i thought the da ...|        0|
|4005|And better..-We a...|        0|
|4011|da vinci code suc...|        0|
|4013|not sure if i alr...|        0|
+----+--------------------+---------+
only showing top 5 rows



In [6]:
# Cap both classes at the same number of rows so that neither dominates.
max_rows_per_class = 2000

train_positive = train_positive.limit(max_rows_per_class)
train_negative = train_negative.limit(max_rows_per_class)

# Preview each capped subset and print its size, positive first.
for capped in (train_positive, train_negative):
    capped.show(5)
    print(capped.count())

+---+--------------------+---------+
| id|                text|sentiment|
+---+--------------------+---------+
|  3|i liked the Da Vi...|        1|
|  4|I liked the Da Vi...|        1|
|  5|that's not even a...|        1|
|  6|I loved the Da Vi...|        1|
|  7|i thought da vinc...|        1|
+---+--------------------+---------+
only showing top 5 rows

2000
+----+--------------------+---------+
|  id|                text|sentiment|
+----+--------------------+---------+
|3995|da vinci code was...|        0|
|3996|Then again, the D...|        0|
|3999|God, Yahoo Games ...|        0|
|4000|Da Vinci Code doe...|        0|
|4001|And better...-We ...|        0|
+----+--------------------+---------+
only showing top 5 rows

2000


Creating a balanced training dataset

In [8]:
# Stack the positive and negative subsets into a single training set.
# Note: this rebinds `train_data`, replacing the earlier train split.
train_data = train_positive.union(train_negative)
# Print the combined row count, then preview the first rows.
print(train_data.count())
train_data.show(5)

4000
+---+--------------------+---------+
| id|                text|sentiment|
+---+--------------------+---------+
|  3|i liked the Da Vi...|        1|
|  4|I liked the Da Vi...|        1|
|  5|that's not even a...|        1|
|  6|I loved the Da Vi...|        1|
|  7|i thought da vinc...|        1|
+---+--------------------+---------+
only showing top 5 rows



Exporting Data

In [10]:
# Save data to disk

# 1) Text-only positive / negative training files (no header row).
file_path = home_path + "spark-nlp-models/src/main/resources/vivekn/"

train_positive = train_positive.select("text")
SplitData.exportData(train_positive, file_path, "training_positive")

train_negative = train_negative.select("text")
SplitData.exportData(train_negative, file_path, "training_negative")

# 2) Full balanced training set and the test set, each with a header row.
file_path = home_path + "spark-nlp-models/src/main/resources/datasets/"
for frame, name in ((train_data, "training_balanced"), (test_data, "testing")):
    SplitData.exportData(frame, file_path, name, header="true")

Data training_positive exported
Data training_negative exported
Data training_balanced exported
Data testing exported
