In [18]:
from pyspark.sql import SparkSession

# Build the SparkSession used by every cell below (app name: "example").
spark = (
    SparkSession.builder
    .appName("example")
    .getOrCreate()
)

# Practicing creating a UDF

You have two objectives to fulfill:

- Ensure that the transformed data consists of nonempty vectors.
- A dataframe has a column that contains arrays of strings, where each array has a single item. You'd like to transform this column into a string column.

In [21]:
from pyspark.sql.types import ArrayType, StringType, BooleanType
from pyspark.sql.functions import udf
# UDF yielding True only when the value is truthy, exposes a `toArray`
# attribute (i.e. looks like a vector), and reports a nonzero count of
# nonzero entries; every other input (including None) yields False.
nonempty_udf = udf(
    lambda vec: bool(vec and hasattr(vec, "toArray") and vec.numNonzeros()),
    BooleanType(),
)

# UDF yielding the string form of the first item of a nonempty list;
# any other input (None, empty list, non-list value) yields ''.
s_udf = udf(
    # A truthy list is necessarily nonempty, so no separate length check is needed.
    lambda items: str(items[0]) if (items and type(items) is list) else '',
    StringType(),
)

# Practicing array column

The `TRIVIAL_TOKENS` variable is a set. It contains certain words that we want to remove.

In [26]:
# Tokens treated as trivial: every single digit '0'..'9' plus every lowercase
# ASCII letter except 'a', 'd' and 'i'.
digits = {str(d) for d in range(10)}
characters = {chr(c) for c in range(ord('a'), ord('z') + 1)} - {'a', 'd', 'i'}

# Combine the sets
TRIVIAL_TOKENS = digits | characters

# The input is free-form prose, not delimited data. The CSV reader treats ','
# as a field separator and '"' as a quoting character, so lines of prose are
# not preserved verbatim. The text reader yields one row per line in a single
# string column named `value`; it is renamed to `_c0` so the schema matches
# what the CSV-based read exposed.
# NOTE(review): row counts may differ from the CSV-based read (e.g. how blank
# lines are treated) -- confirm nothing downstream depends on the old count.
df_before = (
    spark.read.text("dataset/sherlock.txt")
    .withColumnRenamed("value", "_c0")
)
df_before.show(3)

+--------------------+
|                 _c0|
+--------------------+
|The Project Guten...|
|by Sir Arthur Con...|
|(#15 in our serie...|
+--------------------+
only showing top 3 rows



In [27]:
# # Show the rows where doc contains the item '5'
# df_before.where(array_contains('doc', '5')).show()

# # UDF removes items in TRIVIAL_TOKENS from array
# rm_trivial_udf = udf(lambda x:
#                      list(set(x) - TRIVIAL_TOKENS) if x
#                      else x,
#                      ArrayType(StringType()))

# # Remove trivial tokens from 'in' and 'out' columns of df_before
# df_after = df_before.withColumn('in', rm_trivial_udf('in'))\
#                     .withColumn('out', rm_trivial_udf('out'))

# # Show the rows of df_after where doc contains the item '5'
# df_after.where(array_contains('doc','5')).show()

# Creating a UDF for vector data

A dataframe `df` is available, having a column `output` of type vector.

In [28]:
# # Selects the first element of a vector column
# first_udf = udf(lambda x:
#             float(x.indices[0]) 
#             if (x and hasattr(x, "toArray") and x.numNonzeros()) 
#             else 0.0,
#             FloatType())

# # Apply first_udf to the output column
# df.select(first_udf("output").alias("result")).show(5)

# Applying a UDF to vector data

A dataframe is available called df having a column output of type vector. Its first five rows are shown in the console.

In [29]:
# # Add label by applying the get_first_udf to output column
# df_new = df.withColumn('label', get_first_udf('output'))

# # Show the first five rows 
# df_new.show(5)

# Transforming text to vector format

You learned how to split sentences and transform an array of words into a numerical vector using a `CountVectorizer`. You will first perform a transform that adds an `invec` column.

In [30]:
# # Transform df using model
# result = model.transform(df.withColumnRenamed('in', 'words'))\
#         .withColumnRenamed('words', 'in')\
#         .withColumnRenamed('vec', 'invec')
# result.drop('sentence').show(3, False)

# # Add a column based on the out column called outvec
# result = model.transform(result.withColumnRenamed('out', 'words'))\
#         .withColumnRenamed('words', 'out')\
#         .withColumnRenamed('vec', 'outvec')
# result.select('invec', 'outvec').show(3, False)	

# Label the data

A dataframe `df` is available having columns `endword: string`, `features: vector`, and `outvec: vector`. You are to select the rows where `endword` equals "him", and add a column `label` having the integer value 1. Then, use the union operation to add an equal number of rows where `endword` is not equal to "him", such that these additional rows have `label = 0`.

In [33]:
# # Import the lit function
# from pyspark.sql.functions import lit

# # Select the rows where endword is 'him' and label 1
# df_pos = df.where("endword = 'him'")\
#            .withColumn('label', lit(1))

# # Select the rows where endword is not 'him' and label 0
# df_neg = df.where("endword <> 'him'")\
#            .withColumn('label', lit(0))

# # Union pos and neg in equal number
# df_examples = df_pos.union(df_neg.limit(df_pos.count()))
# print("Number of examples: ", df_examples.count())
# df_examples.where("endword <> 'him'").sample(False, .1, 42).show(5)

# Split the data

A dataframe `df_examples` is available having columns `endword: string`, `features: vector`, `outvec: vector`, and `label: int`. You're going to split it to obtain training and testing sets, which you will use to train and test a classifier.

In [34]:
# # Split the examples into train and test, use 80/20 split
# df_trainset, df_testset = df_examples.randomSplit((.8,.2), 42)

# # Print the number of training examples
# print("Number training: ", df_trainset.count())

# # Print the number of test examples
# print("Number test: ", df_testset.count())

# Train the classifier

The dataframe df_trainset you created in the previous exercise is available. You're now going to use it to train a Logistic Regression Classifier.

In [35]:
# # Import the logistic regression classifier
# from pyspark.ml.classification import LogisticRegression

# # Instantiate logistic setting elasticnet to 0.0
# logistic = LogisticRegression(maxIter=100, regParam=0.4, elasticNetParam=0.0)

# # Train the logistic classifier on the trainset
# df_fitted = logistic.fit(df_trainset)

# # Print the number of training iterations
# print("Training iterations: ", df_fitted.summary.totalIterations)

# Evaluate the classifier

A trained logistic regression model df_fitted is available. A dataframe df_testset is available containing test data for this model.

In [36]:
# # Score the model on test data
# testSummary = df_fitted.evaluate(df_testset)

# # Print the AUC metric
# print("\ntest AUC: %.3f" % testSummary.areaUnderROC)

# Predict test data

A fitted logistic model df_fitted is available. A dataframe df_testset is available containing test data for this model. A variable fields is available, containing the list ['prediction', 'label', 'endword', 'doc', 'probability']; this is used to specify which prediction fields to print.

In [37]:
# fields = ['prediction', 'label', 'endword', 'doc', 'probability']

# # Apply the model to the test data
# predictions = df_fitted.transform(df_testset).select(fields)

# # Print incorrect if prediction does not match label
# for x in predictions.take(8):
#     print()
#     if x.label != int(x.prediction):
#         print("INCORRECT ==> ")
#     for y in fields:
#         print(y,":", x[y])