In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Row
from pyspark.sql.window import Window


# Build the Spark configuration for a single-JVM local run.
# The memory limits are placed on the SparkConf itself instead of being set
# through SparkContext.setSystemProperty: that call starts the JVM before
# setting the property, and a Java system property set afterwards cannot
# resize a heap that is already running. Values on the SparkConf are part of
# the configuration handed over when the SparkContext below is created.
conf = pyspark.SparkConf()
conf.setMaster("local").setAppName("test")
conf.set("spark.local.dir", "tmp/spark")  # scratch directory for Spark's local files
conf.set("spark.driver.memory", "16g")
conf.set("spark.executor.memory", "16g")

sc = pyspark.SparkContext(conf=conf)

# SQLContext provides the DataFrame reader (`spark.read`) used by the cells below.
spark = SQLContext(sc)

In [2]:
# Load the tokenizer for the 'bert-base-multilingual-cased' checkpoint; it is
# used to turn token ids back into text.
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

@F.udf("String")
def decode_tokens(tokens):
  """Decode one row's sequence of BERT token ids into a text string.

  Args:
    tokens: the token ids of a single row (the `text_tokens` array column),
      or None when the column is null for that row.

  Returns:
    The decoded text, or None for a null input so that one missing value
    does not raise inside the UDF and fail the whole Spark job.
  """
  if tokens is None:
    return None
  return tokenizer.decode(tokens)

In [3]:
# Schema of the raw twitter engagement records.
# Each entry is (column name, Spark type class), listed in the order the
# columns are declared to the CSV reader; every column is nullable.
_column_specs = [
    ("text_tokens", T.StringType),
    ("hashtags", T.StringType),
    ("tweet_id", T.StringType),
    ("present_media", T.StringType),
    ("present_links", T.StringType),
    ("present_domains", T.StringType),
    ("tweet_type", T.StringType),
    ("language", T.StringType),
    ("timestamp", T.LongType),
    ("engager_user_id", T.StringType),
    ("engager_follower_count", T.LongType),
    ("engager_following_count", T.LongType),
    ("engager_is_verified", T.BooleanType),
    ("engager_account_creation_time", T.LongType),
    ("engagee_user_id", T.StringType),
    ("engagee_follower_count", T.LongType),
    ("engagee_following_count", T.LongType),
    ("engagee_is_verified", T.BooleanType),
    ("engagee_account_creation_time", T.LongType),
    ("engagee_follows_engager", T.BooleanType),
    ("reply_engagement_timestamp", T.LongType),
    ("retweet_engagement_timestamp", T.LongType),
    ("retweet_with_comment_engagement_timestamp", T.LongType),
    ("like_engagement_timestamp", T.LongType),
]

# One nullable StructField per column.
fields = [T.StructField(name, dtype(), True) for name, dtype in _column_specs]

# Final schema object passed to the reader.
schema = T.StructType(fields)


In [4]:
# Glob matching every part file of the test split.
test_file_path = "./test/part-*"

# Parse the records (fields separated by chr(1)) using the explicit schema.
test_df = spark.read.csv(test_file_path, schema=schema, sep=chr(1))

# These columns arrive as tab-separated strings; split each into an array column.
for list_column in ("text_tokens", "hashtags", "present_media", "present_links", "present_domains"):
    test_df = test_df.withColumn(list_column, F.split(list_column, "\t"))

In [6]:
# Number of rows in the parsed test set (a Spark action: runs the read).
test_df.count()

13949262

In [6]:
# Persist the parsed test set as Parquet so later cells can reload it.
# "overwrite" keeps the cell re-runnable: with the default save mode the write
# raises as soon as the target directory already exists.
# Run this while `test_df` is still the CSV-backed frame; once `test_df` has
# been rebound to read from this same path, overwriting it would clobber its
# own input.
test_df_writing_parquet = "test_parquet_files/test_df.parquet"
test_df.write.mode("overwrite").parquet(test_df_writing_parquet)

In [7]:
# Rebind `test_df` to the Parquet copy of the test set.
read_test_parquet = "test_parquet_files/test_df.parquet"
test_df = spark.read.format("parquet").load(read_test_parquet)

In [8]:
# Per-user tables for the test set: one row per distinct user id, with the
# verified flag cast to an integer.
#
# NOTE(review): `year_engager` / `year_engagee` are filled with F.hour(...) of the
# account-creation timestamp, not the year. Left unchanged because the
# pre-trained feature pipelines loaded later presumably expect exactly this
# computation -- TODO confirm against the training notebook.
#
# Both tables are written to Parquet because the feature-assembly cell reads
# them back from these paths; without the writes a fresh run fails there.
# "overwrite" keeps the cell re-runnable (the default save mode raises when the
# target directory already exists).
test_engager_user_df = (
    test_df
    .dropDuplicates(["engager_user_id"])
    .select(
        "engager_user_id",
        "engager_follower_count",
        "engager_following_count",
        "engager_is_verified",
        "engager_account_creation_time",
    )
    .withColumn("engager_is_verified", F.col("engager_is_verified").cast(T.IntegerType()))
    .withColumn("year_engager", F.hour(F.to_timestamp("engager_account_creation_time")))
)
print(test_engager_user_df.count())
test_engager_user_df.write.mode("overwrite").parquet("test_parquet_files/test_engager_user_df.parquet")

test_engagee_user_df = (
    test_df
    .dropDuplicates(["engagee_user_id"])
    .select(
        "engagee_user_id",
        "engagee_follower_count",
        "engagee_following_count",
        "engagee_is_verified",
        "engagee_account_creation_time",
    )
    .withColumn("engagee_is_verified", F.col("engagee_is_verified").cast(T.IntegerType()))
    .withColumn("year_engagee", F.hour(F.to_timestamp("engagee_account_creation_time")))
)
print(test_engagee_user_df.count())
test_engagee_user_df.write.mode("overwrite").parquet("test_parquet_files/test_engagee_user_df.parquet")

3444256
5996896


In [5]:
# Build the per-tweet feature table for the test set.

def _top_daily_flag(tweets_df, array_column, flag_column):
    """Return one row per tweet_id carrying a 0/1 column named `flag_column`.

    Rows are de-duplicated on tweet_id and `array_column` is exploded. For each
    distinct exploded value, the difference between its largest and smallest
    `timestamp` is divided by 24*3600 (days, if timestamps are in seconds --
    TODO confirm). The flag is 1 when that quotient rounds to more than 0 and
    lies within 0.007 of its rounded value, else 0. A tweet's flag is the
    maximum over its exploded values.
    """
    per_item = Window.partitionBy("item")
    return (
        tweets_df
        .dropDuplicates(["tweet_id"])
        .select("tweet_id", "timestamp", F.explode(array_column).alias("item"))
        .withColumn("span", F.max("timestamp").over(per_item))
        .withColumn("span", F.col("span") - F.min("timestamp").over(per_item))
        .withColumn("span_days", F.col("span") / (24*3600))
        .withColumn("span_days_rounded", F.round("span_days"))
        .withColumn(
            flag_column,
            F.when(
                ((F.col("span_days_rounded") > 0)
                 & (F.abs(F.col("span_days_rounded") - F.col("span_days")) < 0.007)),
                1,
            ).otherwise(0),
        )
        .select("tweet_id", flag_column)
        .groupBy("tweet_id").agg(F.max(flag_column).alias(flag_column))
    )


def _attach_flag(tweets, flags, flag_column):
    """Left-join `flags` onto `tweets` by tweet_id; tweets without a match get 0."""
    return (
        tweets
        .join(flags, "tweet_id", "left_outer")
        .withColumn(flag_column, F.when(F.col(flag_column).isNull(), 0).otherwise(F.col(flag_column)))
    )


# 0/1 indicators: 0 when the source array column is null, 1 otherwise.
presence_flags = [
    F.when(F.col(source).isNull(), 0).otherwise(1).alias(flag)
    for source, flag in (
        ("hashtags", "has_hashtags"),
        ("present_media", "has_media"),
        ("present_links", "has_links"),
    )
]

test_tweets = (
    test_df
    .dropDuplicates(["tweet_id"])
    .select(
        "tweet_id",
        decode_tokens(F.col("text_tokens")).alias("text"),
        "tweet_type",
        "language",
        F.hour(F.to_timestamp("timestamp")).alias("hour_tweet"),
        F.size("text_tokens").alias("num_tokens"),
        *presence_flags
    )
)
print(test_tweets.count())

# Hashtag-based flag.
tweet_is_in_top_daily_hashtag = _top_daily_flag(test_df, "hashtags", "is_in_top_daily_hashtags")
test_tweets = _attach_flag(test_tweets, tweet_is_in_top_daily_hashtag, "is_in_top_daily_hashtags")

# Link-based flag.
tweet_is_in_top_daily_link = _top_daily_flag(test_df, "present_links", "is_in_top_daily_links")
test_tweets = _attach_flag(test_tweets, tweet_is_in_top_daily_link, "is_in_top_daily_links")

9883735


In [10]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import VectorAssembler

# Saved feature pipelines, loaded from disk.
create_tweet_features = PipelineModel.load("./models/create_tweets_features.model")
create_engager_user_features = PipelineModel.load("./models/create_engager_user_features.model")
create_engagee_user_features = PipelineModel.load("./models/create_engagee_user_features.model")

# Tweet-level feature vectors, keyed by tweet_id.
test_features = create_tweet_features.transform(test_tweets).select("tweet_id", "tweet_features")

# User-level feature vectors, computed from the persisted per-user tables.
engager_features = create_engager_user_features.transform(
    spark.read.parquet("test_parquet_files/test_engager_user_df.parquet")
).select("engager_user_id", "engager_features")

engagee_features = create_engagee_user_features.transform(
    spark.read.parquet("test_parquet_files/test_engagee_user_df.parquet")
).select("engagee_user_id", "engagee_features")

# Start from the (tweet, engager, engagee) id triples and attach each feature
# vector with an inner join on its key.
test_data = (
    test_df
    .select("tweet_id", "engager_user_id", "engagee_user_id")
    .join(test_features, "tweet_id")
    .join(engager_features, "engager_user_id")
    .join(engagee_features, "engagee_user_id")
)

# Assembler that concatenates the three vectors into `featuresAssembled`.
assemblerInputs = ["tweet_features", "engager_features", "engagee_features"]
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="featuresAssembled")



In [11]:
# Concatenate the three feature vectors into `featuresAssembled` and drop the
# per-source vector columns.
test_data = assembler.transform(test_data).drop(*assemblerInputs)
# "overwrite" lets a fresh Run All succeed when the output of a previous run is
# still on disk; the default save mode raises if the directory already exists.
test_data.write.mode("overwrite").parquet("test_parquet_files/test_dataset.parquet")

In [12]:
# Reload the assembled test set from its Parquet copy.
test_dataset = spark.read.format("parquet").load("test_parquet_files/test_dataset.parquet")

In [13]:
from pyspark.ml.classification import GBTClassificationModel
# Load the saved gradient-boosted-tree classifier for the "like" target from disk.
gbt_like_model =  GBTClassificationModel.load("models/gbt_like.model")

In [15]:
from pyspark.ml.classification import GBTClassificationModel

# Score the test set with the "like" model.
print("Making predictions for like...")
predictions = gbt_like_model.transform(test_dataset)

print("Predictions are being written in csv file...")
# Take element 1 of the `probability` vector (presumably the positive-class
# probability -- confirm against the label encoding); `.item()` unwraps it to a
# plain Python number so it can be returned as a DoubleType.
split1_udf = F.udf(lambda value: value[1].item(), T.DoubleType())
# coalesce(1) produces a single part file. "overwrite" keeps the cell
# re-runnable: the default save mode raises when the output directory exists.
predictions.select("tweet_id", "engagee_user_id", split1_udf("probability").alias("probability"))\
.coalesce(1).write.mode("overwrite").csv("test_predictions_files/like.csv")

Making predictions for like...
Predictions are being written in csv file...


In [3]:
# Read the "like" predictions back (no header, so columns are named _c0, _c1, ...)
# and preview the first 100 rows.
predictions_like = spark.read.format("csv").load("test_predictions_files/like.csv")
predictions_like.show(100)

+--------------------+--------------------+-------------------+
|                 _c0|                 _c1|                _c2|
+--------------------+--------------------+-------------------+
|59475C01C8B9F8964...|0001057408B8B24BD...| 0.5395706706062846|
|5F47BD12DA14AB9FF...|0001057408B8B24BD...|0.47599656700071924|
|550CE57D9CA9A8FC1...|0001057408B8B24BD...|0.22371996181139742|
|754F4A9754B2CD766...|0001057408B8B24BD...|  0.245434825399809|
|F7C6823EC8D48065C...|0001057408B8B24BD...|0.46358789569530734|
|A1651626282B4FEAC...|0001057408B8B24BD...|0.21622379629942456|
|7AB462331770CF368...|0001057408B8B24BD...| 0.5033063543520688|
|C424FF72D8A56F98A...|0001057408B8B24BD...| 0.5477760789227731|
|01EAD76DEEAF91C71...|0001057408B8B24BD...|0.49880333563121015|
|CF02126F8AD1EAE2E...|0001057408B8B24BD...| 0.5121878012911105|
|BB7D4F04D8B8A8341...|0001057408B8B24BD...| 0.5555063685291564|
|61A9B87B7A7F1C216...|0001057408B8B24BD...| 0.5708090673703047|
|A81CE4D7E3D82EC3A...|00093621FF2283CBB.

In [17]:
# Load the saved "reply" classifier and score the test set with it.
gbt_reply_model =  GBTClassificationModel.load("models/gbt_reply.model")
print("Making predictions for reply...")
predictions = gbt_reply_model.transform(test_dataset)

print("Predictions are being written in csv file...")
# Take element 1 of the `probability` vector (presumably the positive-class
# probability -- confirm against the label encoding); `.item()` unwraps it to a
# plain Python number so it can be returned as a DoubleType.
split1_udf = F.udf(lambda value: value[1].item(), T.DoubleType())
# coalesce(1) produces a single part file. "overwrite" keeps the cell
# re-runnable: the default save mode raises when the output directory exists.
predictions.select("tweet_id", "engagee_user_id", split1_udf("probability").alias("probability"))\
.coalesce(1).write.mode("overwrite").csv("test_predictions_files/reply.csv")

Making predictions for reply...
Predictions are being written in csv file...


In [4]:
# Read the "reply" predictions back (no header, so columns are named _c0, _c1, ...)
# and preview the first 100 rows.
predictions_reply = spark.read.format("csv").load("test_predictions_files/reply.csv")
predictions_reply.show(100)

+--------------------+--------------------+--------------------+
|                 _c0|                 _c1|                 _c2|
+--------------------+--------------------+--------------------+
|59475C01C8B9F8964...|0001057408B8B24BD...| 0.05796429788268742|
|5F47BD12DA14AB9FF...|0001057408B8B24BD...|0.055467096582995645|
|550CE57D9CA9A8FC1...|0001057408B8B24BD...| 0.04623972807742993|
|754F4A9754B2CD766...|0001057408B8B24BD...| 0.04645305766176244|
|F7C6823EC8D48065C...|0001057408B8B24BD...| 0.05469762505354692|
|A1651626282B4FEAC...|0001057408B8B24BD...| 0.04672436889057241|
|7AB462331770CF368...|0001057408B8B24BD...| 0.06284515585805484|
|C424FF72D8A56F98A...|0001057408B8B24BD...|0.058103311131081026|
|01EAD76DEEAF91C71...|0001057408B8B24BD...| 0.05734985959532435|
|CF02126F8AD1EAE2E...|0001057408B8B24BD...|0.054658887220938035|
|BB7D4F04D8B8A8341...|0001057408B8B24BD...| 0.05677679322769913|
|61A9B87B7A7F1C216...|0001057408B8B24BD...|0.059096363224597104|
|A81CE4D7E3D82EC3A...|000

In [19]:
# Load the saved "retweet" classifier and score the test set with it.
gbt_retweet_model =  GBTClassificationModel.load("models/gbt_retweet.model")
print("Making predictions for retweet...")
predictions = gbt_retweet_model.transform(test_dataset)

print("Predictions are being written in csv file...")
# Take element 1 of the `probability` vector (presumably the positive-class
# probability -- confirm against the label encoding); `.item()` unwraps it to a
# plain Python number so it can be returned as a DoubleType.
split1_udf = F.udf(lambda value: value[1].item(), T.DoubleType())
# coalesce(1) produces a single part file. "overwrite" keeps the cell
# re-runnable: the default save mode raises when the output directory exists.
predictions.select("tweet_id", "engagee_user_id", split1_udf("probability").alias("probability"))\
.coalesce(1).write.mode("overwrite").csv("test_predictions_files/retweet.csv")

Making predictions for retweet...
Predictions are being written in csv file...


In [5]:
# Read the "retweet" predictions back (no header, so columns are named _c0, _c1, ...)
# and preview the first 100 rows.
predictions_retweet = spark.read.format("csv").load("test_predictions_files/retweet.csv")
predictions_retweet.show(100)

+--------------------+--------------------+-------------------+
|                 _c0|                 _c1|                _c2|
+--------------------+--------------------+-------------------+
|59475C01C8B9F8964...|0001057408B8B24BD...|0.08111069005287586|
|5F47BD12DA14AB9FF...|0001057408B8B24BD...|0.11831037626733631|
|550CE57D9CA9A8FC1...|0001057408B8B24BD...|0.10017932886448844|
|754F4A9754B2CD766...|0001057408B8B24BD...|0.10992152549977063|
|F7C6823EC8D48065C...|0001057408B8B24BD...|0.10144575097656239|
|A1651626282B4FEAC...|0001057408B8B24BD...|0.08876257560354839|
|7AB462331770CF368...|0001057408B8B24BD...|  0.084271423585077|
|C424FF72D8A56F98A...|0001057408B8B24BD...|0.08952225273779368|
|01EAD76DEEAF91C71...|0001057408B8B24BD...|0.08970868267594356|
|CF02126F8AD1EAE2E...|0001057408B8B24BD...|0.14604565641748357|
|BB7D4F04D8B8A8341...|0001057408B8B24BD...|0.09136269431609212|
|61A9B87B7A7F1C216...|0001057408B8B24BD...|0.09404112783251162|
|A81CE4D7E3D82EC3A...|00093621FF2283CBB.

In [21]:
# Load the saved "retweet with comment" classifier and score the test set with it.
gbt_rtWithCmt_model =  GBTClassificationModel.load("models/gbt_rtWithCmt.model")
print("Making predictions for retweet with comment...")
predictions = gbt_rtWithCmt_model.transform(test_dataset)

print("Predictions are being written in csv file...")
# Take element 1 of the `probability` vector (presumably the positive-class
# probability -- confirm against the label encoding); `.item()` unwraps it to a
# plain Python number so it can be returned as a DoubleType.
split1_udf = F.udf(lambda value: value[1].item(), T.DoubleType())
# coalesce(1) produces a single part file. "overwrite" keeps the cell
# re-runnable: the default save mode raises when the output directory exists.
predictions.select("tweet_id", "engagee_user_id", split1_udf("probability").alias("probability"))\
.coalesce(1).write.mode("overwrite").csv("test_predictions_files/rtWithCmt.csv")

Making predictions for retweet with comment...
Predictions are being written in csv file...


In [6]:
# Read the "retweet with comment" predictions back (no header, so columns are
# named _c0, _c1, ...) and preview the first 100 rows.
predictions_rtWithCmt = spark.read.format("csv").load("test_predictions_files/rtWithCmt.csv")
predictions_rtWithCmt.show(100)

+--------------------+--------------------+--------------------+
|                 _c0|                 _c1|                 _c2|
+--------------------+--------------------+--------------------+
|59475C01C8B9F8964...|0001057408B8B24BD...| 0.04693636266159318|
|5F47BD12DA14AB9FF...|0001057408B8B24BD...|0.047482878986417454|
|550CE57D9CA9A8FC1...|0001057408B8B24BD...|0.046048264066535394|
|754F4A9754B2CD766...|0001057408B8B24BD...|0.047167649112566434|
|F7C6823EC8D48065C...|0001057408B8B24BD...|0.048171978141932636|
|A1651626282B4FEAC...|0001057408B8B24BD...| 0.04558320791594983|
|7AB462331770CF368...|0001057408B8B24BD...| 0.04733856461467778|
|C424FF72D8A56F98A...|0001057408B8B24BD...| 0.04723648172851658|
|01EAD76DEEAF91C71...|0001057408B8B24BD...| 0.04733856461467778|
|CF02126F8AD1EAE2E...|0001057408B8B24BD...| 0.04779838156536054|
|BB7D4F04D8B8A8341...|0001057408B8B24BD...|0.047727615378849175|
|61A9B87B7A7F1C216...|0001057408B8B24BD...| 0.04693636266159318|
|A81CE4D7E3D82EC3A...|000

In [7]:
# Use the reply predictions as the starting table for combining all predictions.
temp_df = predictions_reply

In [8]:
# Give the positional CSV columns of the reply table descriptive names.
for positional_name, descriptive_name in (
    ("_c0", "Tweet_id"),
    ("_c1", "User_id"),
    ("_c2", "prediction_reply"),
):
    temp_df = temp_df.withColumnRenamed(positional_name, descriptive_name)

In [9]:
# Inner-join the retweet predictions onto the reply table, matching rows on
# both the tweet id and the user id.
same_tweet = temp_df["Tweet_id"] == predictions_retweet["_c0"]
same_user = temp_df["User_id"] == predictions_retweet["_c1"]
cond = [same_tweet & same_user]
joined_df = temp_df.join(predictions_retweet, cond, "inner")

In [10]:
# Drop the duplicated key columns brought in by the join and name the score column.
joined_df = (
    joined_df
    .drop("_c0", "_c1")
    .withColumnRenamed("_c2", "prediction_retweet")
)

In [27]:
#joined_df.show(20)

In [11]:
# Inner-join the retweet-with-comment predictions on (tweet id, user id), then
# drop the duplicated key columns and name the score column.
same_tweet = joined_df["Tweet_id"] == predictions_rtWithCmt["_c0"]
same_user = joined_df["User_id"] == predictions_rtWithCmt["_c1"]
cond = [same_tweet & same_user]
joined_df = (
    joined_df
    .join(predictions_rtWithCmt, cond, "inner")
    .drop("_c0", "_c1")
    .withColumnRenamed("_c2", "prediction_quote")
)

In [74]:
#joined_df.show(20)

+--------------------+--------------------+--------------------+-------------------+--------------------+
|            Tweet_id|             User_id|    prediction_reply| prediction_retweet|    prediction_quote|
+--------------------+--------------------+--------------------+-------------------+--------------------+
|000129F2BF0B74A1E...|4339301AC5BFC3F37...| 0.04800772672496356|0.10749922788703603|0.047173736570948144|
|000191EA7911E5B43...|3523C34B140D508F0...| 0.04618672299705029|0.19187498406085923| 0.04735638672020226|
|0001D475BBCCE211B...|FE196989470A0CA72...| 0.04800772672496356|0.10706793820421268| 0.04735638672020226|
|00048BD1BE4209415...|A32EC08B0064A7EC1...| 0.04800772672496356|0.11181933688732715|0.047173736570948144|
|000515EFC361198B6...|5B87D45BE5300C4B6...| 0.07663721560136882|0.08934169359495447| 0.04772822858762549|
|00089985F7A8EEDEB...|088E0EF86D4D73AE3...| 0.06617818859379576|0.13512003209771428| 0.04671832016479838|
|0008B9FB982F50801...|2BABECE50A8C09E5A...| 0.

In [12]:
# Inner-join the like predictions on (tweet id, user id), then drop the
# duplicated key columns and name the score column.
same_tweet = joined_df["Tweet_id"] == predictions_like["_c0"]
same_user = joined_df["User_id"] == predictions_like["_c1"]
cond = [same_tweet & same_user]
joined_df = (
    joined_df
    .join(predictions_like, cond, "inner")
    .drop("_c0", "_c1")
    .withColumnRenamed("_c2", "prediction_fav")
)

In [13]:
# Preview the first 20 rows of the combined prediction table.
joined_df.show(20)

+--------------------+--------------------+--------------------+-------------------+--------------------+-------------------+
|            Tweet_id|             User_id|    prediction_reply| prediction_retweet|    prediction_quote|     prediction_fav|
+--------------------+--------------------+--------------------+-------------------+--------------------+-------------------+
|000428396565FF0FD...|96EC3459C793AD540...|  0.0876354779202565| 0.0724630243020643| 0.04729374436278311| 0.4900845901724279|
|00042B40304094BBA...|E95926324937C72ED...| 0.06284515585805484|0.08419566813899437| 0.04733010256247461| 0.5104820632819627|
|0005531EBD7B938A0...|BFC149E9339C059B6...| 0.07187680082067782|0.09567667761808751| 0.04681001576063082| 0.4810394547809559|
|00066DA903B3D1890...|285ADC2746EFE928F...| 0.08046115835356793|0.07675248000203094| 0.04700271728404104| 0.4884996557510254|
|000685FFA6E6BA272...|1A6DEF48D8FA45D9B...|0.056257534711669055| 0.0910144254667512|0.047779154315225836| 0.2573794806

In [37]:
# Preview the first 20 rows of the combined prediction table (repeat of the earlier preview).
joined_df.show(20)

+--------------------+--------------------+--------------------+-------------------+--------------------+-------------------+
|            Tweet_id|             User_id|    prediction_reply| prediction_retweet|    prediction_quote|     prediction_fav|
+--------------------+--------------------+--------------------+-------------------+--------------------+-------------------+
|000428396565FF0FD...|96EC3459C793AD540...|  0.0876354779202565| 0.0724630243020643| 0.04729374436278311| 0.4900845901724279|
|00042B40304094BBA...|E95926324937C72ED...| 0.06284515585805484|0.08419566813899437| 0.04733010256247461| 0.5104820632819627|
|0005531EBD7B938A0...|BFC149E9339C059B6...| 0.07187680082067782|0.09567667761808751| 0.04681001576063082| 0.4810394547809559|
|00066DA903B3D1890...|285ADC2746EFE928F...| 0.08046115835356793|0.07675248000203094| 0.04700271728404104| 0.4884996557510254|
|000685FFA6E6BA272...|1A6DEF48D8FA45D9B...|0.056257534711669055| 0.0910144254667512|0.047779154315225836| 0.2573794806

In [38]:
# Write the combined predictions as CSV (one part file per partition).
# "overwrite" keeps the cell re-runnable: the default save mode raises when the
# output directory already exists.
print("Final Predictions are being written in csv file...")
joined_df.write.mode("overwrite").csv("test_results.csv")

Final Predictions are being written in csv file...


In [3]:
# Read the combined predictions back from disk and preview the first 20 rows.
final_csv = spark.read.format("csv").load("test_results.csv")
final_csv.show(20)

+--------------------+--------------------+--------------------+-------------------+--------------------+-------------------+
|                 _c0|                 _c1|                 _c2|                _c3|                 _c4|                _c5|
+--------------------+--------------------+--------------------+-------------------+--------------------+-------------------+
|000243298D7BAC319...|A89CE1B865A6F6392...| 0.04645305766176244|0.11202179478489949| 0.04735638672020226|0.25432497491757833|
|00025F213DF4A875F...|D724BFAE8127B1A84...| 0.05816019898645741|0.11882749945144944|0.047402793595806014|0.48996733719125296|
|0002CFA949BB59468...|DC42EF2C70ECB645F...| 0.06182829866857009|0.08335639091388658| 0.04718004882701643|0.49904730942565245|
|0004648E7A56729A1...|A2213022B45858FD0...| 0.05848490623411706|0.08280405981451766|  0.0474457448179616| 0.5047309421412041|
|0006AE9DCE7FFE0A2...|3354DC2062B6F0FB2...| 0.05677679322769913|0.09133301298484142| 0.04761532336920227| 0.5598468301

In [82]:
# Read "final_predictions.csv" and preview the first 20 rows.
# NOTE(review): no cell visible in this notebook writes final_predictions.csv --
# presumably it was produced by an earlier run or another notebook; confirm
# before relying on this cell in a fresh run.
final_csv = spark.read.format("csv").load("final_predictions.csv")
final_csv.show(20)

+--------------------+--------------------+--------------------+-------------------+--------------------+-------------------+
|                 _c0|                 _c1|                 _c2|                _c3|                 _c4|                _c5|
+--------------------+--------------------+--------------------+-------------------+--------------------+-------------------+
|0002A30185D204089...|F023DD79369EBFA62...| 0.04621495333583381|0.11137461643942104|  0.0468412697154349|0.28004691260405146|
|0002AC744F56A42DB...|237A2A9737CD463E3...| 0.05972001725515075|0.09442716624353065| 0.04710385096395053| 0.5198177994020081|
|000370453E99CA997...|60F2A07CB900AA316...|  0.0464273056590937|0.10961205634752147| 0.04735638672020226|0.34370923922945584|
|0003E60DA8F1E79FA...|760D200824C1950E0...| 0.05714345557718059|0.08469361891012006| 0.04752055404377131| 0.2296259431384451|
|0004FEB5088C5E9AC...|CCC502A9FDC5978AF...| 0.05996951605419665| 0.0910144254667512|0.047211874458035985| 0.5195418139

In [15]:
# Write the combined predictions as a single CSV part file (coalesce(1)).
# "overwrite" keeps the cell re-runnable: the default save mode raises when the
# output directory already exists.
print("Final Predictions are being written in csv file...")
joined_df.coalesce(1).write.mode("overwrite").csv('result.csv')

Final Predictions are being written in csv file...
