In [1]:
import pyspark
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Row
from pyspark.sql.window import Window


from pyspark import SparkContext

# Register the executor-memory request as a JVM system property before the
# context is created.
SparkContext.setSystemProperty("spark.executor.memory", "16g")

# Local master with the application name "App Name"; the SQLContext built on
# top of it is what the rest of the notebook uses as `spark`.
sc = SparkContext(master="local", appName="App Name")
spark = SQLContext(sc)

In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Row
from pyspark.sql.window import Window


# Build the complete configuration *before* anything starts the JVM.
#
# The memory settings used to be applied with SparkContext.setSystemProperty().
# That call launches the Py4J gateway JVM first and only then sets the
# property, so in local mode (where the driver JVM does all the work) the heap
# size was already fixed and the 16g request had no effect on it. Putting the
# values in the SparkConf lets PySpark pass them to the JVM at launch.
#
# NOTE(review): "tmp/spark" is relative to the kernel's working directory --
# confirm that this is the intended scratch location.
# NOTE: only one SparkContext can be active per Python process, so this cell
# must run in a kernel where no other cell has already created one.
conf = (
    pyspark.SparkConf()
    .setMaster("local")
    .setAppName("test")
    .set("spark.local.dir", "tmp/spark")
    .set("spark.driver.memory", "16g")
    .set("spark.executor.memory", "16g")
)
sc = pyspark.SparkContext(conf=conf)

# DataFrame entry point used by every later cell.
spark = SQLContext(sc)

In [2]:
from transformers import BertTokenizer
# Cased multilingual BERT vocabulary; lower-casing is disabled to match it.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

@F.udf("String")
def decode_tokens(tokens):
  """Spark UDF that turns a tokenised value back into a string.

  Args:
    tokens: value passed straight to ``tokenizer.decode`` -- presumably a
      sequence of integer token ids (TODO confirm against the column type).
      Spark hands SQL NULL to Python UDFs as ``None``.

  Returns:
    The decoded string, or ``None`` (SQL NULL) when the input is NULL.
  """
  # A NULL cell would otherwise raise inside tokenizer.decode and fail the
  # whole Spark task; propagate NULL instead.
  if tokens is None:
    return None
  return tokenizer.decode(tokens)

In [3]:
# Location of the training split, relative to the kernel's working directory.
read_training_parquet = "parquet_files/training_df.parquet"
# Load the training split as a Spark DataFrame.
training_df = spark.read.parquet(read_training_parquet)

In [4]:
# Prepare the training dataset: one profile row per engaging user.
# dropDuplicates keeps a single row per engager_user_id; which row survives is
# not controlled here.
training_engager_user_df = (
    training_df
    .dropDuplicates(["engager_user_id"])
    .select(
        "engager_user_id",
        "engager_follower_count",
        "engager_following_count",
        "engager_is_verified",
        "engager_account_creation_time",
    )
    # Cast the verified flag to an integer column.
    .withColumn("engager_is_verified", F.col("engager_is_verified").cast(T.IntegerType()))
    # Fix: this column is named year_engager but was derived with F.hour,
    # which yields the hour of day (0-23). Use F.year so the value matches the
    # name. Every split that is fed to a pipeline fitted on this frame must
    # derive the column the same way.
    .withColumn("year_engager", F.year(F.to_timestamp("engager_account_creation_time")))
)

In [5]:
# Number of de-duplicated engager rows (count() is an action and runs the query).
training_engager_user_df.count()

24064181

In [5]:
# Persist the engager profile table.
training_engager_user_df_parquet = "parquet_files/training_engager_user_df.parquet"
# The default save mode raises if the target already exists, which makes this
# cell fail on every re-run of the notebook. The table is fully derived from
# training_df, so replacing a previous copy is safe.
training_engager_user_df.write.parquet(training_engager_user_df_parquet, mode="overwrite")

In [6]:
# One profile row per engaged-with user (engagee) from the training split.
# dropDuplicates keeps a single row per engagee_user_id; which row survives is
# not controlled here.
training_engagee_user_df = (
    training_df
    .dropDuplicates(["engagee_user_id"])
    .select(
        "engagee_user_id",
        "engagee_follower_count",
        "engagee_following_count",
        "engagee_is_verified",
        "engagee_account_creation_time",
    )
    # Cast the verified flag to an integer column.
    .withColumn("engagee_is_verified", F.col("engagee_is_verified").cast(T.IntegerType()))
    # Fix: this column is named year_engagee but was derived with F.hour,
    # which yields the hour of day (0-23). Use F.year so the value matches the
    # name. Every split that is fed to a pipeline fitted on this frame must
    # derive the column the same way.
    .withColumn("year_engagee", F.year(F.to_timestamp("engagee_account_creation_time")))
)

In [7]:
# Number of de-duplicated engagee rows (count() is an action and runs the query).
training_engagee_user_df.count()

34287431

In [7]:
# Persist the engagee profile table.
training_engagee_user_df_parquet = "parquet_files/training_engagee_user_df.parquet"
# The default save mode raises if the target already exists, which makes this
# cell fail on every re-run of the notebook. The table is fully derived from
# training_df, so replacing a previous copy is safe.
training_engagee_user_df.write.parquet(training_engagee_user_df_parquet, mode="overwrite")

In [8]:
from pyspark.ml.feature import OneHotEncoder,VectorAssembler, StringIndexer, FeatureHasher, QuantileDiscretizer
from pyspark.ml import Pipeline

In [9]:
# Engager feature pipeline.
#
# Stage order: for each numeric column a QuantileDiscretizer followed by a
# one-hot encoder, then for each categorical column a StringIndexer followed
# by a one-hot encoder, and finally one FeatureHasher.
# Note: the hasher reads only the *Bucket / *Index columns; the *classVec
# one-hot outputs are produced but are not inputs to "engager_features".
numericalColumns = ["engager_follower_count", "engager_following_count"]
categoricalColumns = ["year_engager", "engager_is_verified"]

stages = []

# Numeric counts -> 50 quantile buckets (invalid values kept in an extra bucket).
for numericalCol in numericalColumns:
    qd = QuantileDiscretizer(
        numBuckets=50,
        handleInvalid="keep",
        inputCol=numericalCol,
        outputCol=f"{numericalCol}Bucket",
    )
    encoder = OneHotEncoder(
        inputCols=[qd.getOutputCol()],
        outputCols=[f"{numericalCol}classVec"],
    )
    stages.extend([qd, encoder])

# Categorical values -> label indices.
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(
        inputCol=categoricalCol,
        outputCol=f"{categoricalCol}Index",
    )
    encoder = OneHotEncoder(
        inputCols=[stringIndexer.getOutputCol()],
        outputCols=[f"{categoricalCol}classVec"],
    )
    stages.extend([stringIndexer, encoder])

# Bucket ids first, then label indices; all are hashed as categorical values
# into a 16-dimensional vector.
featInputs = [f"{c}Bucket" for c in numericalColumns] + [f"{c}Index" for c in categoricalColumns]
stages.append(
    FeatureHasher(
        numFeatures=16,
        inputCols=featInputs,
        outputCol="engager_features",
        categoricalCols=featInputs,
    )
)

# Fit every stage on the de-duplicated training engagers.
engager_user_features = Pipeline(stages=stages).fit(training_engager_user_df)
#engager_user_features.transform(training_engager_user_df).select("engager_user_id", "engager_features").show()
    

In [10]:
# Engagee feature pipeline.
#
# Stage order: for each numeric column a QuantileDiscretizer followed by a
# one-hot encoder, then for each categorical column a StringIndexer followed
# by a one-hot encoder, and finally one FeatureHasher.
# Note: the hasher reads only the *Bucket / *Index columns; the *classVec
# one-hot outputs are produced but are not inputs to "engagee_features".
numericalColumns = ["engagee_follower_count", "engagee_following_count"]
categoricalColumns = ["year_engagee", "engagee_is_verified"]

stages = []

# Numeric counts -> 50 quantile buckets (invalid values kept in an extra bucket).
for numericalCol in numericalColumns:
    qd = QuantileDiscretizer(
        numBuckets=50,
        handleInvalid="keep",
        inputCol=numericalCol,
        outputCol=f"{numericalCol}Bucket",
    )
    encoder = OneHotEncoder(
        inputCols=[qd.getOutputCol()],
        outputCols=[f"{numericalCol}classVec"],
    )
    stages.extend([qd, encoder])

# Categorical values -> label indices.
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(
        inputCol=categoricalCol,
        outputCol=f"{categoricalCol}Index",
    )
    encoder = OneHotEncoder(
        inputCols=[stringIndexer.getOutputCol()],
        outputCols=[f"{categoricalCol}classVec"],
    )
    stages.extend([stringIndexer, encoder])

# Bucket ids first, then label indices; all are hashed as categorical values
# into a 16-dimensional vector.
featInputs = [f"{c}Bucket" for c in numericalColumns] + [f"{c}Index" for c in categoricalColumns]
stages.append(
    FeatureHasher(
        numFeatures=16,
        inputCols=featInputs,
        outputCol="engagee_features",
        categoricalCols=featInputs,
    )
)

# Fit every stage on the de-duplicated training engagees.
engagee_user_features = Pipeline(stages=stages).fit(training_engagee_user_df)

    

In [13]:
# Preview the fitted engagee pipeline applied to the training engagees
# (input columns plus every intermediate and final feature column).
engagee_user_features.transform(training_engagee_user_df).show()

+--------------------+----------------------+-----------------------+-------------------+-----------------------------+------------+----------------------------+------------------------------+-----------------------------+-------------------------------+-----------------+--------------------+------------------------+---------------------------+--------------------+
|     engagee_user_id|engagee_follower_count|engagee_following_count|engagee_is_verified|engagee_account_creation_time|year_engagee|engagee_follower_countBucket|engagee_follower_countclassVec|engagee_following_countBucket|engagee_following_countclassVec|year_engageeIndex|year_engageeclassVec|engagee_is_verifiedIndex|engagee_is_verifiedclassVec|    engagee_features|
+--------------------+----------------------+-----------------------+-------------------+-----------------------------+------------+----------------------------+------------------------------+-----------------------------+-------------------------------+----------

In [11]:
# Save the fitted feature pipelines to disk.
# A bare .save() raises if the target directory already exists, so re-running
# the notebook after refitting failed here. write().overwrite() replaces the
# previously saved model instead.
engager_user_features.write().overwrite().save("models/create_engager_user_features.model")
engagee_user_features.write().overwrite().save("models/create_engagee_user_features.model")

In [12]:
# Preview the fitted engagee pipeline applied to the training engagees
# (input columns plus every intermediate and final feature column).
engagee_user_features.transform(training_engagee_user_df).show()

+--------------------+----------------------+-----------------------+-------------------+-----------------------------+------------+----------------------------+------------------------------+-----------------------------+-------------------------------+-----------------+--------------------+------------------------+---------------------------+--------------------+
|     engagee_user_id|engagee_follower_count|engagee_following_count|engagee_is_verified|engagee_account_creation_time|year_engagee|engagee_follower_countBucket|engagee_follower_countclassVec|engagee_following_countBucket|engagee_following_countclassVec|year_engageeIndex|year_engageeclassVec|engagee_is_verifiedIndex|engagee_is_verifiedclassVec|    engagee_features|
+--------------------+----------------------+-----------------------+-------------------+-----------------------------+------------+----------------------------+------------------------------+-----------------------------+-------------------------------+----------

In [8]:
# Read the validation split (path is relative to the kernel's working directory).
read_validation_parquet = "parquet_files/validation_df.parquet"
validation_df =spark.read.parquet(read_validation_parquet)

In [9]:
# One profile row per engaging user in the validation split.
validation_engager_user_df = (
    validation_df
    .dropDuplicates(["engager_user_id"])
    .select(
        "engager_user_id",
        "engager_follower_count",
        "engager_following_count",
        "engager_is_verified",
        "engager_account_creation_time",
    )
    .withColumn("engager_is_verified", F.col("engager_is_verified").cast(T.IntegerType()))
    # Fix: year_engager was derived with F.hour (hour of day, 0-23); use
    # F.year so the value matches the column name. This derivation must match
    # the one used for the frame the engager pipeline was fitted on.
    .withColumn("year_engager", F.year(F.to_timestamp("engager_account_creation_time")))
)
print(validation_engager_user_df.count())
#validation_engager_user_df.write.parquet("/media/user2/2.0 TB Hard Disk/parquet_files/validation_engager_user_df.parquet")

# One profile row per engaged-with user in the validation split.
validation_engagee_user_df = (
    validation_df
    .dropDuplicates(["engagee_user_id"])
    .select(
        "engagee_user_id",
        "engagee_follower_count",
        "engagee_following_count",
        "engagee_is_verified",
        "engagee_account_creation_time",
    )
    .withColumn("engagee_is_verified", F.col("engagee_is_verified").cast(T.IntegerType()))
    # Fix: year_engagee was derived with F.hour (hour of day, 0-23); use
    # F.year so the value matches the column name. This derivation must match
    # the one used for the frame the engagee pipeline was fitted on.
    .withColumn("year_engagee", F.year(F.to_timestamp("engagee_account_creation_time")))
)
print(validation_engagee_user_df.count())

#validation_engagee_user_df.write.parquet("/media/user2/2.0 TB Hard Disk/parquet_files/validation_engagee_user_df.parquet")

3527270
6101247
