In [1]:
import helper_daniel as Helper

In [9]:
import pyspark as ps
import json
import pandas as pd

# Local Spark session using 4 worker threads; getOrCreate() reuses an
# already-running session instead of starting a second one.
spark = ps.sql.SparkSession.builder.master("local[4]").appName('df case study').getOrCreate()

# Bind the live SparkContext owned by the session. The previous assignment
# (`ps.SparkContext`) bound the class itself, so instance calls such as
# sc.parallelize(...) would have failed.
sc = spark.sparkContext

# Fixed seed so the 2% sample is the same on every Restart & Run All.
SAMPLE_SEED = 42

# Work on a 2% sample (without replacement) of the raw tweets.
tweets_df = spark.read.json('./data/french_tweets.json').sample(False, 0.02, seed=SAMPLE_SEED)

In [10]:
def CleanUp(rdd, columns):
    """Select ``columns`` from ``rdd`` and flatten a fixed set of nested fields.

    Despite the parameter name, ``rdd`` is used like a DataFrame: it is
    indexed with a list of column names and with dotted paths into nested
    fields, and the result supports ``withColumn`` / ``drop``.

    Args:
        rdd: Source frame holding the nested ``user``, ``place`` and
            ``entities`` fields.
        columns: Names of the columns to keep from ``rdd``.

    Returns:
        The selected columns followed by ``user_id``, ``user_screen_name``,
        ``user_name``, ``place_type``, ``place_coordinates``, ``hashtags``
        and ``mentions_id``, with ``user``, ``place`` and ``entities``
        dropped.
    """
    # (new column name, dotted path in the source frame), listed in the
    # order the columns are appended.
    flattened_fields = (
        ('user_id', 'user.id'),
        ('user_screen_name', 'user.screen_name'),
        ('user_name', 'user.name'),
        ('place_type', 'place.place_type'),
        ('place_coordinates', 'place.bounding_box.coordinates'),
        ('hashtags', 'entities.hashtags.text'),
        ('mentions_id', 'entities.user_mentions.id'),
    )
    # Nested source columns that are removed once their fields are extracted.
    nested_sources = ('user', 'place', 'entities')

    frame = rdd[columns]
    for new_name, source_path in flattened_fields:
        frame = frame.withColumn(new_name, rdd[source_path])
    for nested_name in nested_sources:
        frame = frame.drop(nested_name)
    return frame


# Keep the listed top-level tweet fields. 'entities', 'place' and 'user' must
# be included because CleanUp reads nested fields from them before dropping them.
cleaned = CleanUp(
    rdd=tweets_df,
    columns=[
        'entities',
        'filter_level',
        'id',
        'in_reply_to_user_id',
        'lang',
        'place',
        'possibly_sensitive',
        'text',
        'timestamp_ms',
        'user',
    ],
)


In [11]:

# NOTE(review): this cell repeats, argument for argument, the CleanUp call that
# already ends the cell defining CleanUp; one of the two can be removed.
cleaned = CleanUp(
    tweets_df,
    [
        'entities', 'filter_level', 'id', 'in_reply_to_user_id', 'lang',
        'place', 'possibly_sensitive', 'text', 'timestamp_ms', 'user',
    ],
)



In [12]:
# Register the flattened tweets as the temporary view "dataset" so they can be
# queried through spark.sql.
cleaned.createOrReplaceTempView(name="dataset")

In [13]:
# Print a single row with full, untruncated cell contents.
cleaned.show(n=1, truncate=False)

+------------+------------------+-------------------+----+------------------+-------------------------------------------------------------------------------------------------------------------------------------------+-------------+---------+----------------+---------+----------+--------------------------------------------------------------------------------------------+--------+-----------------------------------------------------+
|filter_level|id                |in_reply_to_user_id|lang|possibly_sensitive|text                                                                                                                                       |timestamp_ms |user_id  |user_screen_name|user_name|place_type|place_coordinates                                                                           |hashtags|mentions_id                                          |
+------------+------------------+-------------------+----+------------------+---------------------------------------------------

In [14]:
# Print the column names, types and nullability of the flattened frame.
cleaned.printSchema()

root
 |-- filter_level: string (nullable = true)
 |-- id: long (nullable = true)
 |-- in_reply_to_user_id: long (nullable = true)
 |-- lang: string (nullable = true)
 |-- possibly_sensitive: boolean (nullable = true)
 |-- text: string (nullable = true)
 |-- timestamp_ms: string (nullable = true)
 |-- user_id: long (nullable = true)
 |-- user_screen_name: string (nullable = true)
 |-- user_name: string (nullable = true)
 |-- place_type: string (nullable = true)
 |-- place_coordinates: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: array (containsNull = true)
 |    |    |    |-- element: double (containsNull = true)
 |-- hashtags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- mentions_id: array (nullable = true)
 |    |-- element: long (containsNull = true)



In [16]:
import pyspark.sql.functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

In [19]:
# User ids looked up in a tweet's mention list (presumably the two
# candidates' Twitter accounts; confirm against the data).
le_pen_id = 217749896
macron_id = 1976143068

# Codes returned by GetSupport for a tweet that mentions exactly one of the ids.
le_pen_support = -1
macron_support = 1

def GetSupport(col):
    """Map a tweet's mentioned user ids to a support code.

    Args:
        col: Sequence of mentioned user ids, or None when the tweet carries
            no mention list (the source column is a nullable array).

    Returns:
        ``le_pen_support`` if only ``le_pen_id`` is mentioned,
        ``macron_support`` if only ``macron_id`` is mentioned, and 0 when
        both, neither, or no mentions at all are present.
    """
    # A missing or empty mention list is neutral. Without this guard the
    # membership tests below raise TypeError on None inside the UDF.
    if not col:
        return 0

    mentions_le_pen = le_pen_id in col
    mentions_macron = macron_id in col

    # Mentioning both ids or neither is treated as neutral.
    if mentions_le_pen == mentions_macron:
        return 0
    return le_pen_support if mentions_le_pen else macron_support

# Wrap GetSupport as a Spark UDF that yields an integer column.
udf_func = udf(GetSupport, returnType=IntegerType())

# Add the per-tweet support code computed from the mentioned user ids.
cleaned = cleaned.withColumn(
    'supports',
    udf_func(F.col('mentions_id')),
)

# NOTE: despite its name, `no_tags` keeps only the rows whose hashtag count is
# greater than zero, i.e. the tweets that DO carry hashtags.
no_tags = (
    cleaned
    .withColumn('amount_hashtags', F.size(F.col('hashtags')))
    .filter(F.col('amount_hashtags') > 0)
)

In [20]:
# Preview the hashtag lists of the first 10 rows.
no_tags.select('hashtags').show(n=10)

+--------------------+
|            hashtags|
+--------------------+
|   [EcoleTwittosEMH]|
|            [Macron]|
|[LePen, Doyens, F...|
|  [LePen, Photoshop]|
|            [macron]|
|    [winitwednesday]|
|    [jecoutenrjlyon]|
|    [jecoutenrjlyon]|
|         [whirlpool]|
|        [Luxembourg]|
+--------------------+
only showing top 10 rows

