In [None]:
from tweepy.streaming import StreamListener
from tweepy import Stream
from tweepy import OAuthHandler

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql import functions as F

import json
import time

class Tweet():
    """
    Flat record holding the fields extracted from one Twitter API status JSON.

    Every attribute starts as ``None`` and is filled by :meth:`insert`.
    :meth:`get_list` returns the values in declaration order, which is the
    order a row-oriented consumer should expect.
    """
    tweet_id: str = None
    created_at: str = None
    text: str = None
    hashtags: str = None
    retweet_count: int = None
    possibly_sensitive: bool = None
    lang: str = None
    user_id: str = None
    user_name: str = None
    user_description: str = None
    user_verification: bool = None
    user_followers_count: int = None
    user_friends_count: int = None
    user_created_at: str = None
    user_location: str = None

    def get_list(self):
        """
        Return the current attribute values as a list, in declaration order.
        """
        return [
            self.tweet_id,
            self.created_at,
            self.text,
            self.hashtags,
            self.retweet_count,
            self.possibly_sensitive,
            self.lang,
            self.user_id,
            self.user_name,
            self.user_description,
            self.user_verification,
            self.user_followers_count,
            self.user_friends_count,
            self.user_created_at,
            self.user_location,
        ]

    def insert(self, data):
        """
        Populate the attributes from a decoded status payload.

        ``data`` is the dict obtained from the API JSON. When it contains an
        ``"extended_tweet"`` entry, the full text and the hashtags are taken
        from there; otherwise the top-level ``"text"`` and ``"entities"`` are
        used. ``possibly_sensitive`` is optional and defaults to ``None``.

        Hashtags are stored as a single string, joined with ``", "`` in the
        order they appear in the payload (empty string when there are none).

        Raises:
            KeyError: if a mandatory key is missing from ``data``.
        """
        is_extended = "extended_tweet" in data
        # Extended payloads carry their own entities next to the full text.
        entities_owner = data["extended_tweet"] if is_extended else data
        user = data["user"]

        self.tweet_id = data["id"]
        self.created_at = data["created_at"]
        self.text = data["extended_tweet"]["full_text"] if is_extended else data["text"]
        self.hashtags = ", ".join(
            tag["text"] for tag in entities_owner["entities"]["hashtags"]
        )
        self.retweet_count = data["retweet_count"]
        self.possibly_sensitive = data.get("possibly_sensitive")
        self.lang = data["lang"]
        self.user_id = user["id"]
        self.user_name = user["name"]
        self.user_description = user["description"]
        self.user_verification = user["verified"]
        self.user_followers_count = user["followers_count"]
        self.user_friends_count = user["friends_count"]
        self.user_created_at = user["created_at"]
        self.user_location = user["location"]

class TwitterListener(StreamListener):
    """
    Twitter stream listener that buffers tweets and publishes them as a temp view.

    Each received status is converted to a row (see ``Tweet``) and kept in
    ``listTweets``. Once more than ``persist_time`` seconds have elapsed since
    the listener was created, the buffered rows are turned into a Spark
    DataFrame registered as the temp view ``tweets`` and the stream is asked
    to stop by returning ``False`` from :meth:`on_data`.

    The DataFrame is built through the module-level ``sqlContext``, which must
    exist before the stream delivers data.
    """
    def __init__(self, persist_time):
        """
        Args:
            persist_time: seconds to collect tweets before publishing the view.
        """
        # Let the base listener initialise its own attributes as well.
        super().__init__()
        self.persist_time = persist_time
        self.start = time.time()
        self.listTweets = []
        # Column order matches the list returned by Tweet.get_list().
        self.schema = StructType([StructField("tweet_id", StringType(), True),
                                  StructField("created_at", StringType(), True),
                                  StructField("text", StringType(), True),
                                  StructField("hashtags", StringType(), True),
                                  StructField("retweet_count", IntegerType(), True),
                                  StructField("possibly_sensitive", BooleanType(), True),
                                  StructField("lang", StringType(), True),
                                  StructField("user_id", StringType(), True),
                                  StructField("user_name", StringType(), True),
                                  StructField("user_description", StringType(), True),
                                  StructField("user_verification", BooleanType(), True),
                                  StructField("user_followers_count", IntegerType(), True),
                                  StructField("user_friends_count", IntegerType(), True),
                                  StructField("user_created_at", StringType(), True),
                                  StructField("user_location", StringType(), True),])

    def _persist(self):
        """
        Publish the buffered rows as the ``tweets`` temp view.

        Adds an ``etl_load`` timestamp plus year/month/day/hour partition
        columns derived from it. Failures are reported on stdout and not
        re-raised, so the caller still stops the stream.
        """
        try:
            df = sqlContext.createDataFrame(data=self.listTweets, schema=self.schema)
            df = df.withColumn("etl_load", F.current_timestamp())
            df = df.withColumn("etl_load_partition_year", F.date_format("etl_load", "yyyy"))
            df = df.withColumn("etl_load_partition_month", F.date_format("etl_load", "MM"))
            df = df.withColumn("etl_load_partition_day", F.date_format("etl_load", "dd"))
            df = df.withColumn("etl_load_partition_hour", F.date_format("etl_load", "HH"))
            df.createOrReplaceTempView("tweets")
        except Exception as e:
            print("Erro ao contruir 'df': " + str(e))

    def on_data(self, data):
        """
        Handle one raw message from the stream.

        Messages containing a ``"limit"`` key are ignored. Any other message is
        parsed into a row and buffered; after ``persist_time`` seconds the
        buffer is published and ``False`` is returned to stop the stream.

        Args:
            data: raw JSON text delivered by the stream.

        Returns:
            ``False`` once the buffer has been published, ``True`` otherwise
            (including when the message could not be processed).
        """
        # Bound up front so the error report below works even if parsing fails.
        json_data = None
        try:
            json_data = json.loads(data)
            if "limit" not in json_data:
                tweet = Tweet()
                tweet.insert(json_data)
                self.listTweets.append(tweet.get_list())

                elapsed = time.time() - self.start
                print(f"Tweets:{len(self.listTweets)}, Time:{elapsed}")
                if elapsed > self.persist_time:
                    self._persist()
                    return False
        except Exception as e:
            # Show the parsed payload when available, else the raw message.
            payload = json_data if json_data is not None else data
            print("Error: " + str(e), "JSON fora do esperado:", payload)
        return True

    def on_error(self, status_code):
        """
        Report an error status from the stream and keep it running.

        ``status_code`` is converted with ``str`` so a numeric code can be
        printed without raising.
        """
        print("Error: " + str(status_code))
        return True

    def on_timeout(self):
        """Report a stream timeout and keep the stream running."""
        print("Timeout!")
        return True

def authentication(path="D:/William/Computação/Data Science/keys/TwitterAPI/{0}"):
    """
    Load the Twitter credentials from local text files and build the auth handler.

    Args:
        path: format template for the credential files; ``{0}`` is replaced by
            each file name (``API_key.txt``, ``API_secret_key.txt``,
            ``access_token.txt``, ``access_token_secret.txt``). Defaults to the
            folder the script was written against.

    Returns:
        An ``OAuthHandler`` configured with the consumer key pair and the
        access token pair.

    Raises:
        OSError: if any credential file cannot be opened.
    """
    def _read_key(file_name):
        # Close the file deterministically and drop any surrounding whitespace
        # (e.g. a trailing newline) that would corrupt the credential.
        with open(file=path.format(file_name), mode="r", encoding="UTF-8") as key_file:
            return key_file.read().strip()

    api_key = _read_key("API_key.txt")
    api_secret_key = _read_key("API_secret_key.txt")
    access_token = _read_key("access_token.txt")
    access_token_secret = _read_key("access_token_secret.txt")

    auth = OAuthHandler(api_key, api_secret_key)
    auth.set_access_token(access_token, access_token_secret)
    return auth

def getData(keywords, languages=None, timeout=None, persist_time=60):
    """
    Authenticate, run the Twitter stream, and return the collected tweets.

    The stream runs synchronously until the listener stops it, which happens
    once ``persist_time`` seconds have elapsed and the ``tweets`` temp view
    has been published.

    Args:
        keywords: terms passed to the stream filter as ``track``.
        languages: optional language codes passed to the stream filter.
        timeout: timeout forwarded to the ``Stream`` constructor.
        persist_time: seconds the listener collects tweets before publishing.

    Returns:
        The result of ``select * from tweets`` run through the module-level
        ``sqlContext``, or ``None`` if streaming or the query failed (the
        error is printed).
    """
    print('authenticating...')
    auth = authentication()
    print('start twitter listener...')
    twitter_listener = TwitterListener(persist_time=persist_time)
    print('start twitter stream...')
    twitter_stream = Stream(auth=auth, listener=twitter_listener, timeout=timeout)
    try:
        print("getting data...")
        try:
            twitter_stream.filter(track=keywords, is_async=False, languages=languages)
        finally:
            # Always release the connection, even when filter() raises.
            twitter_stream.disconnect()
        return sqlContext.sql("select * from tweets")
    except Exception as e:
        # Exception (not BaseException) so Ctrl-C / kernel interrupt still
        # propagates and can break the caller's loop.
        print("Error: " + str(e))
        return None

if __name__ == "__main__":
    # Spark entry points. `sqlContext` is read as a module-level global by the
    # listener and by getData, so it must be created before streaming starts.
    sc = SparkContext.getOrCreate()
    spark = SparkSession.builder.getOrCreate()
    sqlContext = SQLContext(sc)

    # Streaming configuration.
    keywords = ["COVID"]
    languages = []
    timeout = 60  # does not work very well
    persist_time = 10
    path = "./test/tweets"

    stream_options = {
        "keywords": keywords,
        "languages": languages,
        "timeout": timeout,  # does not work very well
        "persist_time": persist_time,
    }

    # Collect one batch per iteration, indefinitely.
    while True:
        df = getData(**stream_options)

        print("saving parquet...")
        # The parquet write is currently disabled:
#         df.write\
#           .partitionBy("etl_load_partition_year",
#                        "etl_load_partition_month",
#                        "etl_load_partition_day",
#                        "etl_load_partition_hour")\
#           .format("parquet")\
#           .mode("append")\
#           .save(path)

In [45]:
# Read the parquet dataset at "./tweets" with the Spark session and convert it
# to a pandas DataFrame.
# NOTE(review): the streaming script sets path = "./test/tweets"; confirm which
# location is the intended one.
df = spark.read.parquet("./tweets").toPandas()

In [46]:
# Count of non-null values per column of the pandas DataFrame.
df.count()

tweet_id                    155
created_at                  155
text                        155
hashtags                    155
retweet_count               155
possibly_sensitive           33
lang                        155
user_id                     155
user_name                   155
user_description            133
user_verification           155
user_followers_count        155
user_friends_count          155
user_created_at             155
user_location               109
etl_load                    155
etl_load_partition_year     155
etl_load_partition_month    155
etl_load_partition_day      155
etl_load_partition_hour     155
dtype: int64

In [47]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,retweet_count,user_followers_count,user_friends_count,etl_load_partition_year,etl_load_partition_month,etl_load_partition_day,etl_load_partition_hour
count,155.0,155.0,155.0,155.0,155.0,155.0,155.0
mean,0.0,1300.954839,1001.690323,2021.0,2.0,7.0,21.0
std,0.0,3003.012365,1929.897698,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,2021.0,2.0,7.0,21.0
25%,0.0,102.0,201.0,2021.0,2.0,7.0,21.0
50%,0.0,317.0,491.0,2021.0,2.0,7.0,21.0
75%,0.0,905.0,881.5,2021.0,2.0,7.0,21.0
max,0.0,20890.0,16130.0,2021.0,2.0,7.0,21.0


In [48]:
# Display the loaded DataFrame as the cell output.
df

Unnamed: 0,tweet_id,created_at,text,hashtags,retweet_count,possibly_sensitive,lang,user_id,user_name,user_description,user_verification,user_followers_count,user_friends_count,user_created_at,user_location,etl_load,etl_load_partition_year,etl_load_partition_month,etl_load_partition_day,etl_load_partition_hour
0,1358569906057412612,Mon Feb 08 00:14:41 +0000 2021,O palmeiras é uma piada 🤣🤣🤣🤣 \nQue alegria \nT...,,0,,pt,1857330818,JHF,Sócio do S. C. CORINTHIANS PAULISTA BRASILEIRO...,False,10786,1067,Thu Sep 12 12:52:01 +0000 2013,"São Paulo, Brasil",2021-02-07 21:14:55.531,2021,2,7,21
1,1358569907743567872,Mon Feb 08 00:14:41 +0000 2021,RT @Felippe_Hermes: A informação que recebi AG...,,0,,pt,167382253,Mrs. Suna - VacinaJá!,"Artes, esportes, política, cultura, viagens e ...",False,1177,1531,Fri Jul 16 12:59:55 +0000 2010,https://qaphqa.myportfolio.com,2021-02-07 21:14:55.531,2021,2,7,21
2,1358569908523655170,Mon Feb 08 00:14:41 +0000 2021,"RT @rdgrenal: 🗣 Cláudio Oderich, vice-presiden...","Grêmio,",0,,pt,1030666792912465925,𝚁𝚎𝚌𝚞𝚜 🇦🇹,“Somos a Torcida dos Macacos 🇦🇹🎺” 04/04/1909❤️,False,249,578,Sat Aug 18 04:04:47 +0000 2018,Aonde o Inter for jogar,2021-02-07 21:14:55.531,2021,2,7,21
3,1358569908666331139,Mon Feb 08 00:14:41 +0000 2021,RT @TrumpMargareth: Até o Valdemiro Santiago t...,,0,,pt,1243544344189177858,Raquel,"Serva do Deus Altíssimo, remida pelo Senhor Je...",False,561,858,Fri Mar 27 14:24:26 +0000 2020,"São Paulo, Brasil",2021-02-07 21:14:55.531,2021,2,7,21
4,1358569908947288073,Mon Feb 08 00:14:41 +0000 2021,RT @viniciusff09: Danilo Avelar com 13 anos de...,,0,,pt,612690273,Thierry campeão da América 😎,Paulista que mora em Goiás//@Palmeiras 🐷💚// 1....,False,668,1189,Tue Jun 19 14:53:26 +0000 2012,"Formosa, Brasil",2021-02-07 21:14:55.531,2021,2,7,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,1358570146458193925,Mon Feb 08 00:15:38 +0000 2021,"RT @SPFutDepre: Em uma semana, vocês tiram o c...",,0,,pt,1355328268115828737,Bruna,@Palmeiras,False,19,38,Sat Jan 30 01:34:11 +0000 2021,,2021-02-07 21:15:46.071,2021,2,7,21
151,1358569997824573441,Mon Feb 08 00:15:03 +0000 2021,Que bom que o palmeiras perdeu no dia de hoje,,0,,pt,1041392139685572609,RO,pq chorax Caio casxtro?,False,305,196,Sun Sep 16 18:23:29 +0000 2018,"Prudentópolis, Brasil",2021-02-07 21:15:12.940,2021,2,7,21
152,1358570001880518656,Mon Feb 08 00:15:04 +0000 2021,RT @mundodabola: OFICIAL: O Palmeiras continua...,,0,,pt,865039331815280642,Janinha,"Aqui só tem derrota, Messi e BBB",False,580,580,Thu May 18 03:00:23 +0000 2017,"Amazonas, Brasil",2021-02-07 21:15:12.940,2021,2,7,21
153,1358570005357617153,Mon Feb 08 00:15:04 +0000 2021,@vivinobregaa @jmdiloras Eu sou obrigado a con...,,0,,pt,3073903210,my guel,apenas um doente por esportes,False,206,246,Fri Mar 06 00:55:05 +0000 2015,"Manaíra, João Pessoa",2021-02-07 21:15:12.940,2021,2,7,21
