In [2]:
from tweepy.streaming import StreamListener
from tweepy import Stream
from tweepy import OAuthHandler

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql import functions as F

import json
import time

class Tweet():
    """
    Objeto com os atributos refrentes aos dados a serem extraídos do JSON da API do Twitter
    """
    tweet_id: str = None
    created_at: str = None
    text: str = None
    hashtags: str = None
    retweet_count: int = None
    possibly_sensitive: bool = None
    lang: str = None
    user_id: str = None
    user_name: str = None
    user_description: str = None
    user_verification: bool = None
    user_followers_count: int = None
    user_friends_count: int = None
    user_created_at: str = None
    user_location: str = None
    
    def get_list(self):
        """
        Retorna uma lista com os valores atribuídos ao objeto instanciado
        """
        list_data = [
            self.tweet_id,
            self.created_at,
            self.text,
            self.hashtags,
            self.retweet_count,
            self.possibly_sensitive,
            self.lang,
            self.user_id,
            self.user_name,
            self.user_description,
            self.user_verification,
            self.user_followers_count,
            self.user_friends_count,
            self.user_created_at,
            self.user_location,
        ]
        return list_data
    
    def insert(self, data):
        """
        Recebe o retorno da API e insere os dados em seus respectivos atributos
        """
        hashtags = ""
        if "extended_tweet" in data:
            for h in data['extended_tweet']["entities"]["hashtags"]:
                hashtags = h["text"] + ", " + hashtags
        else:
            for h in data["entities"]["hashtags"]:
                hashtags = h["text"] + ", " + hashtags
        self.tweet_id = data["id"]
        self.created_at = data["created_at"]
        self.text = data['extended_tweet']['full_text'] if "extended_tweet" in data else data["text"]
        self.user_id = data["user"]["id"]
        self.hashtags = hashtags
        self.retweet_count = data["retweet_count"]
        self.possibly_sensitive = data['possibly_sensitive'] if "possibly_sensitive" in data else None
        self.lang = data["lang"]
        self.user_id = data["user"]["id"]
        self.user_name = data["user"]["name"]
        self.user_description = data["user"]["description"]
        self.user_verification = data["user"]["verified"]
        self.user_followers_count = data["user"]["followers_count"]
        self.user_friends_count = data["user"]["friends_count"]
        self.user_created_at = data["user"]["created_at"]
        self.user_location = data["user"]["location"]

class TwitterListener(StreamListener):
    """
    Listener da API do Twitter, esta classe realiza o streaming de tweets,
    recebendo os tweets, processando e persistindo em uma temp view.
    """
    def __init__(self, persist_time):
        self.persist_time = persist_time
        self.start = time.time()
        self.listTweets = []
        self.schema = StructType([StructField("tweet_id", StringType(), True),
                                  StructField("created_at", StringType(), True),
                                  StructField("text", StringType(), True),
                                  StructField("hashtags", StringType(), True),
                                  StructField("retweet_count", IntegerType(), True),
                                  StructField("possibly_sensitive", BooleanType(), True),
                                  StructField("lang", StringType(), True),
                                  StructField("user_id", StringType(), True),
                                  StructField("user_name", StringType(), True),
                                  StructField("user_description", StringType(), True),
                                  StructField("user_verification", BooleanType(), True),
                                  StructField("user_followers_count", IntegerType(), True),
                                  StructField("user_friends_count", IntegerType(), True),
                                  StructField("user_created_at", StringType(), True),
                                  StructField("user_location", StringType(), True),])    
    
    def on_data(self, data):
        """"
        Método que irá receber o dados da API e, após decorrido o tempo
        passado no parâmetro 'persist_time', irá salvá-los em uma temp view.
        """
        try:
            tweet = Tweet()
            json_data = json.loads(data)
            if "limit" not in json_data:
                tweet.insert(json_data)
                self.listTweets.append(tweet.get_list())
                
                print(f"Tweets:{len(self.listTweets)}, Time:{(time.time() - self.start)}")
                if (time.time() - self.start) > self.persist_time:
                    try:
                        df = sqlContext.createDataFrame(data=self.listTweets, schema=self.schema)
                        df = df.withColumn("etl_load", F.current_timestamp())
                        df = df.withColumn("etl_load_partition_year", F.date_format("etl_load", "yyyy"))
                        df = df.withColumn("etl_load_partition_month", F.date_format("etl_load", "MM"))
                        df = df.withColumn("etl_load_partition_day", F.date_format("etl_load", "dd"))
                        df = df.withColumn("etl_load_partition_hour", F.date_format("etl_load", "HH"))
                        df.createOrReplaceTempView("tweets")
                    except BaseException as e:
                        print("Erro ao contruir 'df': " + str(e))
                    return False
        except BaseException as e:
            print("Error: " + str(e), "JSON fora do esperado:", json_data)
        return True
    
    def on_error(self, status_code):
        print("Error: " + status_code)
        return True
    
    def on_timeout(self):
        print("Timeout!")
        return True

def authentication():
    """
    Irá buscar as credenciais em uma pasta local e retornar o objeto auth:OAuthHandler
    """
    PATH = "D:/William/Computação/Data Science/keys/TwitterAPI/{0}"
    
    API_KEY = open(file=PATH.format("API_key.txt"), mode="r", encoding="UTF-8").read()
    API_SECRET_KEY = open(file=PATH.format("API_secret_key.txt"), mode="r", encoding="UTF-8").read()
    ACCESS_TOKEN = open(file=PATH.format("access_token.txt"), mode="r", encoding="UTF-8").read()
    ACCESS_TOKEN_SECRET = open(file=PATH.format("access_token_secret.txt"), mode="r", encoding="UTF-8").read()
    
    auth = OAuthHandler(API_KEY, API_SECRET_KEY)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    return auth

def getData(keywords, languages=None, timeout=None, persist_time=60):
    """
    Método que irá realizar os passo-a-passo de autenticação, conifguração e start do streamig
    e retornar os dados salvos na temp view assim que decorrido o tempo do parâmetro 'persist_time'.
    """
    print('authenticating...')
    auth = authentication()
    print('start twitter listener...')
    twitter_listener = TwitterListener(persist_time=persist_time)
    print('start twitter stream...')
    twitter_stream = Stream(auth=auth, listener=twitter_listener, timeout=timeout)
    try:
        print("getting data...")
        twitter_stream.filter(track=keywords, is_async=False, languages=languages)
        twitter_stream.disconnect()
        return sqlContext.sql("select * from tweets")
    except BaseException as e:
        print("Error: " + str(e))

if __name__ == "__main__":
    sc = SparkContext.getOrCreate()
    spark = SparkSession.builder.getOrCreate()
    sqlContext = SQLContext(sc)
    
    keywords = ["PALMEIRAS"]
    languages=["pt"]
    timeout = 60 # não funciona mto bem
    persist_time=60
    path = "./test/tweets"

    while True:
        df = getData(keywords=keywords,
                     languages=languages,
                     timeout=timeout, # não funciona mto bem
                     persist_time=persist_time,
                    )
        
        print("saving parquet...")
        df.write\
          .partitionBy("etl_load_partition_year",
                       "etl_load_partition_month",
                       "etl_load_partition_day",
                       "etl_load_partition_hour")\
          .format("parquet")\
          .mode("append")\
          .save(path)

authenticating...
start twitter listener...
start twitter streaming...
getting data...
Tweets:1, Time:0.5891058444976807
Tweets:2, Time:1.4401617050170898
Tweets:3, Time:1.629821538925171
Tweets:4, Time:1.7543482780456543
Tweets:5, Time:2.2050907611846924
Tweets:6, Time:2.6128172874450684
Tweets:7, Time:2.791013479232788
Tweets:8, Time:3.3249127864837646
Tweets:9, Time:3.3319153785705566
Tweets:10, Time:3.634622097015381
Tweets:11, Time:3.8565380573272705
Tweets:12, Time:4.283930063247681
Tweets:13, Time:5.055507183074951
Tweets:14, Time:5.37442946434021
Tweets:15, Time:5.6622490882873535
Tweets:16, Time:5.826642274856567
Tweets:17, Time:5.998942613601685
Tweets:18, Time:6.48374605178833
Tweets:19, Time:6.701195478439331
Tweets:20, Time:6.810682773590088
Tweets:21, Time:6.99524998664856
Tweets:22, Time:7.762883186340332
Tweets:23, Time:7.7707743644714355
Tweets:24, Time:7.817060470581055
Tweets:25, Time:7.865277528762817
Tweets:26, Time:7.958301782608032
Tweets:27, Time:8.0200982093811

Tweets:35, Time:9.355704545974731
Tweets:36, Time:10.871169805526733
Tweets:37, Time:11.330545902252197
Tweets:38, Time:11.357697248458862
Tweets:39, Time:11.632275581359863
Tweets:40, Time:11.805831670761108
Tweets:41, Time:12.633301496505737
Tweets:42, Time:12.729269027709961
Tweets:43, Time:13.148850679397583
Tweets:44, Time:13.346696615219116
Tweets:45, Time:13.734630107879639
Tweets:46, Time:14.346288681030273
Tweets:47, Time:14.494020462036133
Tweets:48, Time:14.838148355484009
Tweets:49, Time:15.533449172973633
Tweets:50, Time:15.569395303726196
Tweets:51, Time:15.782625675201416
Tweets:52, Time:15.861028671264648
Tweets:53, Time:16.59672999382019
Tweets:54, Time:17.661277055740356
Tweets:55, Time:17.758901596069336
Tweets:56, Time:17.987530946731567
Tweets:57, Time:18.794563055038452
Tweets:58, Time:19.057000398635864
Tweets:59, Time:19.07789945602417
Tweets:60, Time:19.19331455230713
Tweets:61, Time:19.321611642837524
Tweets:62, Time:19.380621433258057
Tweets:63, Time:19.59879

Tweets:97, Time:28.340240478515625
Tweets:98, Time:28.385687112808228
Tweets:99, Time:29.20387578010559
Tweets:100, Time:29.717285633087158
Tweets:101, Time:29.967254161834717
Tweets:102, Time:30.15106725692749
Tweets:103, Time:30.33651876449585
Tweets:104, Time:30.760520458221436
Tweets:105, Time:30.856218099594116
Tweets:106, Time:30.8655047416687
Tweets:107, Time:31.807068586349487
Tweets:108, Time:31.936387062072754
Tweets:109, Time:32.124594926834106
Tweets:110, Time:32.352360010147095
Tweets:111, Time:32.58174276351929
Tweets:112, Time:32.618603467941284
Tweets:113, Time:32.65882682800293
Tweets:114, Time:33.35470414161682
Tweets:115, Time:33.60507035255432
Tweets:116, Time:33.6361403465271
Tweets:117, Time:33.65753531455994
Tweets:118, Time:33.98770999908447
Tweets:119, Time:34.040125608444214
Tweets:120, Time:34.22670865058899
Tweets:121, Time:34.24969792366028
Tweets:122, Time:34.43140149116516
Tweets:123, Time:34.77626419067383
Tweets:124, Time:34.86432862281799
Tweets:125, T

Tweets:123, Time:27.759441375732422
Tweets:124, Time:28.11400032043457
Tweets:125, Time:28.183837413787842
Tweets:126, Time:28.97345733642578
Tweets:127, Time:29.206969738006592
Tweets:128, Time:29.94968557357788
Tweets:129, Time:30.499476194381714
Tweets:130, Time:31.24804162979126
Tweets:131, Time:31.63069796562195
Tweets:132, Time:31.8676860332489
Tweets:133, Time:31.876068353652954
Tweets:134, Time:32.020549297332764
Tweets:135, Time:32.316670656204224
Tweets:136, Time:32.953720808029175
Tweets:137, Time:33.36289978027344
Tweets:138, Time:33.56891989707947
Tweets:139, Time:33.58207583427429
Tweets:140, Time:33.76153779029846
Tweets:141, Time:34.07534742355347
Tweets:142, Time:34.3951416015625
Tweets:143, Time:34.8155517578125
Tweets:144, Time:35.32724571228027
Tweets:145, Time:35.55670380592346
Tweets:146, Time:35.55770015716553
Tweets:147, Time:35.73340201377869
Tweets:148, Time:36.02767539024353
Tweets:149, Time:36.20504808425903
Tweets:150, Time:36.55092000961304
Tweets:151, Tim

Tweets:118, Time:29.505521774291992
Tweets:119, Time:29.725823163986206
Tweets:120, Time:29.915929317474365
Tweets:121, Time:30.012825965881348
Tweets:122, Time:30.181379556655884
Tweets:123, Time:30.191786289215088
Tweets:124, Time:30.872758626937866
Tweets:125, Time:31.585570573806763
Tweets:126, Time:31.732226610183716
Tweets:127, Time:31.794886112213135
Tweets:128, Time:32.008920431137085
Tweets:129, Time:32.174524784088135
Tweets:130, Time:32.19066286087036
Tweets:131, Time:32.41823625564575
Tweets:132, Time:32.496599435806274
Tweets:133, Time:32.615721225738525
Tweets:134, Time:32.826292753219604
Tweets:135, Time:32.90713143348694
Tweets:136, Time:32.98505520820618
Tweets:137, Time:33.1674382686615
Tweets:138, Time:33.204655170440674
Tweets:139, Time:33.429548501968384
Tweets:140, Time:33.43016076087952
Tweets:141, Time:33.66747570037842
Tweets:142, Time:33.818042278289795
Tweets:143, Time:34.110326051712036
Tweets:144, Time:34.31930184364319
Tweets:145, Time:34.356746435165405
T

Tweets:111, Time:27.433119773864746
Tweets:112, Time:27.449073314666748
Tweets:113, Time:27.86859107017517
Tweets:114, Time:27.890663385391235
Tweets:115, Time:27.975757837295532
Tweets:116, Time:27.994563579559326
Tweets:117, Time:28.125564575195312
Tweets:118, Time:28.43836283683777
Tweets:119, Time:28.944753408432007
Tweets:120, Time:29.558284282684326
Tweets:121, Time:29.571613073349
Tweets:122, Time:29.6360604763031
Tweets:123, Time:29.709127187728882
Tweets:124, Time:29.7805016040802
Tweets:125, Time:29.950900316238403
Tweets:126, Time:30.03915023803711
Tweets:127, Time:30.26810598373413
Tweets:128, Time:30.47481870651245
Tweets:129, Time:30.622480869293213
Tweets:130, Time:30.820033073425293
Tweets:131, Time:30.853638410568237
Tweets:132, Time:31.237627506256104
Tweets:133, Time:31.350454807281494
Tweets:134, Time:31.665339469909668
Tweets:135, Time:31.668678998947144
Tweets:136, Time:31.722543954849243
Tweets:137, Time:31.793102979660034
Tweets:138, Time:31.80399489402771
Tweet

AttributeError: 'NoneType' object has no attribute 'write'

In [45]:
df = spark.read.parquet("./tweets").toPandas()

In [46]:
df.count()

tweet_id                    155
created_at                  155
text                        155
hashtags                    155
retweet_count               155
possibly_sensitive           33
lang                        155
user_id                     155
user_name                   155
user_description            133
user_verification           155
user_followers_count        155
user_friends_count          155
user_created_at             155
user_location               109
etl_load                    155
etl_load_partition_year     155
etl_load_partition_month    155
etl_load_partition_day      155
etl_load_partition_hour     155
dtype: int64

In [47]:
df.describe()

Unnamed: 0,retweet_count,user_followers_count,user_friends_count,etl_load_partition_year,etl_load_partition_month,etl_load_partition_day,etl_load_partition_hour
count,155.0,155.0,155.0,155.0,155.0,155.0,155.0
mean,0.0,1300.954839,1001.690323,2021.0,2.0,7.0,21.0
std,0.0,3003.012365,1929.897698,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,2021.0,2.0,7.0,21.0
25%,0.0,102.0,201.0,2021.0,2.0,7.0,21.0
50%,0.0,317.0,491.0,2021.0,2.0,7.0,21.0
75%,0.0,905.0,881.5,2021.0,2.0,7.0,21.0
max,0.0,20890.0,16130.0,2021.0,2.0,7.0,21.0


In [48]:
df

Unnamed: 0,tweet_id,created_at,text,hashtags,retweet_count,possibly_sensitive,lang,user_id,user_name,user_description,user_verification,user_followers_count,user_friends_count,user_created_at,user_location,etl_load,etl_load_partition_year,etl_load_partition_month,etl_load_partition_day,etl_load_partition_hour
0,1358569906057412612,Mon Feb 08 00:14:41 +0000 2021,O palmeiras é uma piada 🤣🤣🤣🤣 \nQue alegria \nT...,,0,,pt,1857330818,JHF,Sócio do S. C. CORINTHIANS PAULISTA BRASILEIRO...,False,10786,1067,Thu Sep 12 12:52:01 +0000 2013,"São Paulo, Brasil",2021-02-07 21:14:55.531,2021,2,7,21
1,1358569907743567872,Mon Feb 08 00:14:41 +0000 2021,RT @Felippe_Hermes: A informação que recebi AG...,,0,,pt,167382253,Mrs. Suna - VacinaJá!,"Artes, esportes, política, cultura, viagens e ...",False,1177,1531,Fri Jul 16 12:59:55 +0000 2010,https://qaphqa.myportfolio.com,2021-02-07 21:14:55.531,2021,2,7,21
2,1358569908523655170,Mon Feb 08 00:14:41 +0000 2021,"RT @rdgrenal: 🗣 Cláudio Oderich, vice-presiden...","Grêmio,",0,,pt,1030666792912465925,𝚁𝚎𝚌𝚞𝚜 🇦🇹,“Somos a Torcida dos Macacos 🇦🇹🎺” 04/04/1909❤️,False,249,578,Sat Aug 18 04:04:47 +0000 2018,Aonde o Inter for jogar,2021-02-07 21:14:55.531,2021,2,7,21
3,1358569908666331139,Mon Feb 08 00:14:41 +0000 2021,RT @TrumpMargareth: Até o Valdemiro Santiago t...,,0,,pt,1243544344189177858,Raquel,"Serva do Deus Altíssimo, remida pelo Senhor Je...",False,561,858,Fri Mar 27 14:24:26 +0000 2020,"São Paulo, Brasil",2021-02-07 21:14:55.531,2021,2,7,21
4,1358569908947288073,Mon Feb 08 00:14:41 +0000 2021,RT @viniciusff09: Danilo Avelar com 13 anos de...,,0,,pt,612690273,Thierry campeão da América 😎,Paulista que mora em Goiás//@Palmeiras 🐷💚// 1....,False,668,1189,Tue Jun 19 14:53:26 +0000 2012,"Formosa, Brasil",2021-02-07 21:14:55.531,2021,2,7,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,1358570146458193925,Mon Feb 08 00:15:38 +0000 2021,"RT @SPFutDepre: Em uma semana, vocês tiram o c...",,0,,pt,1355328268115828737,Bruna,@Palmeiras,False,19,38,Sat Jan 30 01:34:11 +0000 2021,,2021-02-07 21:15:46.071,2021,2,7,21
151,1358569997824573441,Mon Feb 08 00:15:03 +0000 2021,Que bom que o palmeiras perdeu no dia de hoje,,0,,pt,1041392139685572609,RO,pq chorax Caio casxtro?,False,305,196,Sun Sep 16 18:23:29 +0000 2018,"Prudentópolis, Brasil",2021-02-07 21:15:12.940,2021,2,7,21
152,1358570001880518656,Mon Feb 08 00:15:04 +0000 2021,RT @mundodabola: OFICIAL: O Palmeiras continua...,,0,,pt,865039331815280642,Janinha,"Aqui só tem derrota, Messi e BBB",False,580,580,Thu May 18 03:00:23 +0000 2017,"Amazonas, Brasil",2021-02-07 21:15:12.940,2021,2,7,21
153,1358570005357617153,Mon Feb 08 00:15:04 +0000 2021,@vivinobregaa @jmdiloras Eu sou obrigado a con...,,0,,pt,3073903210,my guel,apenas um doente por esportes,False,206,246,Fri Mar 06 00:55:05 +0000 2015,"Manaíra, João Pessoa",2021-02-07 21:15:12.940,2021,2,7,21
