In [14]:
from tweepy.streaming import StreamListener
from tweepy import Stream
from tweepy import OAuthHandler

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F

import json
import time
import os

class Tweet():
    """Flat record with the tweet and author attributes kept for one status.

    Every attribute defaults to ``None`` on the class; values are assigned on
    the instance after it is created.
    """

    # Attributes describing the tweet itself.
    tweet_id: str = None
    created_at: str = None
    text: str = None
    hashtags: str = None
    retweet_count: int = None
    possibly_sensitive: bool = None
    lang: str = None

    # Attributes describing the author of the tweet.
    user_id: str = None
    user_name: str = None
    user_description: str = None
    user_verification: bool = None
    user_followers_count: int = None
    user_friends_count: int = None
    user_created_at: str = None
    user_location: str = None

    # Single source of truth for the attribute order used by both serialisers.
    _FIELDS = (
        'tweet_id',
        'created_at',
        'text',
        'hashtags',
        'retweet_count',
        'possibly_sensitive',
        'lang',
        'user_id',
        'user_name',
        'user_description',
        'user_verification',
        'user_followers_count',
        'user_friends_count',
        'user_created_at',
        'user_location',
    )

    def get_json(self):
        """Return the record as a dict mapping attribute name to its value."""
        return {name: getattr(self, name) for name in self._FIELDS}

    def get_list(self):
        """Return the attribute values as a list, in declaration order."""
        return [getattr(self, name) for name in self._FIELDS]

class TwitterListener(StreamListener):
    """Collect streamed tweets in memory and publish them as a Spark temp view.

    Every status received is flattened with ``constructor`` and appended to
    ``listTweets``. The first status that arrives more than ``persist_time``
    seconds after the listener was created triggers publication of the
    buffered rows as the temporary view ``tweets``; ``on_data`` then returns
    ``False``, which tweepy treats as a request to stop the stream.

    A module-level ``sqlContext`` must exist by the time data arrives.
    """

    def __init__(self, persist_time):
        """Start the collection window.

        Args:
            persist_time: Seconds to keep buffering before the rows are
                published and the stream is asked to stop.
        """
        # Let the base listener initialise its own state before adding ours.
        super().__init__()
        self.persist_time = persist_time
        self.start = time.time()
        self.listTweets = []
        # Column order must match the order of the list returned by
        # Tweet.get_list(), because rows are passed to Spark positionally.
        columns = [
            ("tweet_id", StringType),
            ("created_at", StringType),
            ("text", StringType),
            ("hashtags", StringType),
            ("retweet_count", IntegerType),
            ("possibly_sensitive", BooleanType),
            ("lang", StringType),
            ("user_id", StringType),
            ("user_name", StringType),
            ("user_description", StringType),
            ("user_verification", BooleanType),
            ("user_followers_count", IntegerType),
            ("user_friends_count", IntegerType),
            ("user_created_at", StringType),
            ("user_location", StringType),
        ]
        self.schema = StructType(
            [StructField(name, data_type(), True) for name, data_type in columns]
        )

    def constructor(self, data):
        """Build a ``Tweet`` from one decoded status payload.

        Args:
            data: Dict decoded from the JSON message delivered by the stream.

        Returns:
            A ``Tweet`` populated from ``data``.

        Raises:
            KeyError: If ``data`` lacks one of the required keys (for example
                when the message is not a status).
        """
        # Long tweets carry their complete text and entities under
        # "extended_tweet"; short ones keep them at the top level.
        is_extended = "extended_tweet" in data
        entities_owner = data["extended_tweet"] if is_extended else data

        tweet = Tweet()
        # The stored format (last hashtag first, every tag followed by ", ")
        # is kept unchanged so new rows stay consistent with rows that were
        # already persisted.
        tweet.hashtags = "".join(
            h["text"] + ", " for h in reversed(entities_owner["entities"]["hashtags"])
        )
        tweet.tweet_id = data["id"]
        tweet.created_at = data["created_at"]
        tweet.text = data["extended_tweet"]["full_text"] if is_extended else data["text"]
        tweet.retweet_count = data["retweet_count"]
        # "possibly_sensitive" is optional in the payload.
        tweet.possibly_sensitive = data.get("possibly_sensitive")
        tweet.lang = data["lang"]

        user = data["user"]
        tweet.user_id = user["id"]
        tweet.user_name = user["name"]
        tweet.user_description = user["description"]
        tweet.user_verification = user["verified"]
        tweet.user_followers_count = user["followers_count"]
        tweet.user_friends_count = user["friends_count"]
        tweet.user_created_at = user["created_at"]
        tweet.user_location = user["location"]
        return tweet

    def _publish_buffer(self):
        """Register the buffered rows, plus load metadata, as the ``tweets`` view."""
        partition_columns = (
            ("etl_load_partition_year", "yyyy"),
            ("etl_load_partition_month", "MM"),
            ("etl_load_partition_day", "dd"),
            ("etl_load_partition_hour", "HH"),
        )
        try:
            df = sqlContext.createDataFrame(data=self.listTweets, schema=self.schema)
            df = df.withColumn("etl_load", F.current_timestamp())
            for column, pattern in partition_columns:
                df = df.withColumn(column, F.date_format("etl_load", pattern))
            df.createOrReplaceTempView("tweets")
        except Exception as e:
            # NOTE(review): on failure a previously registered "tweets" view is
            # left untouched, so a reader of the view may see older rows.
            print("Error: " + str(e))

    def on_data(self, data):
        """Buffer one raw stream message and publish once the window elapsed.

        Args:
            data: Raw JSON text of one stream message.

        Returns:
            ``False`` once the collection window has elapsed and publication
            was attempted; ``True`` otherwise, including when the message
            could not be processed.
        """
        try:
            status = json.loads(data)
            tweet = self.constructor(status)
            self.listTweets.append(tweet.get_list())
            elapsed = time.time() - self.start
            print(f"Tweets:{len(self.listTweets)}, Time:{elapsed}")
            if elapsed > self.persist_time:
                print("entrou aqui")
                self._publish_buffer()
                return False
        except Exception as e:
            # Best effort: one bad message must not stop the stream. Only
            # Exception is caught so Ctrl-C / SystemExit still propagate.
            print("Error: " + str(e))
        return True

    def on_error(self, status_code):
        """Report a stream error and keep the stream alive.

        Args:
            status_code: Status reported by the stream for the failed request.

        Returns:
            Always ``True``.
        """
        # The status may not be a string (tweepy passes the numeric HTTP
        # status), so convert it before concatenating.
        print("Error: " + str(status_code))
        return True

    def on_timeout(self):
        """Report a stream timeout and keep the stream alive."""
        print("Timeout!")
        return True

def authentication(path_template="D:/William/Computação/Data Science/Cases/SerasaExperian/{0}"):
    """Build the Twitter OAuth handler from credential files on disk.

    Args:
        path_template: Format string with one ``{0}`` placeholder that is
            replaced by each credential file name (``API_key.txt``,
            ``API_secret_key.txt``, ``access_token.txt``,
            ``access_token_secret.txt``). Defaults to the original location.

    Returns:
        An ``OAuthHandler`` configured with the consumer key pair and the
        access token pair.

    Raises:
        OSError: If one of the credential files cannot be opened.
    """
    def read_secret(file_name):
        # ``with`` guarantees the handle is closed; ``strip`` removes the
        # trailing newline editors usually append, which would otherwise be
        # sent as part of the credential.
        with open(file=path_template.format(file_name), mode="r", encoding="UTF-8") as handle:
            return handle.read().strip()

    api_key = read_secret("API_key.txt")
    api_secret_key = read_secret("API_secret_key.txt")
    access_token = read_secret("access_token.txt")
    access_token_secret = read_secret("access_token_secret.txt")

    auth = OAuthHandler(api_key, api_secret_key)
    auth.set_access_token(access_token, access_token_secret)
    return auth

def getData(keywords, languages=None, timeout=None, persist_time=60):
    """Stream tweets for one collection window and return them as a DataFrame.

    Args:
        keywords: Terms passed to the stream filter as ``track``.
        languages: Optional language codes passed to the stream filter.
        timeout: Timeout handed to the tweepy ``Stream``.
        persist_time: Seconds the listener buffers tweets before publishing
            them and stopping the stream.

    Returns:
        The result of ``select * from tweets`` on the module-level
        ``sqlContext``, or ``None`` when streaming or the query failed.
    """
    print('authenticating...')
    auth = authentication()
    print('start twitter listener...')
    twitter_listener = TwitterListener(persist_time=persist_time)
    print('start twitter streaming...')
    twitter_stream = Stream(auth=auth, listener=twitter_listener, timeout=timeout)
    try:
        print("getting data...")
        try:
            # Blocks until the listener asks the stream to stop.
            twitter_stream.filter(track=keywords, is_async=False, languages=languages)
        finally:
            # Mark the stream as stopped even when filter() raised.
            twitter_stream.disconnect()
        return sqlContext.sql("select * from tweets")
    except Exception as e:
        # Only Exception is caught so Ctrl-C / SystemExit still propagate.
        print("Error: " + str(e))
        return None

if __name__ == "__main__":
    # Entry point: collect tweets in short windows forever and append each
    # window to a partitioned parquet dataset.
    sc = SparkContext.getOrCreate()
    spark = SparkSession.builder.getOrCreate()
    sqlContext = SQLContext(sc)
    path = "./tweets"
    while True:
        df = getData(keywords=['PALMEIRAS'],
                     languages=["pt"],
                     timeout=60,  # does not work very well
                     persist_time=10)
        if df is None:
            # getData() reports failures by returning None; skip this round
            # instead of failing on ``None.write`` and pause briefly so a
            # persistent failure does not turn into a tight reconnect loop.
            print("no data collected, retrying...")
            time.sleep(5)
            continue
        print("saving parquet...")
        (df.write
           .partitionBy("etl_load_partition_year",
                        "etl_load_partition_month",
                        "etl_load_partition_day",
                        "etl_load_partition_hour")
           .format("parquet")
           .mode("append")
           .save(path))

authenticating...
start twitter listener...
start twitter streaming...
getting data...
Tweets:1, Time:0.5693039894104004
Tweets:2, Time:0.6201815605163574
Tweets:3, Time:0.6656029224395752
Tweets:4, Time:0.7961657047271729
Tweets:5, Time:0.9839065074920654
Tweets:6, Time:0.9983253479003906
Tweets:7, Time:1.0128540992736816
Tweets:8, Time:1.1096277236938477
Tweets:9, Time:1.1903934478759766
Tweets:10, Time:1.1924235820770264
Tweets:11, Time:1.3852579593658447
Tweets:12, Time:1.4015486240386963
Tweets:13, Time:1.427891492843628
Tweets:14, Time:1.4416990280151367
Tweets:15, Time:1.5958380699157715
Tweets:16, Time:1.7087898254394531
Tweets:17, Time:1.8637332916259766
Tweets:18, Time:1.8849284648895264
Tweets:19, Time:2.021592617034912
Tweets:20, Time:2.12148380279541
Tweets:21, Time:2.147087335586548
Tweets:22, Time:2.342843532562256
Tweets:23, Time:2.348914861679077
Tweets:24, Time:2.351024866104126
Tweets:25, Time:2.47564697265625
Tweets:26, Time:2.495025634765625
Tweets:27, Time:2.58450

Tweets:89, Time:5.922586917877197
Tweets:90, Time:6.015701770782471
Tweets:91, Time:6.0907063484191895
Tweets:92, Time:6.094850778579712
Tweets:93, Time:6.119600534439087
Tweets:94, Time:6.130310773849487
Tweets:95, Time:6.307805299758911
Tweets:96, Time:6.328420639038086
Tweets:97, Time:6.433093309402466
Tweets:98, Time:6.46770453453064
Tweets:99, Time:6.472679376602173
Tweets:100, Time:6.504549980163574
Tweets:101, Time:6.53624153137207
Tweets:102, Time:6.561216831207275
Tweets:103, Time:6.632401943206787
Tweets:104, Time:6.6987574100494385
Tweets:105, Time:6.896218299865723
Tweets:106, Time:7.085248708724976
Tweets:107, Time:7.110798597335815
Tweets:108, Time:7.123869180679321
Tweets:109, Time:7.283877372741699
Tweets:110, Time:7.411835193634033
Tweets:111, Time:7.434054613113403
Tweets:112, Time:7.461548805236816
Tweets:113, Time:7.665704965591431
Tweets:114, Time:7.721015453338623
Tweets:115, Time:7.744479656219482
Tweets:116, Time:7.803883790969849
Tweets:117, Time:7.867812395095

Tweets:78, Time:5.930263519287109
Tweets:79, Time:6.003499984741211
Tweets:80, Time:6.136979103088379
Tweets:81, Time:6.1530585289001465
Tweets:82, Time:6.192535877227783
Tweets:83, Time:6.289695501327515
Tweets:84, Time:6.303592920303345
Tweets:85, Time:6.380993604660034
Tweets:86, Time:6.566121339797974
Tweets:87, Time:6.817788362503052
Tweets:88, Time:6.826584100723267
Tweets:89, Time:6.847082853317261
Tweets:90, Time:6.886280536651611
Tweets:91, Time:6.942107200622559
Tweets:92, Time:6.948191404342651
Tweets:93, Time:7.100288152694702
Tweets:94, Time:7.155540227890015
Tweets:95, Time:7.312672138214111
Tweets:96, Time:7.339231491088867
Tweets:97, Time:7.360943794250488
Tweets:98, Time:7.4234678745269775
Tweets:99, Time:7.576035976409912
Tweets:100, Time:7.579145669937134
Tweets:101, Time:7.619579792022705
Tweets:102, Time:7.683528184890747
Tweets:103, Time:8.02303147315979
Tweets:104, Time:8.049488544464111
Tweets:105, Time:8.271443843841553
Tweets:106, Time:8.30039119720459
Tweets:

Tweets:16, Time:1.6521759033203125
Tweets:17, Time:1.7261202335357666
Tweets:18, Time:1.8005397319793701
Tweets:19, Time:1.931591510772705
Tweets:20, Time:1.989654779434204
Tweets:21, Time:1.989654779434204
Tweets:22, Time:1.9907002449035645
Tweets:23, Time:2.1645634174346924
Tweets:24, Time:2.1788711547851562
Tweets:25, Time:2.298727512359619
Tweets:26, Time:2.307582139968872
Tweets:27, Time:2.349303960800171
Tweets:28, Time:2.3583004474639893
Tweets:29, Time:2.440803050994873
Tweets:30, Time:2.6310744285583496
Tweets:31, Time:2.7565438747406006
Tweets:32, Time:2.7565438747406006
Tweets:33, Time:2.8247604370117188
Tweets:34, Time:2.832771062850952
Tweets:35, Time:2.8716468811035156
Tweets:36, Time:2.8766345977783203
Tweets:37, Time:2.9106669425964355
Tweets:38, Time:2.9146506786346436
Tweets:39, Time:2.917253255844116
Tweets:40, Time:2.9302847385406494
Tweets:41, Time:2.9348368644714355
Tweets:42, Time:2.9879651069641113
Tweets:43, Time:3.0859665870666504
Tweets:44, Time:3.16409564018

Tweets:100, Time:7.3352203369140625
Tweets:101, Time:7.394477605819702
Tweets:102, Time:7.471723556518555
Tweets:103, Time:7.538809299468994
Tweets:104, Time:7.589164733886719
Tweets:105, Time:7.645122528076172
Tweets:106, Time:7.6694276332855225
Tweets:107, Time:7.670429944992065
Tweets:108, Time:7.6853930950164795
Tweets:109, Time:7.700140476226807
Tweets:110, Time:7.749374151229858
Tweets:111, Time:7.752387285232544
Tweets:112, Time:7.8162219524383545
Tweets:113, Time:7.836122512817383
Tweets:114, Time:7.8682334423065186
Tweets:115, Time:7.876762628555298
Tweets:116, Time:7.899725914001465
Tweets:117, Time:7.992327928543091
Tweets:118, Time:8.034189224243164
Tweets:119, Time:8.104013919830322
Tweets:120, Time:8.124946594238281
Tweets:121, Time:8.145890474319458
Tweets:122, Time:8.145890474319458
Tweets:123, Time:8.192772626876831
Tweets:124, Time:8.33487343788147
Tweets:125, Time:8.415061712265015
Tweets:126, Time:8.478881597518921
Tweets:127, Time:8.575718879699707
Tweets:128, Time

Tweets:13, Time:1.3468513488769531
Tweets:14, Time:1.5220410823822021
Tweets:15, Time:1.572887897491455
Tweets:16, Time:1.5738849639892578
Tweets:17, Time:1.6516318321228027
Tweets:18, Time:1.6526355743408203
Tweets:19, Time:1.6576738357543945
Tweets:20, Time:1.6973135471343994
Tweets:21, Time:1.6973135471343994
Tweets:22, Time:1.7097194194793701
Tweets:23, Time:1.7394921779632568
Tweets:24, Time:1.8645734786987305
Tweets:25, Time:1.8770887851715088
Tweets:26, Time:1.918752670288086
Tweets:27, Time:1.9971351623535156
Tweets:28, Time:2.009498119354248
Tweets:29, Time:2.0641231536865234
Tweets:30, Time:2.0776708126068115
Tweets:31, Time:2.0979089736938477
Tweets:32, Time:2.125540018081665
Tweets:33, Time:2.135326385498047
Tweets:34, Time:2.2236411571502686
Tweets:35, Time:2.241300344467163
Tweets:36, Time:2.244849443435669
Tweets:37, Time:2.2824366092681885
Tweets:38, Time:2.302771806716919
Tweets:39, Time:2.3792688846588135
Tweets:40, Time:2.460919141769409
Tweets:41, Time:2.48704814910

KeyboardInterrupt: 

In [8]:
# Display `df_p` as the cell result.
# NOTE(review): `df_p` is not defined in the visible cells — presumably a pandas
# DataFrame loaded from the saved tweets; confirm against the defining cell.
df_p

Unnamed: 0,tweet_id,created_at,text,hashtags,retweet_count,possibly_sensitive,lang,user_id,user_name,user_description,user_verification,user_followers_count,user_friends_count,user_created_at,user_location,etl_load,etl_load_partition_year,etl_load_partition_month,etl_load_partition_day,etl_load_partition_hour
0,1358494904582893577,Sun Feb 07 19:16:39 +0000 2021,O Palmeiras tem q aprender com o Corinthians c...,,0,,pt,1199478938105532419,Luizin,🔥⚽️🤘🏻,False,67,202,Wed Nov 27 00:04:22 +0000 2019,"Franca, Brasil",2021-02-07 16:18:00.667,2021,02,07,16
1,1358494904838717441,Sun Feb 07 19:16:39 +0000 2021,"o palmeiras é bom mano, mas esse Melo como eu ...",,0,,pt,2944934794,CALMAI,uma pré-vestibulanda desesperada,False,198,222,Sat Dec 27 22:54:45 +0000 2014,,2021-02-07 16:18:00.667,2021,02,07,16
2,1358494904926748672,Sun Feb 07 19:16:39 +0000 2021,@Palmeiras Zé Rafael perdeu a bola fácil e Lua...,,0,,pt,196406665,Paulinho,,False,20,175,Wed Sep 29 01:25:06 +0000 2010,,2021-02-07 16:18:00.667,2021,02,07,16
3,1358494905216233473,Sun Feb 07 19:16:39 +0000 2021,Palmeiras tomando no cu kkkkkk,,0,,pt,50484482,Marcello,90'boy'z.,False,405,163,Wed Jun 24 23:48:25 +0000 2009,Leme,2021-02-07 16:18:00.667,2021,02,07,16
4,1358494905279070215,Sun Feb 07 19:16:39 +0000 2021,BORAAAAAA PALMEIRAS,,0,,pt,1213146818730528768,𝔭𝔯𝔦𝔰𝔠𝔦𝔩𝔞 🧁⃤,🤙🏻🤙🏻,False,17,320,Fri Jan 03 17:15:25 +0000 2020,"Campo Grande, Brasil",2021-02-07 16:18:00.667,2021,02,07,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1326,1358495152118116352,Sun Feb 07 19:17:38 +0000 2021,palmeiras nunca vai ter mundial kkkkkk,,0,,pt,1293605763865337856,marcosᶜʳᶠ,"Mengo é paixão, religião... uma certeza de fel...",False,4035,3467,Wed Aug 12 17:50:34 +0000 2020,errijota menóh,2021-02-07 16:18:00.667,2021,02,07,16
1327,1358495152499785733,Sun Feb 07 19:17:38 +0000 2021,CBF pode voltar o jogo do Palmeiras x Coritiba...,,0,,pt,87871454,Renato Alvesॐ,•Advogado|Defensor público\n•Jogador de futebo...,False,1480,969,Fri Nov 06 05:01:14 +0000 2009,,2021-02-07 16:18:00.667,2021,02,07,16
1328,1358495153338613766,Sun Feb 07 19:17:38 +0000 2021,E O PALMEIRAS EINNNM,,0,,pt,1238963027787071489,ai ai ai,os rótulos te condenam à hipocrisia,False,7,85,Sat Mar 14 22:59:49 +0000 2020,,2021-02-07 16:18:00.667,2021,02,07,16
1329,1358495153514835968,Sun Feb 07 19:17:38 +0000 2021,KKKKKKKKKK Palmeiras fazendo jus a piada,,0,,pt,906319431743262720,sarinha,@cruzeiro,False,134,256,Sat Sep 09 00:52:45 +0000 2017,,2021-02-07 16:18:00.667,2021,02,07,16
