### Dependencies

In [1]:
# Use these commands to install required dependencies if necessary.

# !pip install findspark pyspark py4j
# !pip install pandas seaborn numpy
# !pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio===0.11.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
# !pip install transformers
# !pip install tqdm
# !pip install emoji
# !pip install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension

# Use this command if the above installation of PyTorch fails.

# !pip install torch torchvision torchaudio

### Spark Set-Up

In [2]:
import findspark

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.functions import desc
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [3]:
# Constants for FILE PATHS

import os

# Location of the local Spark installation that is handed to findspark.init().
# A SPARK_HOME environment variable takes precedence so the notebook can run on
# another machine without editing this cell; the literal path is only the
# fallback used when SPARK_HOME is not set.
#SPARK_PATH = '/home/vishakan/spark-3.2.1-bin-hadoop3.2'
SPARK_PATH = os.environ.get('SPARK_HOME', '/home/venky/spark-3.2.1-bin-hadoop3.2')

In [4]:
# Point findspark at the Spark installation configured in SPARK_PATH.
findspark.init(SPARK_PATH)
# Register the Kafka connector package; init_df_table() reads the stream with
# format("kafka"). The 3.2.1 suffix matches the Spark version named in SPARK_PATH.
findspark.add_packages("org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.1")    #Required dependency

In [5]:
# Build (or fetch the already-running) SparkSession for this notebook, named "FYP".
# The bare `spark` on the last line makes the notebook display the session.
spark = SparkSession.builder.appName("FYP").getOrCreate()
spark

In [6]:
#Run this only once, restart kernel if errors
# SparkContext of the session above; consumer_call() uses it for sc.parallelize().
sc = spark.sparkContext
sc

#### Code To Ignore Warning Messages

In [7]:
#Doesn't seem to work here properly

import warnings
warnings.filterwarnings('ignore')

warnings.filterwarnings(action='once')

In [8]:
%%javascript
// Adds a clickable link to this cell's output that toggles a CSS rule hiding
// stderr output areas (div.output_stderr and stderr mime-type divs).
// `on` is the initial state: true means stderr output starts hidden.
// NOTE(review): relies on `$` (jQuery) and `element` being provided by the
// notebook front-end — confirm they exist in the front-end you use.
(function(on) {
// Link element; its placeholder text is replaced by the first (programmatic) click below.
const e=$( "<a>Setup failed</a>" );
const ns="js_jupyter_suppress_warnings";
// Reuse the <style> element with id `ns` if it is already in the page, otherwise create it in <head>.
var cssrules=$("#"+ns);
if(!cssrules.length) cssrules = $("<style id='"+ns+"' type='text/css'>div.output_stderr { } </style>").appendTo("head");
e.click(function() {
    var s='Showing';  
    // Start from an empty rule set; the hide rule is re-added only when `on` is true.
    cssrules.empty()
    if(on) {
        s='Hiding';
        cssrules.append("div.output_stderr, div[data-mime-type*='.stderr'] { display:none; }");
    }
    e.text(s+' warnings (click to toggle)');
    // Flip the state so the next click does the opposite.
    on=!on;
}).click();   // trigger once immediately to apply the initial state
$(element).append(e);
})(true);

  and should_run_async(code)


<IPython.core.display.Javascript object>

In [9]:
class StopExecution(Exception):
    """Exception raised to end the running cell quietly.

    update_offset_table() raises it once there is nothing left to consume.
    """

    def _render_traceback_(self):
        # Custom traceback hook: produce nothing, so no traceback is rendered
        # for this exception (presumably picked up by IPython — confirm).
        return None

### Tweet Sentiment Analysis Model Set-Up

In [10]:
import pkg_resources
pkg_resources.require("torch==1.11.0")

import torch

import pandas as pd
import numpy as np
from scipy.special import softmax

from time import sleep
import json
import os

from collections import namedtuple
import sqlite3

from tqdm.notebook import tqdm

from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [11]:
# Do not truncate long cell values (e.g. full tweet text) when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

  and should_run_async(code)


In [12]:
# Hugging Face checkpoint used to score tweets. The commented line is the
# alternative checkpoint that was tried before.
#model_type = "finiteautomata/bertweet-base-sentiment-analysis"

model_type = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Tokenizer and classifier are loaded from the same checkpoint so that the
# token ids produced by one are the ones the other expects.
tokenizer = AutoTokenizer.from_pretrained(model_type)
model = AutoModelForSequenceClassification.from_pretrained(model_type)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
def all_sentiment_scores(tweet):
    """Return the class probabilities the loaded sentiment model assigns to a tweet.

    Uses the module-level `tokenizer` and `model` (whatever checkpoint
    `model_type` selected).

    Args:
        tweet: Text of the tweet to score.

    Returns:
        NumPy array of shape (1, 3) holding the softmax of the model logits.
        consumer_call() reads the three columns as negative, neutral and
        positive, in that order.
    """
    tokens = tokenizer.encode(tweet, return_tensors='pt')

    # Inference only: disable autograd so no computation graph is built or kept
    # alive for every scored tweet. The resulting logits need no .detach().
    with torch.no_grad():
        result = model(tokens)

    # Normalise across the class axis explicitly; without `axis` SciPy would
    # normalise over the whole array, which is only correct for a single row.
    return softmax(result.logits.numpy(), axis=-1)

In [14]:
def max_sentiment_score(tweet):
    """Return the most probable sentiment class of a tweet as -1, 0 or 1.

    The index of the highest probability returned by all_sentiment_scores()
    (0, 1 or 2) is shifted down by one; consumer_call() treats -1 as negative,
    0 as neutral and 1 as positive.
    """
    probabilities = all_sentiment_scores(tweet)
    winning_index = np.argmax(probabilities)
    return winning_index - 1

In [15]:
# Sample run: score a short greeting; the cell displays the (1, 3) probability array.

all_sentiment_scores("Hello World!")

array([[0.00323558, 0.06057154, 0.9361929 ]], dtype=float32)

In [16]:
# Sample run: the same greeting reduced to a single label (-1, 0 or 1).

max_sentiment_score("Hello World!")

1

### Spark Processing

In [17]:
# Suffix appended to IN_MEM_TABLENAME; consumer_call() increments it so each
# streaming query gets a fresh in-memory table name.
TABLE_COUNT = 0
# Name prefix of the Spark in-memory tables the streaming queries write to.
IN_MEM_TABLENAME = "TweetData"
# SQLite table that receives the reduced, scored results.
SQLITE_TABLENAME = "reduced_scored_tweets"
# Kafka offset (partition "0") to start reading from; check_offset_status()
# overwrites it with the value saved in cache.sqlite.
OFFSET = 0
# Kafka topic the tweets are consumed from.
TOPIC = "tweets-category-topic"

In [18]:
def check_offset_status():
    """Load the saved Kafka offset for TOPIC into the global OFFSET.

    Looks TOPIC up in the OFFSET_FINDER table of ../Database/cache.sqlite
    (relative to the current working directory). If a row exists, OFFSET is set
    to its `offsetval`; otherwise a row with offset 0 is inserted and OFFSET is
    reset to 0 so memory and storage agree. The resulting offset is printed.

    Raises:
        sqlite3.Error: if the database or the OFFSET_FINDER table is unusable.
            The connection is closed in that case as well.
    """
    global OFFSET

    connection = sqlite3.connect(os.path.join(os.getcwd(), '../Database/cache.sqlite'))
    try:
        cursor = connection.cursor()
        try:
            # Exact match on purpose: LIKE would treat '_' and '%' inside a
            # topic name as wildcards and could pick up another topic's row.
            rows = cursor.execute(
                "SELECT offsetval FROM OFFSET_FINDER WHERE topic = ?", [TOPIC]
            ).fetchall()

            if rows:
                OFFSET = rows[0][0]
            else:
                # First time this topic is seen: register it, and make the
                # in-memory value match what was just stored.
                cursor.execute(
                    "INSERT INTO OFFSET_FINDER (topic, offsetval) VALUES (?, ?)",
                    (TOPIC, 0),
                )
                connection.commit()
                OFFSET = 0
        finally:
            cursor.close()
    finally:
        connection.close()

    print({f"Starting Offset for {TOPIC}": OFFSET})

In [19]:
# Load the saved starting offset for TOPIC into OFFSET before any streaming starts.
check_offset_status()

{'Starting Offset for tweets-category-topic': 8698}


In [20]:
# #OFFSET = 0

# df = spark \
#   .readStream \
#   .format("kafka") \
#   .option("kafka.bootstrap.servers", "localhost:9092") \
#   .option("subscribe", TOPIC) \
#   .option("startingOffsets", f""" {{"{TOPIC}":{{"0":{OFFSET}}}}} """) \
#   .load()

# schema_str = "Data STRING"

# df = df.selectExpr("CAST(value AS STRING)")
# df = df.select(from_csv(col("value"),schema_str).alias("Table"))
# df = df.selectExpr("Table.*")
# df.printSchema()
# #option("truncate", "false")

In [21]:
# query = df.writeStream.trigger(processingTime='5 seconds').queryName(f"{IN_MEM_TABLENAME}{TABLE_COUNT}").format('memory').outputMode("append").start()

In [22]:
# spark.sql('SHOW TABLES').show()

In [23]:
# sleep(10)

# tweet_dict_list = []

# value = spark.sql(f"SELECT * FROM {IN_MEM_TABLENAME}{TABLE_COUNT} LIMIT 10").collect()
# for row in value:
#     #print(row)
#     jsonCopy = json.loads(row["Data"])
#     #jsonCopy['score'] = sentiment_score(jsonCopy['tweet'][:135])
#     sentiments = sentiment_score(jsonCopy['tweet'])
                
#     jsonCopy['neg_score'] = sentiments[0][0]
#     jsonCopy['neu_score'] = sentiments[0][1]
#     jsonCopy['pos_score'] = sentiments[0][2]
    
#     tweet_dict_list.append(jsonCopy)
# pdd = pd.DataFrame(tweet_dict_list)

# query.awaitTermination(1)
# pdd.head(10)

In [24]:
# rdd = sc.parallelize(tweet_dict_list)

In [25]:
# %%time
# rdd.map(lambda row: (row['tweet'], row['score'])).toDF().toPandas().head(10)

In [26]:
# newrdd = rdd.map(lambda row: (row['category'], row['date'], row['count'], row['neg_score'], row['neu_score'], row['pos_score']))
# newrdd.collect()

In [27]:
# '''
# Schema for Reduce

# Key:    (Category, Date)
# Value:  (Tweet_Count, Ind_Neg, Ind_Neu, Ind_Pos, Wted_Neg, Wted_Neu, Wted_Pos)
# '''

# nextrdd = newrdd.map(lambda tup: ((tup[0], tup[1]), (tup[2], tup[3], tup[4], tup[5], \
#                                                      tup[2] * tup[3], tup[2] * tup[4], tup[2] * tup[5]))) \
#                 .reduceByKey(lambda a, b: (a[0]+b[0], a[1]+b[1], a[2] + b[2], a[3] + b[3], \
#                                            a[4] + b[4], a[5] + b[5], a[6] + b[6]))
# nextrdd.collect()

In [28]:
#write to db

# connection = sqlite3.connect(os.path.join(os.getcwd(), f'../Database/results.sqlite'))
# cursor = connection.cursor()

# drop_table = f'''
#             DROP TABLE IF EXISTS {SQLITE_TABLENAME};
#             '''

# cursor.execute(drop_table)


# create_table = f'''CREATE TABLE IF NOT EXISTS {SQLITE_TABLENAME} (
#                 category TEXT,
#                 date DATE,
#                 count INTEGER,
#                 ind_neg NUMERIC,
#                 ind_neu NUMERIC, 
#                 ind_pos NUMERIC,
#                 wted_neg NUMERIC,
#                 wted_neu NUMERIC,
#                 wted_pos NUMERIC,
#                 CONSTRAINT uniq_val PRIMARY KEY (category, date)
#                 );
#                 '''

# cursor.execute(create_table)

# insert_records = f'''INSERT INTO {SQLITE_TABLENAME} (category, date, count, ind_neg, ind_neu, ind_pos, wted_neg, wted_neu, wted_pos) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?)
#                         ON CONFLICT(category, date) DO 
#                         UPDATE SET count = count + excluded.count,
#                         ind_neg = ind_neg + excluded.ind_neg,
#                         ind_neu = ind_neu + excluded.ind_neu,
#                         ind_pos = ind_pos + excluded.ind_pos,
#                         wted_neg = wted_neg + excluded.wted_neg,
#                         wted_neu = wted_neu + excluded.wted_neu,
#                         wted_pos = wted_pos + excluded.wted_pos
#                         WHERE {SQLITE_TABLENAME}.category LIKE ? AND {SQLITE_TABLENAME}.date LIKE ? '''
    

# contents = []
# for row in nextrdd.collect():
#     contents.append((row[0][0], row[0][1], row[1][0], float(row[1][1]), float(row[1][2]), float(row[1][3]), row[1][4], row[1][5], row[1][6], row[0][0], row[0][1]))
    
# try:
#     cursor.executemany(insert_records, contents)
#     connection.commit()

#     rows = cursor.execute(f"SELECT * FROM {SQLITE_TABLENAME}").fetchall()
#     for row in rows:
#         print(row)
# except sqlite3.Error as error:
#     print({error})
# finally:
#     cursor.close()
#     connection.close()

### Helper Methods 

In [29]:
def init_df_table():
    """Start a streaming query that copies Kafka records into an in-memory table.

    Reads TOPIC from the broker at localhost:9092, starting at OFFSET on
    partition "0", parses each record value as a one-column CSV row named
    `Data`, and appends the rows to the in-memory table
    f"{IN_MEM_TABLENAME}{TABLE_COUNT}" with a 5 second processing trigger.
    The parsed schema and the current table list are printed.

    Returns:
        The handle returned by `writeStream...start()` for the running query.
    """
    starting_offsets = f""" {{"{TOPIC}":{{"0":{OFFSET}}}}} """

    kafka_stream = (
        spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", "localhost:9092")
        .option("subscribe", TOPIC)
        .option("startingOffsets", starting_offsets)
        .load()
    )

    # Kafka delivers bytes: cast the value to text, parse it with the CSV
    # schema, then flatten the resulting struct into top-level columns.
    parsed = (
        kafka_stream
        .selectExpr("CAST(value AS STRING)")
        .select(from_csv(col("value"), "Data STRING").alias("Table"))
        .selectExpr("Table.*")
    )
    parsed.printSchema()

    sink_name = f"{IN_MEM_TABLENAME}{TABLE_COUNT}"
    running_query = (
        parsed.writeStream
        .trigger(processingTime='5 seconds')
        .queryName(sink_name)
        .format('memory')
        .outputMode("append")
        .start()
    )

    spark.sql('SHOW TABLES').show()
    return running_query

In [30]:
def delete_spark_sql_table():
    """Make sure the SQLite results table exists.

    Despite its name, this function does not touch Spark SQL and does not
    delete anything: it opens ../Database/results2.sqlite (relative to the
    current working directory) and creates the SQLITE_TABLENAME table if it is
    missing. Existing rows are left untouched.

    Raises:
        sqlite3.Error: if the database cannot be opened or the table cannot be
            created. The connection is closed in that case as well.
    """
    #vishakan - results2, venky- results3
    db_path = os.path.join(os.getcwd(), '../Database/results2.sqlite')
    #db_path = os.path.join(os.getcwd(), '../Database/results3.sqlite')

    create_table = f'''CREATE TABLE IF NOT EXISTS {SQLITE_TABLENAME} (
                    category TEXT,
                    date DATE,
                    count INTEGER,
                    ind_neg NUMERIC,
                    ind_neu NUMERIC, 
                    ind_pos NUMERIC,
                    wted_neg NUMERIC,
                    wted_neu NUMERIC,
                    wted_pos NUMERIC,
                    neg_counts INTEGER,
                    neu_counts INTEGER,
                    pos_counts INTEGER,
                    CONSTRAINT uniq_val PRIMARY KEY (category, date)
                    );
                    '''

    connection = sqlite3.connect(db_path)
    try:
        # Dropping the table first is deliberately disabled so results
        # accumulate across runs. To start from scratch, execute
        #   DROP TABLE IF EXISTS <SQLITE_TABLENAME>;
        # here before the CREATE.
        connection.execute(create_table)
        connection.commit()
    finally:
        connection.close()

In [31]:
def write_to_db(rdd, db_name='results2.sqlite'):
    """Add the reduced per-(category, date) totals to the SQLite results table.

    Each collected row is inserted into SQLITE_TABLENAME; when a row with the
    same (category, date) key already exists, the new totals are added to the
    stored ones.

    Args:
        rdd: Object whose `collect()` returns rows shaped like
            ((category, date),
             (count, ind_neg, ind_neu, ind_pos,
              wted_neg, wted_neu, wted_pos,
              neg_counts, neu_counts, pos_counts)),
            which is what consumer_call() produces.
        db_name: File name inside ../Database (relative to the current working
            directory). The default is the database in which
            delete_spark_sql_table() creates the table; writing anywhere else
            only works if that file already contains the same table.

    SQLite errors are reported with print() and the batch is rolled back; they
    are not re-raised.
    NOTE(review): the caller therefore cannot tell that a write failed and will
    still advance its offset — confirm this best-effort behaviour is intended.
    """
    # The table has no `ticker` column and its key is (category, date), so the
    # upsert needs no extra WHERE clause: ON CONFLICT already targets exactly
    # the row with the same key.
    insert_records = f'''INSERT INTO {SQLITE_TABLENAME} (category, date, count, ind_neg, ind_neu, ind_pos, wted_neg, wted_neu, wted_pos, neg_counts, neu_counts, pos_counts) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                        ON CONFLICT(category, date) DO 
                        UPDATE SET count = count + excluded.count,
                        ind_neg = ind_neg + excluded.ind_neg,
                        ind_neu = ind_neu + excluded.ind_neu,
                        ind_pos = ind_pos + excluded.ind_pos,
                        wted_neg = wted_neg + excluded.wted_neg,
                        wted_neu = wted_neu + excluded.wted_neu,
                        wted_pos = wted_pos + excluded.wted_pos,
                        neg_counts = neg_counts + excluded.neg_counts,
                        neu_counts = neu_counts + excluded.neu_counts,
                        pos_counts = pos_counts + excluded.pos_counts
                    '''

    contents = []
    for (category, tweet_date), totals in rdd.collect():
        tweet_count = totals[0]
        # The six score totals may be NumPy scalars, which sqlite3 cannot
        # always bind; convert every one of them to a plain Python float.
        scores = [float(score) for score in totals[1:7]]
        label_counts = totals[7:10]
        contents.append((category, tweet_date, tweet_count, *scores, *label_counts))

    connection = sqlite3.connect(os.path.join(os.getcwd(), '../Database', db_name))
    try:
        connection.executemany(insert_records, contents)
        connection.commit()
    except sqlite3.Error as error:
        # Leave the table as it was before this batch rather than half-written.
        connection.rollback()
        print({"SQLite write failed": error})
    finally:
        connection.close()
        
        
"""
insert_records = f'''INSERT INTO {SQLITE_TABLENAME} (category, date, count, ind_neg, ind_neu, ind_pos, wted_neg, wted_neu, wted_pos, neg_counts, neu_counts, pos_counts) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                        ON CONFLICT(category, date) DO NOTHING'''
                        
contents.append((row[0][0], row[0][1], row[1][0], float(row[1][1]), float(row[1][2]), float(row[1][3]), row[1][4], row[1][5], row[1][6], row[1][7], row[1][8], row[1][9]))
"""

"\ninsert_records = f'''INSERT INTO {SQLITE_TABLENAME} (category, date, count, ind_neg, ind_neu, ind_pos, wted_neg, wted_neu, wted_pos, neg_counts, neu_counts, pos_counts) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)\n                        ON CONFLICT(category, date) DO \n                        UPDATE SET count = count + excluded.count,\n                        ind_neg = ind_neg + excluded.ind_neg,\n                        ind_neu = ind_neu + excluded.ind_neu,\n                        ind_pos = ind_pos + excluded.ind_pos,\n                        wted_neg = wted_neg + excluded.wted_neg,\n                        wted_neu = wted_neu + excluded.wted_neu,\n                        wted_pos = wted_pos + excluded.wted_pos,\n                        neg_counts = neg_counts + excluded.neg_counts,\n                        neu_counts = neu_counts + excluded.neu_counts,\n                        pos_counts = pos_counts + excluded.pos_counts\n                        WHERE {SQLITE_TABLENAME}.category

In [32]:
def update_offset_table(exceptionFlag=True):
    """Save the global OFFSET for TOPIC and optionally stop the running cell.

    Writes OFFSET into the OFFSET_FINDER table of ../Database/cache.sqlite
    (relative to the current working directory) and prints the offset.

    Args:
        exceptionFlag: When True (the default), OFFSET is re-read from the
            table after the update (set to -1 if TOPIC has no row) and
            StopExecution is raised once the connection is closed. When False,
            the function only stores the offset and returns.

    Raises:
        StopExecution: if exceptionFlag is True.
        sqlite3.Error: if the database or table is unusable. The connection is
            closed in that case as well.
    """
    global OFFSET

    connection = sqlite3.connect(os.path.join(os.getcwd(), '../Database/cache.sqlite'))
    try:
        cursor = connection.cursor()
        try:
            # Both values are bound as parameters, and the topic is matched
            # exactly: LIKE would treat '_' and '%' inside a topic name as
            # wildcards and could overwrite another topic's offset.
            cursor.execute(
                "UPDATE OFFSET_FINDER SET offsetval = ? WHERE topic = ?",
                (OFFSET, TOPIC),
            )
            connection.commit()

            if exceptionFlag:
                rows = cursor.execute(
                    "SELECT offsetval FROM OFFSET_FINDER WHERE topic = ?", [TOPIC]
                ).fetchall()
                OFFSET = rows[0][0] if rows else -1
        finally:
            cursor.close()
    finally:
        connection.close()

    print({f"Updated Starting Offset for {TOPIC}": OFFSET})

    if exceptionFlag:
        raise StopExecution

In [33]:
def _score_tweet(record):
    """Add sentiment fields to one decoded tweet record and return it.

    Adds `neg_score`/`neu_score`/`pos_score` (the three model probabilities)
    and `is_neg`/`is_neu`/`is_pos` (the record's `count` under the winning
    class, 0 under the other two). The record is modified in place.
    """
    sentiments = all_sentiment_scores(record['tweet'])

    record['neg_score'] = sentiments[0][0]
    record['neu_score'] = sentiments[0][1]
    record['pos_score'] = sentiments[0][2]

    # Same label max_sentiment_score() returns (-1/0/1), but derived from the
    # probabilities already computed instead of running the model a second time.
    label = np.argmax(sentiments) - 1

    record['is_neg'] = record['count'] if label == -1 else 0
    record['is_neu'] = record['count'] if label == 0 else 0
    record['is_pos'] = record['count'] if label == 1 else 0
    return record


def _reduce_by_category_date(records):
    """Sum scored tweet records per (category, tweetDate) with Spark.

    Schema of the returned pair RDD:
        Key:    (Category, Date)
        Value:  (Tweet_Count, Ind_Neg, Ind_Neu, Ind_Pos,
                 Wted_Neg, Wted_Neu, Wted_Pos, Is_Neg, Is_Neu, Is_Pos)
    where the Wted_* entries are the scores multiplied by the tweet's count.
    """
    return (
        sc.parallelize(records)
        .map(lambda row: (
            (row['category'], row['tweetDate']),
            (row['count'],
             row['neg_score'], row['neu_score'], row['pos_score'],
             row['count'] * row['neg_score'],
             row['count'] * row['neu_score'],
             row['count'] * row['pos_score'],
             row['is_neg'], row['is_neu'], row['is_pos']),
        ))
        .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3],
                                   a[4] + b[4], a[5] + b[5], a[6] + b[6],
                                   a[7] + b[7], a[8] + b[8], a[9] + b[9]))
    )


def consumer_call(batch_size=500, poll_seconds=10):
    """Consume tweets from Kafka, score them and store the reduced results.

    Repeats until a poll returns no tweets:
      1. start a streaming query (init_df_table) and let it fill an in-memory
         table for `poll_seconds`;
      2. collect the table, stop the query and drop the table;
      3. score the tweets in slices of `batch_size`, reduce each slice per
         (category, date), write it with write_to_db() and persist the
         advanced offset with update_offset_table(False).
    When a poll returns nothing, update_offset_table() is called with its
    default flag, which raises StopExecution and ends the cell.

    Args:
        batch_size: Number of tweets scored and written per slice (default 500).
        poll_seconds: Seconds to let each streaming query collect data
            (default 10).

    Raises:
        ValueError: if batch_size is smaller than 1.
        StopExecution: when there are no more tweets to consume.
    """
    global TABLE_COUNT, OFFSET

    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    TABLE_COUNT = TABLE_COUNT + 1
    delete_spark_sql_table()

    while True:
        # init_df_table() names its sink from the same two globals.
        table_name = f"{IN_MEM_TABLENAME}{TABLE_COUNT}"

        query = init_df_table()
        try:
            sleep(poll_seconds)
            value = spark.sql(f"SELECT * FROM {table_name}").collect()
        finally:
            # Always stop the query: otherwise every pass leaves one more
            # streaming query running in the background.
            query.stop()
        spark.sql(f"DROP TABLE {table_name}")

        TABLE_COUNT = TABLE_COUNT + 1

        total_tweet_count = len(value)
        print({"Tweets collected from select query": total_tweet_count})

        if total_tweet_count == 0:
            # Nothing new on the topic: persist the offset and stop the cell
            # (update_offset_table raises StopExecution here).
            update_offset_table()

        for start in range(0, total_tweet_count, batch_size):
            batch = value[start:start + batch_size]

            tweet_dict_list = []
            p_bar = tqdm(batch, total=len(batch))
            for indx, row in enumerate(p_bar):
                tweet_dict_list.append(_score_tweet(json.loads(row["Data"])))
                p_bar.set_description(f'Working on "{start + indx + 1}/{total_tweet_count}"')

            print({"Number of tweet records" : len(tweet_dict_list)})
            print("----------------------------------------------------------------")

            write_to_db(_reduce_by_category_date(tweet_dict_list))

            # Advance the offset only by what has been handed to the database,
            # so an interruption does not skip tweets that were never scored.
            OFFSET += len(batch)
            update_offset_table(False)

In [34]:
# Run the pipeline; it keeps polling Kafka until a poll returns no tweets.
consumer_call()

root
 |-- Data: string (nullable = true)

+---------+----------+-----------+
|namespace| tableName|isTemporary|
+---------+----------+-----------+
|         |tweetdata1|       true|
+---------+----------+-----------+

{'Tweets collected from select query': 38846}


  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 500}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


0it [00:00, ?it/s]

{'Number of tweet records': 346}
----------------------------------------------------------------


  self._sock = None
  self._sock = None
  self._sock = None


root
 |-- Data: string (nullable = true)

+---------+----------+-----------+
|namespace| tableName|isTemporary|
+---------+----------+-----------+
|         |tweetdata2|       true|
+---------+----------+-----------+

{'Tweets collected from select query': 0}
{'Updated Starting Offset for tweets-category-topic': 47544}


  self._sock = None


**NOTE**: When re-running the program with a saved offset > 0:
- Cells 19 - 24 (from the cell that takes limited data from IN_MEM_TABLE up to the cell with the sqlite3 db connection): comment them out fully.
- Cell 25: don't call `delete_spark_sql_table()`.
