In [1]:
# !pip install redis

In [2]:
import redis
import pymongo
import mysql.connector
import pandas as pd
import json
import tweepy
import sys
from dotenv import dotenv_values
from datetime import datetime, timezone

In [3]:
# Load the key/value settings from the local ".env" file into `config`.
# Keys read later in this notebook: TWEET_MAX, CONSUMER_KEY, CONSUMER_SECRET,
# ACCESS_TOKEN, ACCESS_TOKEN_SECRET, USER_SQL, PASSWORD_SQL, USER_MONGO,
# PASSWORD_MONGO.
config = dotenv_values(".env")  # e.g. config = {"USER": "foo", "EMAIL": "foo@example.org"}


# Step 1: Data Collection

In [4]:
tweet_counter = 0
TWEET_MAX = int(config['TWEET_MAX'])
class MyStreamListener(tweepy.StreamListener):
    """Stream listener that stores at most TWEET_MAX tweets as one JSON array.

    The caller must already have written the opening '[' to ``write_file``.
    The listener writes the comma-separated tweet objects, then the closing
    ']' and closes the file once the limit is reached (or the stream is
    rate limited), and finally asks tweepy to disconnect by returning False.
    """

    def __init__(self, api, write_file):
        """Remember the API handle and output file and restart the counter.

        Args:
            api: authenticated ``tweepy.API`` instance.
            write_file: writable text file object that receives the tweets.
        """
        global tweet_counter
        super().__init__(api)
        self.api = api
        self.me = api.me()
        self.write_file = write_file
        # A new listener writes into a freshly started array, so the
        # module-level counter has to start from zero again.
        tweet_counter = 0

    def _finish(self):
        """Write the closing ']' and close the file, at most once."""
        if not self.write_file.closed:
            self.write_file.write(']')
            self.write_file.close()

    def on_status(self, tweet):
        """Append one tweet to the output file.

        Returns:
            False once TWEET_MAX tweets have been stored, which tells tweepy
            to disconnect the stream; True otherwise.
        """
        global tweet_counter
        if self.write_file.closed or tweet_counter >= TWEET_MAX:
            # Nothing more may be written (also covers TWEET_MAX <= 0).
            self._finish()
            return False

        tweet_counter += 1
        print("tweet_counter", tweet_counter)
        # The separator goes *before* every tweet but the first, so the array
        # stays valid JSON even if the stream stops earlier than planned.
        if tweet_counter > 1:
            self.write_file.write(',')
        json.dump(tweet._json, self.write_file)

        if tweet_counter >= TWEET_MAX:
            self._finish()
            print("Reached max allowed tweets:", TWEET_MAX)
            # Returning False disconnects cleanly; sys.exit() here would raise
            # SystemExit through tweepy's read loop and skip its cleanup.
            return False
        return True

    def on_error(self, status):
        """Report a stream error; stop streaming when rate limited (HTTP 420).

        Returns:
            False for status 420 so tweepy does not keep reconnecting (which
            lengthens the rate-limit window); None otherwise, leaving
            tweepy's default retry behaviour in place.
        """
        print("Error detected", status)
        if status == 420:
            self._finish()
            return False

def collect_data(filename="tweet_stream_april11.json", track=None):
    """Stream tweets matching ``track`` from Twitter into a JSON array file.

    Authenticates with the CONSUMER_KEY / CONSUMER_SECRET / ACCESS_TOKEN /
    ACCESS_TOKEN_SECRET values from ``config``, writes the opening '[' of the
    array and hands the file to ``MyStreamListener``, which writes the tweets
    and the closing ']'.

    Args:
        filename: path of the output file; it is overwritten.
        track: list of keywords/hashtags to filter on. Defaults to the
            original hard-coded list.
    """
    if track is None:
        track = ["#sundayvibes", "UFCVegas23", "#WrestleMania"]

    auth = tweepy.OAuthHandler(config['CONSUMER_KEY'], config['CONSUMER_SECRET'])
    auth.set_access_token(config['ACCESS_TOKEN'], config['ACCESS_TOKEN_SECRET'])

    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

    write_file = open(filename, "w")
    try:
        write_file.write('[')
        tweets_listener = MyStreamListener(api, write_file)
        stream = tweepy.Stream(api.auth, tweets_listener)
        stream.filter(track=track)
    finally:
        # The listener normally closes the file itself; make sure the handle
        # is released as well when authentication or the stream fails.
        if not write_file.closed:
            write_file.close()


# Only run once to collect tweets

In [5]:
# collect_data()

# Step 2: Data Storage

**Set up MySQL and MongoDB connections**

In [7]:
def setup_mysql():
    """Connect to the local MySQL server and (re)create the ``user`` table.

    Credentials are read from ``config['USER_SQL']`` and
    ``config['PASSWORD_SQL']``; host (``localhost``) and database
    (``tweets_db_sql``) are fixed. An existing ``user`` table is dropped
    first, so every call starts with an empty table.

    Returns:
        (conn, cursor): the open connection with autocommit enabled and a
        dictionary cursor on it. The caller must close both.

    Raises:
        Any exception raised while preparing the table is re-raised after the
        connection has been closed, so a failed setup does not leak it.
    """
    properties = {
        'user': config['USER_SQL'],
        'password': config['PASSWORD_SQL'],
        'host': 'localhost',
        'database': 'tweets_db_sql',
        'raise_on_warnings': True,
    }
    create_table = """
        CREATE TABLE user 
          ( 
             sql_user_id      VARCHAR(255),
             user_name        VARCHAR(255), 
             screen_name      VARCHAR(255), 
             followers_count  BIGINT, 
             friends_count    BIGINT, 
             listed_count     BIGINT, 
             favourites_count BIGINT, 
             statuses_count   BIGINT, 
             INDEX(screen_name, followers_count)
             

          );
        """
    conn = mysql.connector.connect(**properties)
    try:
        conn.autocommit = True
        cursor = conn.cursor(dictionary = True)
        # Check for the table explicitly so DROP is only issued when it exists.
        cursor.execute("SHOW TABLES LIKE 'user';")
        result = cursor.fetchone()
        if result:
            print("MySQL table user exists. Will be dropped and recreated...")
            cursor.execute("DROP TABLE user;")
        cursor.execute(create_table)
    except Exception:
        conn.close()
        raise
    return conn, cursor

In [8]:
# Open the MySQL connection. setup_mysql() drops and recreates the `user`
# table, so rows stored by an earlier run are lost.
sql_conn, sql_cursor = setup_mysql()

MySQL table user exists. Will be dropped and recreated...


In [9]:
client = None
def setup_mongodb():
    """Connect to the MongoDB cluster and return an empty ``tweets_db_mongo``.

    Credentials are read from ``config['USER_MONGO']`` and
    ``config['PASSWORD_MONGO']``. An existing ``tweets_db_mongo`` database
    (and a leftover ``tweets_col`` collection, if any) is dropped first. The
    created ``MongoClient`` is stored in the module-level ``client`` so it can
    be closed later.

    Returns:
        The ``tweets_db_mongo`` database handle.
    """
    global client
    # Local import keeps this cell self-contained.
    from urllib.parse import quote_plus

    # Credentials placed in a MongoDB URI must be percent-escaped; otherwise
    # characters such as '@', ':' or '/' in a password corrupt the URI.
    user = quote_plus(config['USER_MONGO'])
    password = quote_plus(config['PASSWORD_MONGO'])
    conn_string = f"mongodb+srv://{user}:{password}@cluster0.6iqrn.mongodb.net"
    client = pymongo.MongoClient(conn_string)

    dbnames = client.list_database_names()
    if "tweets_db_mongo" in dbnames:
        print("db exists. Will be deleted...")
        client.drop_database("tweets_db_mongo")

    tweets_db_mongo = client["tweets_db_mongo"]
    col_names = tweets_db_mongo.list_collection_names()
    if "tweets_col" in col_names:
        print("Tweets Collection exists. Will be deleted...")
        tweets_db_mongo.tweets_col.drop()
    return tweets_db_mongo

In [11]:
# Connect to MongoDB. setup_mongodb() drops any existing `tweets_db_mongo`
# database before returning the (empty) database handle.
tweets_db_mongo = setup_mongodb()

db exists. Will be deleted...


**Get twitter data from previous step**

In [12]:
def get_json_data(filename):
    """Read ``filename`` and return its parsed JSON content."""
    with open(filename, "r") as source:
        return json.load(source)

In [13]:
# Load the tweets written to disk by collect_data() back as parsed JSON.
new_json_data = get_json_data('tweet_stream_april11.json')

In [15]:
# def insert_mysql(record, sql_cursor):
#     insert_query = """
    
#     INSERT INTO user 
#             ( 
#                         sql_user_id,
#                         sql_tweet_id,
#                         user_name, 
#                         screen_name, 
#                         followers_count, 
#                         friends_count, 
#                         listed_count, 
#                         favourites_count, 
#                         statuses_count 
#             ) 
#             VALUES 
#             ( 
#                         '{}','{}','{}', '{}', {}, {}, {}, {}, {} 
#             );""".format(*record)
#     try:
#         sql_cursor.execute(insert_query)
# except mysql.connector.Error as err:
#   print("Something went wrong: {}".format(err))
    

In [16]:
def insert_mysql(record, sql_cursor):
    """Insert one user row into the MySQL ``user`` table.

    The values are sent as query parameters rather than being formatted into
    the SQL text, so quotes or SQL fragments in tweet-supplied fields (such
    as the user name) cannot break or alter the statement.

    Args:
        record: sequence of eight values in column order: sql_user_id,
            user_name, screen_name, followers_count, friends_count,
            listed_count, favourites_count, statuses_count.
        sql_cursor: open MySQL cursor used to execute the statement.

    MySQL errors are reported on stdout and otherwise ignored, so one bad
    record does not abort a bulk load.
    """
    insert_query = """
    INSERT INTO user
            (
                        sql_user_id,
                        user_name,
                        screen_name,
                        followers_count,
                        friends_count,
                        listed_count,
                        favourites_count,
                        statuses_count
            )
            VALUES
            (
                        %s, %s, %s, %s, %s, %s, %s, %s
            );"""
    try:
        sql_cursor.execute(insert_query, tuple(record))
    except mysql.connector.Error as err:
        print("Something went wrong: {}".format(err))
    

In [17]:
def insert_mongo(document_dict, tweets_db_mongo):
    """Store ``document_dict`` as one document in the ``tweets_col`` collection."""
    collection = tweets_db_mongo.tweets_col
    collection.insert_one(document_dict)
    

In [21]:
import time
import re


def store_data_mongo_mysql(json_data, sql_conn, sql_cursor, tweets_db_mongo):
    """Store every tweet in MongoDB and its author in the MySQL ``user`` table.

    For each tweet a flat document (ids, creation date, author summary,
    hashtags, text and retweet details) is passed to ``insert_mongo`` and an
    eight-value user record is passed to ``insert_mysql``.

    Args:
        json_data: iterable of tweet dicts as loaded from the stream file.
        sql_conn: MySQL connection; accepted for interface compatibility and
            not used by this function.
        sql_cursor: MySQL cursor handed to ``insert_mysql``.
        tweets_db_mongo: MongoDB database handle handed to ``insert_mongo``.
    """
    for row in json_data:
        user = row['user']
        # Apostrophes are removed from the display name before it is stored,
        # as the original pipeline did.
        user_record = [
            user['id_str'],
            user['name'].replace("'", ""),
            user['screen_name'],
            user['followers_count'],
            user['friends_count'],
            user['listed_count'],
            user['favourites_count'],
            user['statuses_count'],
        ]

        text = row['text']
        hashtags = [tag['text'] for tag in row['entities']['hashtags']]

        # A tweet counts as a retweet when its text starts with "RT".
        is_retweet = text[0:2] == 'RT'
        retweet_text = None
        retweet_hashtags = None
        if is_retweet:
            try:
                retweeted = row['retweeted_status']
                retweet_text = retweeted['text']
                retweet_hashtags = [tag['text'] for tag in retweeted['entities']['hashtags']]
            except (KeyError, TypeError):
                # Text starts with "RT" but the retweet payload is missing or
                # incomplete: keep the flag, store no retweet details.
                retweet_text = None
                retweet_hashtags = None

        # Twitter's "Sun Apr 11 19:32:49 +0000 2021" -> "2021-04-11 19:32:49".
        created_date = datetime.strptime(
            row['created_at'], '%a %b %d %H:%M:%S +0000 %Y'
        ).strftime('%Y-%m-%d %H:%M:%S')

        document_dict = {"tweet_id": row['id_str'],
                         "created_date": created_date,
                         "user_id": user['id_str'],
                         "screen_name": user['screen_name'],
                         "followers_count": user['followers_count'],
                         "favorite_count": row['favorite_count'],
                         "original_hash": hashtags,
                         "retweet_hash": retweet_hashtags,
                         "is_retweet": is_retweet,
                         "tweet_text": text,
                         "retweet_text": retweet_text}
        insert_mongo(document_dict, tweets_db_mongo)
        insert_mysql(user_record, sql_cursor)





In [23]:
# store_data_mongo_mysql(new_json_data, sql_conn, sql_cursor, tweets_db_mongo)

# Create indexes on MongoDB fields

In [34]:
pd.DataFrame(tweets_db_mongo.tweets_col.index_information())

Unnamed: 0,_id_,tweet_id_1,user_id_1,created_date_1,favorite_count_1
v,2,2,2,2,2
key,"[(_id, 1)]","[(tweet_id, 1)]","[(user_id, 1)]","[(created_date, 1)]","[(favorite_count, 1)]"


In [44]:
# Create one single-field index per field on the tweets collection.
# create_index() returns the index name, so the cell displays the name of the
# last index created.
tweets_db_mongo.tweets_col.create_index("tweet_id")
tweets_db_mongo.tweets_col.create_index("user_id")
tweets_db_mongo.tweets_col.create_index("created_date")
tweets_db_mongo.tweets_col.create_index("followers_count")

'followers_count_1'

In [45]:
pd.DataFrame(tweets_db_mongo.tweets_col.list_indexes())



Unnamed: 0,v,key,name
0,2,{'_id': 1},_id_
1,2,{'tweet_id': 1},tweet_id_1
2,2,{'user_id': 1},user_id_1
3,2,{'created_date': 1},created_date_1
4,2,{'followers_count': 1},followers_count_1


In [42]:
# tweets_db_mongo.tweets_col.drop_index('favorite_count_1')

In [46]:
pd.DataFrame(tweets_db_mongo.tweets_col.index_information())


Unnamed: 0,_id_,tweet_id_1,user_id_1,created_date_1,followers_count_1
v,2,2,2,2,2
key,"[(_id, 1)]","[(tweet_id, 1)]","[(user_id, 1)]","[(created_date, 1)]","[(followers_count, 1)]"


In [39]:
pd.DataFrame(tweets_db_mongo.tweets_col.find({}).limit(10))

Unnamed: 0,_id,tweet_id,created_date,user_id,screen_name,followers_count,favorite_count,original_hash,retweet_hash,is_retweet,tweet_text,retweet_text
0,60900d7311ce695c2f8f5319,1381329407902633984,2021-04-11 19:32:49,3017826134,GenYtakeover,271,0,[WrestleMania],[WrestleMania],True,RT @ROUSEYSHIRAl: When you appear for 35 secon...,When you appear for 35 seconds and the entire ...
1,60900d7311ce695c2f8f531a,1381329408309530626,2021-04-11 19:32:49,7517222,WWE,11230762,0,[WrestleMania],[WrestleMania],True,RT @KalistoWWE: Mi destino esta en tus manos.....,Mi destino esta en tus manos...\n\n#WrestleMan...
2,60900d7311ce695c2f8f531b,1381329409391661062,2021-04-11 19:32:49,611305033,WolfGangChino,1963,0,[WrestleMania],[WrestleMania],True,RT @TripleH: .@sanbenito’s performance at #Wre...,.@sanbenito’s performance at #WrestleMania was...
3,60900d7311ce695c2f8f531c,1381329410452824066,2021-04-11 19:32:49,3128909037,SRTUPodcast,2196,0,[WrestleMania],[WrestleMania],True,RT @SRTUJeff: Last night's #WrestleMania was a...,Last night's #WrestleMania was all about hope....
4,60900d7311ce695c2f8f531d,1381329411908251649,2021-04-11 19:32:50,762686303288762369,NataliLagunes,585,0,[WrestleMania],[WrestleMania],True,RT @TripleH: .@sanbenito’s performance at #Wre...,.@sanbenito’s performance at #WrestleMania was...
5,60900d7311ce695c2f8f531e,1381329411098693641,2021-04-11 19:32:50,699445358556893184,Nickhaddox3,1577,0,[WrestleMania],,False,"@WWE wrestlemania day 2 Haddox boys style, let...",
6,60900d7311ce695c2f8f531f,1381329412117909504,2021-04-11 19:32:50,120355519,_130990,571,0,[WrestleMania],[WrestleMania],True,RT @TripleH: .@sanbenito’s performance at #Wre...,.@sanbenito’s performance at #WrestleMania was...
7,60900d7311ce695c2f8f5320,1381329412168245253,2021-04-11 19:32:50,1150200722111098880,YumenoTsuxuki,740,0,[],[WrestleMania],True,RT @AlexM_talkSPORT: Adam Pearce and Drew Gula...,Adam Pearce and Drew Gulak were assigned to he...
8,60900d7311ce695c2f8f5321,1381329413007155204,2021-04-11 19:32:50,1154775499,oscarcuadradoxx,109,0,[WrestleMania],[],True,RT @EdgeRatedR: 10 years ago today I was force...,10 years ago today I was forced to retire. For...
9,60900d7311ce695c2f8f5322,1381329413615222784,2021-04-11 19:32:50,756722922849308672,rainbowlily1987,80,0,"[SundayVibes, HereComesNiko]",[SundayVibes],True,RT @HereComesNiko: What can be more relaxing t...,What can be more relaxing than swimming in pon...


In [47]:
tweets_db_mongo.tweets_col.find_one()

{'_id': ObjectId('60900d7311ce695c2f8f5319'),
 'tweet_id': '1381329407902633984',
 'created_date': '2021-04-11 19:32:49',
 'user_id': '3017826134',
 'screen_name': 'GenYtakeover',
 'followers_count': 271,
 'favorite_count': 0,
 'original_hash': ['WrestleMania'],
 'retweet_hash': ['WrestleMania'],
 'is_retweet': True,
 'tweet_text': 'RT @ROUSEYSHIRAl: When you appear for 35 seconds and the entire arena chants your name, @itsBayleyWWE. #WrestleMania https://t.co/REISyl57kA',
 'retweet_text': 'When you appear for 35 seconds and the entire arena chants your name, @itsBayleyWWE. #WrestleMania https://t.co/REISyl57kA'}

In [50]:
# Select the tweets created during one specific minute. `created_date` is
# stored as a zero-padded 'YYYY-MM-DD HH:MM:SS' string, so the $gte/$lt string
# comparison orders values chronologically.
mongo_query = {"created_date": {"$gte": '2021-04-11 19:32:00', "$lt": '2021-04-11 19:33:00'}}
pd.DataFrame(tweets_db_mongo.tweets_col.find(mongo_query))

Unnamed: 0,_id,tweet_id,created_date,user_id,screen_name,followers_count,favorite_count,original_hash,retweet_hash,is_retweet,tweet_text,retweet_text
0,60900d7311ce695c2f8f5319,1381329407902633984,2021-04-11 19:32:49,3017826134,GenYtakeover,271,0,[WrestleMania],[WrestleMania],True,RT @ROUSEYSHIRAl: When you appear for 35 secon...,When you appear for 35 seconds and the entire ...
1,60900d7311ce695c2f8f531a,1381329408309530626,2021-04-11 19:32:49,7517222,WWE,11230762,0,[WrestleMania],[WrestleMania],True,RT @KalistoWWE: Mi destino esta en tus manos.....,Mi destino esta en tus manos...\n\n#WrestleMan...
2,60900d7311ce695c2f8f531b,1381329409391661062,2021-04-11 19:32:49,611305033,WolfGangChino,1963,0,[WrestleMania],[WrestleMania],True,RT @TripleH: .@sanbenito’s performance at #Wre...,.@sanbenito’s performance at #WrestleMania was...
3,60900d7311ce695c2f8f531c,1381329410452824066,2021-04-11 19:32:49,3128909037,SRTUPodcast,2196,0,[WrestleMania],[WrestleMania],True,RT @SRTUJeff: Last night's #WrestleMania was a...,Last night's #WrestleMania was all about hope....
4,60900d7311ce695c2f8f531d,1381329411908251649,2021-04-11 19:32:50,762686303288762369,NataliLagunes,585,0,[WrestleMania],[WrestleMania],True,RT @TripleH: .@sanbenito’s performance at #Wre...,.@sanbenito’s performance at #WrestleMania was...
5,60900d7311ce695c2f8f531e,1381329411098693641,2021-04-11 19:32:50,699445358556893184,Nickhaddox3,1577,0,[WrestleMania],,False,"@WWE wrestlemania day 2 Haddox boys style, let...",
6,60900d7311ce695c2f8f531f,1381329412117909504,2021-04-11 19:32:50,120355519,_130990,571,0,[WrestleMania],[WrestleMania],True,RT @TripleH: .@sanbenito’s performance at #Wre...,.@sanbenito’s performance at #WrestleMania was...
7,60900d7311ce695c2f8f5320,1381329412168245253,2021-04-11 19:32:50,1150200722111098880,YumenoTsuxuki,740,0,[],[WrestleMania],True,RT @AlexM_talkSPORT: Adam Pearce and Drew Gula...,Adam Pearce and Drew Gulak were assigned to he...
8,60900d7311ce695c2f8f5321,1381329413007155204,2021-04-11 19:32:50,1154775499,oscarcuadradoxx,109,0,[WrestleMania],[],True,RT @EdgeRatedR: 10 years ago today I was force...,10 years ago today I was forced to retire. For...
9,60900d7311ce695c2f8f5322,1381329413615222784,2021-04-11 19:32:50,756722922849308672,rainbowlily1987,80,0,"[SundayVibes, HereComesNiko]",[SundayVibes],True,RT @HereComesNiko: What can be more relaxing t...,What can be more relaxing than swimming in pon...


## Close database connections so the databases can be accessed from the Search Application

In [40]:
# Release the MySQL cursor and connection returned by setup_mysql() and the
# MongoDB client created by setup_mongodb().
sql_cursor.close()
sql_conn.close()
client.close()


# Create Redis cache

In [31]:
# user_id = '433658939'
# tweet_id = '1386384669197750272'
# sql_cursor.execute("SELECT * FROM user WHERE sql_user_id = '433658939' AND sql_tweet_id = '1386384669197750272';")
# sql_res = sql_cursor.fetchall()
# # print(sql_res)
# pd.DataFrame(sql_res)


In [32]:
# Client for the local Redis server; the port is given as an integer rather
# than a string.
redis_client = redis.Redis(host='localhost', port=6379)

In [33]:
from datetime import timedelta
# Cache str(sql_res) under the key 'runner' with a one-minute expiry.
# NOTE(review): `sql_res` is only assigned in the commented-out query cell
# above, so on a fresh kernel this cell raises NameError (see the recorded
# output below). Define `sql_res` before running it.
redis_client.setex(
    'runner',
    timedelta(minutes=1),
    value = str(sql_res)
)

NameError: name 'sql_res' is not defined

In [None]:
redis_client.setex?

In [None]:
redis_client.get('runner')

In [None]:
r = redis.Redis(db=1)

# Search Application