# Search Application

In [13]:
! pip install psycopg2 pymongo fastapi requests



In [14]:
# importing required libraries
import psycopg2
from pymongo import MongoClient
from fastapi import HTTPException
from exceptions.exceptions import *
import requests

## Connecting to the Databases Storing the Information

In [15]:
# connecting to the PostgreSQL database
try:
    p_conn = psycopg2.connect(
        dbname = "twitter",
        user = "varshiniyanamandra",
        password = "",
        host = "localhost",
        port = "5432"
    )
except psycopg2.OperationalError as e:
    # report the failure, then re-raise it: without a connection the
    # statements below would only fail later with a NameError on `p_conn`
    print(f"Unable to connect to PostgreSQL: {e}")
    raise

# opening a cursor to perform database operations
p_cur = p_conn.cursor()

# print the PostgreSQL server information
print(p_conn.get_dsn_parameters(), "\n")

{'user': 'varshiniyanamandra', 'passfile': '/Users/varshiniyanamandra/.pgpass', 'dbname': 'twitter', 'host': 'localhost', 'port': '5432', 'tty': '', 'options': '', 'sslmode': 'prefer', 'sslcompression': '0', 'gssencmode': 'prefer', 'krbsrvname': 'postgres', 'target_session_attrs': 'any'} 



In [21]:
# connect to the MongoDB database
# NOTE(review): the connection string below embeds a username and password in
# plain text; consider loading it from an environment variable or a secrets
# store instead, and rotating this credential.
mongo_conn = MongoClient("mongodb+srv://vm574:twitter574@cluster0.nwilsw2.mongodb.net/?retryWrites=true&w=majority")
# select the 'twitter_data' database and its 'tweets' collection
mongo_db = mongo_conn['twitter_data']
tweets_collection = mongo_db['tweets']

# creating a text index on the "text" field of the tweets collection
tweets_collection.create_index([("text", "text")])

'text_text'

## Building the Search Application

### Defining Utility Functions

In [17]:
# function to get user information
def get_user_info(username):
    """
        Return the stored profile of a Twitter user as a JSON-style dictionary.

        The user is looked up by screen name in the TwitterUser table of the
        PostgreSQL database, using a connection opened for this call only.
        Input:
            username (str): screen name of the user we want to look up
        Output:
            user_out (dict): user information corresponding to username, with
                keys 'id', 'name', 'screen_name', 'location', 'created_at',
                'followers_count', 'friends_count', 'statuses_count' and
                'favorites_count' (taken from the row by column position)
        Raises:
            HTTPException: if no user with this screen name exists
    """
    username = str(username)

    p_conn = psycopg2.connect(
        dbname = "twitter",
        user = "varshiniyanamandra",
        password = "",
        host = "localhost",
        port = "5432"
    )
    try:
        p_cur = p_conn.cursor()
        try:
            # pass the screen name as a bound parameter so that quotes in the
            # input cannot change the SQL statement (prevents SQL injection)
            p_cur.execute("SELECT * FROM TwitterUser WHERE screen_name = %s;", (username,))
            user_info = p_cur.fetchone()
        finally:
            p_cur.close()
    finally:
        # always release the connection, including when the lookup fails
        p_conn.close()

    if user_info is None:
        # raise an exception if the user doesn't exist in the database
        raise HTTPException(status_code = UserNotFoundError.code, detail = UserNotFoundError.description)

    user_out = {
        'id': user_info[0],
        'name': user_info[1],
        'screen_name': user_info[2],
        'location': user_info[3],
        'created_at': user_info[4],
        'followers_count': user_info[5],
        'friends_count': user_info[6],
        'statuses_count': user_info[7],
        'favorites_count': user_info[8]
    }

    return user_out

In [18]:
# example: look up the stored profile for the screen name "PawanKalyan"
get_user_info("PawanKalyan")

{'id': 2719753171,
 'name': 'Pawan Kalyan',
 'screen_name': 'PawanKalyan',
 'location': None,
 'created_at': datetime.datetime(2014, 8, 9, 17, 6, 12),
 'followers_count': 3957996,
 'friends_count': 161,
 'statuses_count': 1408,
 'favorites_count': 2}

In [28]:
# function to retrieve tweets containing a specified keyword
def retrieve_tweets_keyword(keyword: str, sort_criterion = None):
    """
        Run a text search on the tweets collection and return the matching tweets.

        At most 10 matching documents are fetched; the sort is applied to
        those fetched results.
        Input:
            keyword (str): user-specified keyword
            sort_criterion (str): criteria for sorting the results
                default (None): decreasing order of popularity (favorite count)
                valid inputs:
                    'oldestToNewest', 'newestToOldest', 'popularity'
        Output:
            out (list): list of tweet dictionaries matching the keyword
        Raises:
            HTTPException: if sort_criterion is given but is not a valid input
    """
    # reject an unknown sort criterion before touching the database
    allowed = ('oldestToNewest', 'newestToOldest', 'popularity')
    if sort_criterion is not None and sort_criterion not in allowed:
        raise HTTPException(status_code = InvalidSortCriterionError.code, detail = InvalidSortCriterionError.description)

    def as_tweet(doc):
        # map a stored document onto the tweet dictionary returned to callers
        return {
            'id': doc['_id'],
            'text': doc['text'],
            'user_id': doc['user_id'],
            'quote_count': doc['quote_count'],
            'reply_count': doc['reply_count'],
            'retweet_count': doc['retweet_count'],
            'favorite_count': doc['favorite_count'],
            'created_at': doc['timestamp'],
            'coordinates': doc['coordinates'],
            # a document carrying a 'retweet' field is reported as a retweet
            'retweet': "Yes" if 'retweet' in doc else "No",
        }

    matches = tweets_collection.find({'$text': {'$search': keyword}}).limit(10)
    out = [as_tweet(doc) for doc in matches]

    # chronological criteria order by timestamp; everything else
    # (None or 'popularity') orders by decreasing favorite count
    if sort_criterion in ("oldestToNewest", "newestToOldest"):
        field = 'created_at'
        descending = sort_criterion == "newestToOldest"
    else:
        field = 'favorite_count'
        descending = True

    return sorted(out, key = lambda item: int(item[field]), reverse = descending)

In [29]:
# example: keyword search for "corona", sorted from oldest to newest
q = retrieve_tweets_keyword(keyword = "corona", sort_criterion = 'oldestToNewest')
# show the returned list as the cell output
q

[{'id': 1254053756016066561,
  'text': '#FightCoronaNotActivists\nFight Corona not journalists.\nFight Corona not Kashmiris.\nFight Corona not students.\nFight Corona not intellectuals.\nFight Corona not activists.',
  'user_id': 4352044939,
  'quote_count': 0,
  'reply_count': 0,
  'retweet_count': 0,
  'favorite_count': 0,
  'created_at': '1587824689283',
  'coordinates': None,
  'retweet': 'Yes'},
 {'id': 1254054268073652225,
  'text': '#FightCoronaNotActivists\nFight Corona not journalists.\nFight Corona not Kashmiris.\nFight Corona not students.\nFight Corona not intellectuals.\nFight Corona not activists.',
  'user_id': 1214120596184809472,
  'quote_count': 0,
  'reply_count': 0,
  'retweet_count': 0,
  'favorite_count': 0,
  'created_at': '1587824811367',
  'coordinates': None,
  'retweet': 'Yes'},
 {'id': 1254055473034088450,
  'text': '#FightCoronaNotActivists\nFight Corona not journalists.\nFight Corona not Kashmiris.\nFight Corona not students.\nFight Corona not intellectual

In [70]:
# function to search tweets based on tweet id
def retrieve_tweet(tweet_id):
    """
        Look up a single tweet in the tweets collection by its ID.
        Input:
            tweet_id: user-specified tweet ID (matched against the '_id' field)
        Output:
            tweet (JSON object): tweet corresponding to tweet_id
        Raises:
            HTTPException: if no tweet with this ID exists in the database
    """
    doc = tweets_collection.find_one({'_id': tweet_id})
    if doc is None:
        # the tweet doesn't exist in the database
        raise HTTPException(status_code = TweetNotFoundError.code, detail = TweetNotFoundError.description)

    # (output key, stored field) pairs, in the order they appear in the result
    field_map = (
        ('id', '_id'),
        ('text', 'text'),
        ('user_id', 'user_id'),
        ('quote_count', 'quote_count'),
        ('reply_count', 'reply_count'),
        ('retweet_count', 'retweet_count'),
        ('favorite_count', 'favorite_count'),
        ('created_at', 'timestamp'),
        ('coordinates', 'coordinates'),
    )
    tweet = {out_key: doc[stored_key] for out_key, stored_key in field_map}

    # a document carrying a 'retweet' field is reported as a retweet
    tweet['retweet'] = "Yes" if 'retweet' in doc else "No"

    return tweet

In [71]:
# example: fetch a single tweet by its ID
retrieve_tweet(1254053756016066561)

{'id': 1254053756016066561,
 'text': '#FightCoronaNotActivists\nFight Corona not journalists.\nFight Corona not Kashmiris.\nFight Corona not students.\nFight Corona not intellectuals.\nFight Corona not activists.',
 'user_id': 4352044939,
 'quote_count': 0,
 'reply_count': 0,
 'retweet_count': 0,
 'favorite_count': 0,
 'created_at': '1587824689283',
 'coordinates': None,
 'retweet': 'Yes'}

In [87]:
# function to retrieve all tweets by a user
def retrieve_tweets_user(username: str, sort_criterion = 'popularity'):
    """
        Function to retrieve all tweets by a specific user (user-specified username)

        The screen name is first resolved to a user ID in the TwitterUser table
        (PostgreSQL); the tweets collection (MongoDB) is then searched for
        documents with that user ID.
        Input:
            username (str): user-specified username (screen name)
            sort_criterion (str): criteria for sorting the results
                default: decreasing order of popularity (favorite count)
                valid inputs:
                    'oldestToNewest', 'newestToOldest', 'popularity'
                any other value is treated like 'popularity'
        Output:
            tweets_list (list): list of tweets made by a user
                (an empty list if the user has not tweeted anything)
        Raises:
            HTTPException: if no user with this screen name exists
    """

    p_conn = psycopg2.connect(
        dbname = "twitter",
        user = "varshiniyanamandra",
        password = "",
        host = "localhost",
        port = "5432"
    )
    try:
        p_cur = p_conn.cursor()
        try:
            # check if the username is valid; it is passed as a bound parameter
            # so that quotes in the input cannot change the SQL statement
            p_cur.execute("SELECT * FROM TwitterUser WHERE screen_name = %s;", (str(username),))
            username_db = p_cur.fetchone()
        finally:
            p_cur.close()
    finally:
        # always release the connection, including when the lookup fails
        p_conn.close()

    if username_db is None:
        # raise an exception if the user doesn't exist in the database
        raise HTTPException(status_code = UserNotFoundError.code, detail = UserNotFoundError.description)
    user_id = username_db[0]

    # if the user exists, proceed to search MongoDB
    # (find() yields no documents when nothing matches, giving an empty list)
    tweets_list = []
    for result in tweets_collection.find({'user_id': user_id}):
        tweet = {
            'id': result['_id'],
            'text': result['text'],
            'user_id': result['user_id'],
            'quote_count': result['quote_count'],
            'reply_count': result['reply_count'],
            'retweet_count': result['retweet_count'],
            'favorite_count': result['favorite_count'],
            'created_at': result['timestamp'],
            'coordinates': result['coordinates']
        }
        # add information on whether the tweet is a retweet
        tweet['retweet'] = "Yes" if 'retweet' in result else "No"

        tweets_list.append(tweet)

    if sort_criterion == "oldestToNewest":
        # sort the results from oldest to newest
        tweets_list = sorted(tweets_list, key = lambda x: int(x['created_at']), reverse = False)
    elif sort_criterion == "newestToOldest":
        # sort the results from newest to oldest
        tweets_list = sorted(tweets_list, key = lambda x: int(x['created_at']), reverse = True)
    else:
        # sort in decreasing order of favorites (popularity), by default or if specified 'popularity'
        tweets_list = sorted(tweets_list, key = lambda x: int(x['favorite_count']), reverse = True)

    return tweets_list

In [90]:
# example: all tweets by the user "kaajalActress" (default sort: popularity)
retrieve_tweets_user("kaajalActress")

[{'id': 1254053756016066561,
  'text': '#FightCoronaNotActivists\nFight Corona not journalists.\nFight Corona not Kashmiris.\nFight Corona not students.\nFight Corona not intellectuals.\nFight Corona not activists.',
  'user_id': 4352044939,
  'quote_count': 0,
  'reply_count': 0,
  'retweet_count': 0,
  'favorite_count': 0,
  'created_at': '1587824689283',
  'coordinates': None,
  'retweet': 'Yes'}]

In [93]:
# function to retrieve the screen name from the user_id
def retreive_screen_name(user_id):
    """
        Function to retrieve the screen name of the user with a given user ID.

        The lookup is done in the TwitterUser table of the PostgreSQL database,
        using a connection opened for this call only.
        Input:
            user_id: user-specified user ID
        Output:
            username (str): username corresponding to the specified user_id
        Raises:
            HTTPException: if no user with this ID exists
    """
    p_conn = psycopg2.connect(
        dbname = "twitter",
        user = "varshiniyanamandra",
        password = "",
        host = "localhost",
        port = "5432"
    )
    try:
        p_cur = p_conn.cursor()
        try:
            # check if the user id is valid; the ID is passed as a bound string
            # parameter so that quotes in the input cannot change the SQL statement
            p_cur.execute("SELECT screen_name FROM TwitterUser WHERE id = %s;", (str(user_id),))
            username_db = p_cur.fetchone()
        finally:
            p_cur.close()
    finally:
        # always release the connection, including when the lookup fails
        p_conn.close()

    if username_db is None:
        # raise an exception if the user doesn't exist in the database
        raise HTTPException(status_code = UserNotFoundError.code, detail = UserNotFoundError.description)

    username = username_db[0]

    return username

In [94]:
# example: resolve a user ID to its screen name
retreive_screen_name(4352044939)

'kaajalActress'

In [105]:
# function to retrieve tweets based on location
def retrieve_tweets_location(location: str, distance = 100000, sort_criterion = 'popularity'):
    """
        Function to retrieve tweets near a specified location.

        The location name is geocoded through the Nominatim search API, and the
        tweets collection is then queried for documents whose coordinates lie
        within `distance` of that point.
        Input:
            location (str): user-specified location
            distance (int): radius of the search (100 kilometers, by default)
            sort_criterion (str): criteria for sorting the results
                default: decreasing order of popularity (favorite count)
                valid inputs:
                    'oldestToNewest', 'newestToOldest', 'popularity'
                any other value is treated like 'popularity'
        Output:
            tweets_list (list): list of tweets made from within the radius of the specified location
                (an empty list if there are none)
        Raises:
            HTTPException: (status 404) if the geocoding service returns no
                match for the location
    """
    # getting the latitude and longitude of the location specified
    endpoint = "https://nominatim.openstreetmap.org/search"
    params = {"q": location, "format": "json", "limit": 1}
    # sending a request to the Nominatim API; the timeout keeps the call from
    # blocking indefinitely if the service does not respond
    response = requests.get(endpoint, params=params, timeout=10)
    matches = response.json()
    if not matches:
        # an unknown location yields an empty result list; report it clearly
        # instead of failing with an IndexError
        raise HTTPException(status_code = 404, detail = "Location not found.")
    # getting the latitude and longitude from the response of the API
    latitude = float(matches[0]["lat"])
    longitude = float(matches[0]["lon"])

    # creating a geospatial index on the coordinates field
    tweets_collection.create_index([("coordinates", "2dsphere")])

    # the query point is given in [longitude, latitude] order
    query = {"coordinates": {"$near": {"$geometry": {"type": "Point", "coordinates": [longitude, latitude]}, "$maxDistance": distance}}}

    # find() yields no documents when nothing matches, giving an empty list
    tweets_list = []
    for result in tweets_collection.find(query):
        tweet = {
            'id': result['_id'],
            'text': result['text'],
            'user_id': result['user_id'],
            'quote_count': result['quote_count'],
            'reply_count': result['reply_count'],
            'retweet_count': result['retweet_count'],
            'favorite_count': result['favorite_count'],
            'created_at': result['timestamp'],
            'coordinates': result['coordinates']
        }
        # add information on whether the tweet is a retweet
        tweet['retweet'] = "Yes" if 'retweet' in result else "No"

        tweets_list.append(tweet)

    if sort_criterion == "oldestToNewest":
        # sort the results from oldest to newest
        tweets_list = sorted(tweets_list, key = lambda x: int(x['created_at']), reverse = False)
    elif sort_criterion == "newestToOldest":
        # sort the results from newest to oldest
        tweets_list = sorted(tweets_list, key = lambda x: int(x['created_at']), reverse = True)
    else:
        # sort in decreasing order of favorites (popularity), by default or if specified 'popularity'
        tweets_list = sorted(tweets_list, key = lambda x: int(x['favorite_count']), reverse = True)

    return tweets_list

In [106]:
# example: tweets near "New York City" with the default radius and default popularity sort
retrieve_tweets_location("New York City")

[{'id': 1254042742868459520,
  'text': 'Again, thank you for supporting Miz Magickal! Your orders are shipping this week.   All Miz Magickal products are prepared and packaged with sanitary practices in place as always even before this corona virus!  Be… https://t.co/souHAFw3Pn',
  'user_id': 860598230396141569,
  'quote_count': 0,
  'reply_count': 0,
  'retweet_count': 0,
  'favorite_count': 0,
  'created_at': '1587822063544',
  'coordinates': {'type': 'Point', 'coordinates': [-73.9259, 40.7722]},
  'retweet': 'No'}]

## Main Search Function

### Search functions currently supported:
- get_user_info(username)
- retrieve_tweets_keyword(keyword: str, sort_criterion = None)
- retrieve_tweet(tweet_id)
- retrieve_tweets_user(username: str, sort_criterion = 'popularity')
- retreive_screen_name(user_id)
- retrieve_tweets_location(location: str, distance = 100000, sort_criterion = 'popularity')

In [None]:
def search(username = None, username_tweets = None, user_id = None, tweet_id = None, keyword = None, location = None, sort_criterion = 'popularity', distance = 100000):
    """
        Main search entry point: dispatch to one of the search functions.

        Exactly one of the six search parameters must be given; it selects
        which function is called.
        Input:
            username: screen name -> get_user_info
            username_tweets: screen name -> retrieve_tweets_user
            user_id: user ID -> retreive_screen_name
            tweet_id: tweet ID -> retrieve_tweet
            keyword: keyword -> retrieve_tweets_keyword
            location: location -> retrieve_tweets_location
            sort_criterion (str): passed on to the tweet-list searches
            distance (int): passed on to the location search
        Output:
            whatever the selected search function returns
        Raises:
            HTTPException: if no search parameter, or more than one, is given
    """
    candidates = (username, username_tweets, user_id, tweet_id, keyword, location)
    given = sum(1 for value in candidates if value is not None)

    # no search parameters specified
    if given == 0:
        raise HTTPException(status_code = NoParametersGivenError.code, detail = NoParametersGivenError.description)
    # too many search parameters specified
    if given > 1:
        raise HTTPException(status_code = TooManyParametersGivenError.code, detail = TooManyParametersGivenError.description)

    # exactly one parameter is set from here on; route to its search function
    if username is not None:
        return get_user_info(username)
    if username_tweets is not None:
        return retrieve_tweets_user(username_tweets, sort_criterion)
    if user_id is not None:
        return retreive_screen_name(user_id)
    if tweet_id is not None:
        return retrieve_tweet(tweet_id)
    if keyword is not None:
        return retrieve_tweets_keyword(keyword, sort_criterion)
    return retrieve_tweets_location(location, distance, sort_criterion)