# Collect Tweets into MongoDB Cluster

## Import Python libraries

In [None]:
import pymongo
from pymongo import MongoClient
import json
import tweepy
import twitter
from pprint import pprint
import configparser

##  Load the Authorization Info

Save all the secrets and tokens in a `config.ini` file and use `configparser` to load the authorization info.

In [None]:
# Read the Twitter credentials and the MongoDB connection string from config.ini.
CONFIG_PATH = 'config.ini'

config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files it
# actually parsed, so check the result here instead of failing further down with
# an unhelpful "KeyError: 'mytwitter'".
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(
        "Could not read {!r}; create it with [mytwitter] and [mymongo] sections.".format(CONFIG_PATH)
    )

# Twitter API credentials. The option names (including their spelling) must match
# the keys used in config.ini.
CONSUMER_KEY      = config['mytwitter']['api_key']
CONSUMER_SECRET   = config['mytwitter']['api_secrete']
OAUTH_TOKEN       = config['mytwitter']['access_token']
OATH_TOKEN_SECRET = config['mytwitter']['access_secrete']

# MongoDB connection string passed to MongoClient in the next section.
mongod_connect = config['mymongo']['connection']

## Connect to the MongoDB Cluster

In [None]:
# Connect with the connection string loaded from config.ini.
client = MongoClient(mongod_connect)

# Database "tweet_db" and, inside it, the collection "tweet_collection".
db = client["tweet_db"]
tweet_collection = db["tweet_collection"]

# Unique ascending index on the tweet "id" field, so the same tweet cannot be
# stored twice.
tweet_collection.create_index(
    [("id", pymongo.ASCENDING)],
    unique=True,
)

## Use the Streaming API to Collect Tweets

Authorize the Stream API 

In [None]:
# OAuth handler built from the application (consumer) key pair ...
stream_auth = tweepy.OAuthHandler(
    CONSUMER_KEY,
    CONSUMER_SECRET,
)
# ... and completed with the user's access token pair.
stream_auth.set_access_token(
    OAUTH_TOKEN,
    OATH_TOKEN_SECRET,
)

# API object whose .auth attribute is handed to tweepy.Stream when streaming starts.
strem_api = tweepy.API(stream_auth)

Define the query for the Stream API

In [None]:
# Keywords to track: tweets that contain "JMU".
track = ['jmu']

# Location filter for Harrisonburg, VA, given as a bounding box of
# longitude/latitude pairs: south-west corner first, then north-east corner.
locations = [
    -78.9326449, 38.4150904,  # south-west corner (longitude, latitude)
    -78.8816972, 38.4450731,  # north-east corner (longitude, latitude)
]

The collected tweets either contain 'JMU' <span style="color:red;font-weight:bold"> OR </span> are located in Harrisonburg, VA.

In [None]:
class MyStreamListener(tweepy.StreamListener):
    """Stream listener that stores each received tweet in ``tweet_collection``."""

    def on_status(self, status):
        """Print the tweet's creation date and insert its raw JSON into MongoDB.

        Args:
            status: the tweepy status object delivered by the stream; its
                ``_json`` attribute holds the raw tweet payload.
        """
        tweet = status._json
        pprint(tweet.get('created_at'))  # print the date of the collected tweet
        try:
            tweet_collection.insert_one(tweet)
        except pymongo.errors.DuplicateKeyError:
            # The unique index on "id" rejected a tweet that is already stored.
            pass
        except Exception as exc:
            # Keep the stream alive (best effort), but do not hide the failure.
            print('could not store tweet:', exc)

    def on_error(self, status_code):
        """Disconnect the stream when the API answers with status code 420.

        Args:
            status_code: the HTTP status code reported by the stream.

        Returns:
            ``False`` for status code 420, otherwise ``None``.
        """
        if status_code == 420:
            # returning False in on_error disconnects the stream
            return False
        
# Create the listener and attach it to a stream that reuses the authenticated
# handler of the API object.
myStreamListener = MyStreamListener()
myStream = tweepy.Stream(
    auth=strem_api.auth,
    listener=myStreamListener,
)

# Start receiving tweets that match the keywords or fall inside the bounding box.
# NOTE(review): presumably this call blocks until the stream disconnects or the
# cell is interrupted — confirm for the installed Tweepy version.
myStream.filter(
    track=track,
    locations=locations,
)

## Use the REST API to Collect Tweets

Authorize the REST API 

In [None]:
# OAuth credentials for the REST client; note the argument order used here:
# access token pair first, then the consumer key pair.
rest_auth = twitter.oauth.OAuth(
    OAUTH_TOKEN,
    OATH_TOKEN_SECRET,
    CONSUMER_KEY,
    CONSUMER_SECRET,
)

# REST client used for the search queries below.
rest_api = twitter.Twitter(auth=rest_auth)

Define the query for the REST API

In [None]:
# Search keyword.
q = "jmu"

# Search area as "latitude,longitude,radius".
geocode = "38.4392897,-78.9412224,50mi"

# Number of tweets requested per search call.
count = 10

The collected tweets must contain 'JMU' <span style="color:red;font-weight:bold"> AND </span> be located within 50 miles of the Harrisonburg, VA area.

In [None]:
# Fetch the first page of results for the query.
search_results = rest_api.search.tweets(count=count, q=q, geocode=geocode)
statuses = search_results["statuses"]

# Remember the id of the last tweet on this page; the follow-up cell uses it as
# the starting point for paging back through older tweets. When nothing matched,
# fall back to 0: the paging loop starts with since_id_old = 0 and is therefore
# skipped instead of this cell failing with an IndexError.
since_id_new = statuses[-1]['id'] if statuses else 0

for statuse in statuses:
    try:
        tweet_collection.insert_one(statuse)
    except pymongo.errors.DuplicateKeyError:
        # Already stored: the unique index on "id" rejected the insert.
        continue
    except Exception as exc:
        # Best effort: report the failure and carry on with the next tweet.
        print('could not store tweet:', exc)
        continue
    pprint(statuse.get('created_at'))  # print the date of the collected tweets
        

Continue fetching earlier tweets with the same query.
<p><span style="color:red;font-weight:bold">YOU WILL REACH YOUR RATE LIMIT VERY FAST</span></p>

In [None]:
# Page backwards through older tweets until a page comes back empty or the
# boundary id stops changing.
since_id_old = 0
while since_id_new != since_id_old:
    since_id_old = since_id_new

    # max_id is inclusive, so ask for ids strictly below the oldest tweet already
    # fetched; otherwise every page would start with a tweet we already have.
    search_results = rest_api.search.tweets(
        count=count, q=q, geocode=geocode, max_id=since_id_new - 1
    )
    statuses = search_results["statuses"]

    if not statuses:
        # No older tweets left for this query.
        break

    since_id_new = statuses[-1]['id']

    for statuse in statuses:
        try:
            tweet_collection.insert_one(statuse)
        except pymongo.errors.DuplicateKeyError:
            # Already stored: the unique index on "id" rejected the insert.
            continue
        except Exception as exc:
            # Best effort: report the failure and carry on with the next tweet.
            print('could not store tweet:', exc)
            continue
        pprint(statuse.get('created_at'))  # print the date of the collected tweets

## Print the Collected Tweets

In [None]:
# number of tweets collected
print(
    tweet_collection.estimated_document_count()
)

# number of unique Twitter users: distinct values of the nested "user.id" field
user_cursor = tweet_collection.distinct("user.id")
print(
    len(user_cursor)
)

Print the Tweets 

In [None]:
# Walk over every stored tweet and print the author's name and the tweet text.
tweet_cursor = tweet_collection.find()
for document in tweet_cursor:
    try:
        print ('----')
#         pprint (document) # use pprint to print the entire tweet document

        print ('name:', document["user"]["name"]) # user name
        print ('text:', document["text"])         # tweets
    except Exception:
        # Catch Exception rather than using a bare except, so that interrupting
        # the cell (KeyboardInterrupt) still stops the loop.
        print ("***error in encoding")