# Collect Tweets into MongoDB

## Install Python libraries

You may need to restart your Jupyter Notebook instance after installed those libraries.

In [1]:
!pip install pymongo



In [2]:
!pip install pymongo[srv]



In [3]:
!pip install dnspython



In [4]:
!pip install tweepy



In [5]:
!pip install twitter



In [6]:
pip install certifi

Note: you may need to restart the kernel to use updated packages.


## Import Python libraries

In [1]:
import pymongo
from pymongo import MongoClient
import json
import tweepy
import twitter
from pprint import pprint
import configparser
import pandas as pd
import certifi

##  Load the Authorization Info

Save database connection info and API Keys in a config.ini file and use the configparse to load the authorization info. 

In [2]:
config = configparser.ConfigParser()
config.read('config.ini')

CONSUMER_KEY      = config['mytwitter']['api_key']
CONSUMER_SECRET   = config['mytwitter']['api_secrete']
OAUTH_TOKEN       = config['mytwitter']['access_token']
OATH_TOKEN_SECRET = config['mytwitter']['access_secrete']

mongod_connect = config['mymongo']['connection']

## Connect to the MongoDB Cluster

In [3]:
ca = certifi.where()
client = MongoClient(mongod_connect , tlsCAFile=ca) # connect to mongo and fix SSL problem with tlsCAFile=ca
db = client.Twitter # use or create a database named Twitter
tweet_collection = db.tweet_collection #use or create a collection named tweet_collection
tweet_collection.create_index([("id", pymongo.ASCENDING)],unique = True) # make sure the collected tweets are unique

'id_1'

## Use the Streaming API to Collect Tweets

Authorize the Stream API 

In [10]:
stream_auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
stream_auth.set_access_token(OAUTH_TOKEN, OATH_TOKEN_SECRET)

strem_api = tweepy.API(stream_auth)

Define the query for the Stream API

In [14]:
track = ['#ม็อบ12ธันวา64'] # define the keywords, tweets contain election

# locations = [-78.9326449,38.4150904,-78.8816972,38.4450731] #defin the location, in Harrisonburg, VA

The collected tweets will contain 'election' <span style="color:red;font-weight:bold"> OR </span> are located in Harrisonburg, VA

In [15]:
class MyStreamListener(tweepy.StreamListener):
    def on_status(self, status):
        print (status.id_str)
        try:
            tweet_collection.insert_one(status._json)
        except:
            pass
  
    def on_error(self, status_code):
        if status_code == 420:
            #returning False in on_data disconnects the stream
            return False
myStreamListener = MyStreamListener()
myStream = tweepy.Stream(auth = strem_api.auth, listener=myStreamListener)
myStream.filter(track=track)#  (locations = locations)   #Use either track or locations

1470572249409220608
1470572256887640064
1470572277062201348
1470572279910191108
1470572300172890114
1470572302806884353
1470572307902984194
1470572310734114818
1470572315301662720
1470572320108720131
1470572321249558529
1470572344355672069
1470572345290989571
1470572347266453506
1470572347727880193
1470572349103611905
1470572356640796674
1470572394834055180
1470572394829860866
1470572411468726277
1470572428237500418
1470572448592531461
1470572449687162887
1470572468775522304
1470572471653134343
1470572525965168640
1470572531190943751
1470572539239829507
1470572553580150784
1470572571871514633
1470572575851876356
1470572581992349698
1470572584815058946
1470572588124364801
1470572616008171520
1470572634316308480


KeyboardInterrupt: 

## Use the REST API to Collect Tweets

Authorize the REST API 

In [4]:
rest_auth = twitter.oauth.OAuth(OAUTH_TOKEN,OATH_TOKEN_SECRET,CONSUMER_KEY,CONSUMER_SECRET)
rest_api = twitter.Twitter(auth=rest_auth)

Define the query for the REST API

In [5]:
count = 10 #number of returned tweets, default and max is 100
geocode = "38.4392897,-78.9412224,50mi"  # defin the location, in Harrisonburg, VA
q = "#ผู้ว่ากทม"                            #define the keywords, tweets contain election

The collected tweets will contain '#ผู้ว่ากทม' <span style="color:red;font-weight:bold"> AND </span> are located in Harrisonburg, VA

In [78]:
#search_results = rest_api.search.tweets( count=count,q=q, geocode=geocode)
search_results = rest_api.search.tweets( count=count,q=q) #you can use both q and geocode
statuses = search_results["statuses"]
since_id_new = statuses[-1]['id']
i=1
for statuse in statuses:
    try:
        tweet_collection.insert_one(statuse)
        pprint(statuse['created_at'])# print the date of the collected tweets
        i=i+1
    except:
        pass
print(i, "documents insert")

'Tue Dec 14 02:30:37 +0000 2021'
'Tue Dec 14 02:30:36 +0000 2021'
'Tue Dec 14 02:30:36 +0000 2021'
'Tue Dec 14 02:30:32 +0000 2021'
'Tue Dec 14 02:30:26 +0000 2021'
'Tue Dec 14 02:30:25 +0000 2021'
'Tue Dec 14 02:30:24 +0000 2021'
'Tue Dec 14 02:30:21 +0000 2021'
'Tue Dec 14 02:30:21 +0000 2021'
'Tue Dec 14 02:30:21 +0000 2021'
'Tue Dec 14 02:30:19 +0000 2021'
'Tue Dec 14 02:30:19 +0000 2021'
'Tue Dec 14 02:30:12 +0000 2021'
'Tue Dec 14 02:30:08 +0000 2021'
'Tue Dec 14 02:30:06 +0000 2021'
'Tue Dec 14 02:30:05 +0000 2021'
'Tue Dec 14 02:30:02 +0000 2021'
'Tue Dec 14 02:29:49 +0000 2021'
'Tue Dec 14 02:29:48 +0000 2021'
'Tue Dec 14 02:29:48 +0000 2021'
'Tue Dec 14 02:29:45 +0000 2021'
'Tue Dec 14 02:29:44 +0000 2021'
'Tue Dec 14 02:29:43 +0000 2021'
'Tue Dec 14 02:29:42 +0000 2021'
'Tue Dec 14 02:29:42 +0000 2021'
'Tue Dec 14 02:29:41 +0000 2021'
'Tue Dec 14 02:29:36 +0000 2021'
'Tue Dec 14 02:29:35 +0000 2021'
'Tue Dec 14 02:29:33 +0000 2021'
'Tue Dec 14 02:29:31 +0000 2021'
'Tue Dec 1

In [75]:
print("you have ",tweet_collection.count_documents({}),"document in collection")

you have  214 document in collection


In [53]:
print(since_id_new)

1470578305942179843


## View the Collected Tweets

Print the number of tweets and unique twitter users

In [56]:
print(tweet_collection.estimated_document_count())# number of tweets collected

user_cursor = tweet_collection.distinct("user.id")
print (len(user_cursor)) # number of unique Twitter users 

214
163


## ลองสร้าง loop เอง

Continue fetching early tweets with the same query. 
<p><span style="color:red;font-weight:bold">YOU WILL REACH YOUR RATE LIMIT VERY FAST</span></p>

In [81]:
start = tweet_collection.count_documents({}) # จำนวนคอลเลกชั่นที่มีในปัจจุบัน
count = 1000 #number of returned tweets, default and max is 1000
q = "#ผู้ว่ากทม"                            #define the keywords, tweets contain election
print("start = ",start)

start =  314


In [82]:
since_id_old = 0
since_id_new = start+count
i=1

while(since_id_new != since_id_old):
    search_results = rest_api.search.tweets( count=count,q=q) #you can use both q and geocode
    statuses = search_results["statuses"]
    since_id_new = statuses[-1]['id']
    for statuse in statuses:
        try:
            tweet_collection.insert_one(statuse)
            print(i, "documents insert")
            i=i+1
        except:
            pass

1 documents insert
2 documents insert
3 documents insert
4 documents insert
5 documents insert
6 documents insert
7 documents insert
8 documents insert
9 documents insert
10 documents insert
11 documents insert
12 documents insert
13 documents insert
14 documents insert
15 documents insert
16 documents insert
17 documents insert
18 documents insert
19 documents insert
20 documents insert
21 documents insert
22 documents insert
23 documents insert
24 documents insert
25 documents insert
26 documents insert
27 documents insert
28 documents insert
29 documents insert
30 documents insert
31 documents insert
32 documents insert
33 documents insert
34 documents insert
35 documents insert
36 documents insert
37 documents insert
38 documents insert
39 documents insert
40 documents insert
41 documents insert
42 documents insert
43 documents insert
44 documents insert
45 documents insert
46 documents insert
47 documents insert
48 documents insert
49 documents insert
50 documents insert
51 docume

398 documents insert
399 documents insert
400 documents insert
401 documents insert
402 documents insert
403 documents insert
404 documents insert
405 documents insert
406 documents insert
407 documents insert
408 documents insert
409 documents insert
410 documents insert
411 documents insert
412 documents insert
413 documents insert
414 documents insert
415 documents insert
416 documents insert
417 documents insert
418 documents insert
419 documents insert
420 documents insert
421 documents insert
422 documents insert
423 documents insert
424 documents insert
425 documents insert
426 documents insert
427 documents insert
428 documents insert
429 documents insert
430 documents insert
431 documents insert
432 documents insert
433 documents insert
434 documents insert
435 documents insert
436 documents insert
437 documents insert
438 documents insert
439 documents insert
440 documents insert
441 documents insert
442 documents insert
443 documents insert
444 documents insert
445 documents

791 documents insert
792 documents insert
793 documents insert
794 documents insert
795 documents insert
796 documents insert
797 documents insert
798 documents insert
799 documents insert
800 documents insert
801 documents insert
802 documents insert
803 documents insert
804 documents insert
805 documents insert
806 documents insert
807 documents insert
808 documents insert
809 documents insert
810 documents insert
811 documents insert
812 documents insert
813 documents insert
814 documents insert
815 documents insert
816 documents insert
817 documents insert
818 documents insert
819 documents insert
820 documents insert
821 documents insert
822 documents insert
823 documents insert
824 documents insert
825 documents insert
826 documents insert
827 documents insert
828 documents insert
829 documents insert
830 documents insert
831 documents insert
832 documents insert
833 documents insert
834 documents insert
835 documents insert
836 documents insert
837 documents insert
838 documents

TwitterError: Incomplete JSON data collected for 1.1/search/tweets (count=1000&oauth_consumer_key=OxrKX0tyCgKnJUVa4f31j2GQI&oauth_nonce=3825267853466788441&oauth_signature_method=HMAC-SHA1&oauth_timestamp=1639451773&oauth_token=3313439336-HruZfbLk7sg1JBXfrQpHsLvulJkrTI7b1qX8J34&oauth_version=1.0&q=%23%E0%B8%9C%E0%B8%B9%E0%B9%89%E0%B8%A7%E0%B9%88%E0%B8%B2%E0%B8%81%E0%B8%97%E0%B8%A1&oauth_signature=EpjYtFIOISuW56QnLxv4gCUTmv4%3D): Unterminated string starting at: line 1 column 701020 (char 701019))

In [85]:
start = tweet_collection.count_documents({}) # จำนวนคอลเลกชั่นที่มีในปัจจุบัน
count = 1000 #number of returned tweets, default and max is 1000
q = "#ผู้ว่ากทม"                            #define the keywords, tweets contain election
print("now",start)

now 1260


In [12]:
id = 1470577854387666945
  
# fetching the status
status = strem_api.get_status(id)
  
# fetching the text attribute
text = status.text 
  
print("The text of the status is : \n\n" + text)

The text of the status is : 

RT @RadioKaKaKa: และก็ ไม่มีที่ไหนในโลกที่บอกว่า
ปฎิรูป=ล้มล้าง
#saveจะนะ #ผู้ว่ากทม #แอนชิลี https://t.co/DHx3o7rhxi
