# Collect Tweets into MongoDB

## Install Python libraries

You may need to restart your Jupyter Notebook instance after installed those libraries.

In [1]:
!pip install pymongo



In [2]:
!pip install pymongo[srv]



In [3]:
!pip install dnspython



In [4]:
!pip install tweepy



In [5]:
!pip install twitter



In [6]:
pip install certifi

Note: you may need to restart the kernel to use updated packages.


## Import Python libraries

In [1]:
import pymongo
from pymongo import MongoClient
import json
import tweepy
import twitter
from pprint import pprint
import configparser
import pandas as pd
import certifi

##  Load the Authorization Info

Save database connection info and API Keys in a config.ini file and use the configparse to load the authorization info. 

In [2]:
config = configparser.ConfigParser()
config.read('config.ini')

CONSUMER_KEY      = config['mytwitter']['api_key']
CONSUMER_SECRET   = config['mytwitter']['api_secrete']
OAUTH_TOKEN       = config['mytwitter']['access_token']
OATH_TOKEN_SECRET = config['mytwitter']['access_secrete']

mongod_connect = config['mymongo']['connection']

## Connect to the MongoDB Cluster

In [3]:
ca = certifi.where()
client = MongoClient(mongod_connect , tlsCAFile=ca) # connect to mongo and fix SSL problem with tlsCAFile=ca
db = client.Twitter # use or create a database named Twitter
tweet_collection = db.tweet_collection #use or create a collection named tweet_collection
tweet_collection.create_index([("id", pymongo.ASCENDING)],unique = True) # make sure the collected tweets are unique

'id_1'

## Use the Streaming API to Collect Tweets

Authorize the Stream API 

In [20]:
stream_auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
stream_auth.set_access_token(OAUTH_TOKEN, OATH_TOKEN_SECRET)

strem_api = tweepy.API(stream_auth)

Define the query for the Stream API

In [14]:
track = ['#ม็อบ12ธันวา64'] # define the keywords, tweets contain election

# locations = [-78.9326449,38.4150904,-78.8816972,38.4450731] #defin the location, in Harrisonburg, VA

The collected tweets will contain 'election' <span style="color:red;font-weight:bold"> OR </span> are located in Harrisonburg, VA

In [15]:
class MyStreamListener(tweepy.StreamListener):
    def on_status(self, status):
        print (status.id_str)
        try:
            tweet_collection.insert_one(status._json)
        except:
            pass
  
    def on_error(self, status_code):
        if status_code == 420:
            #returning False in on_data disconnects the stream
            return False
        
myStreamListener = MyStreamListener()
myStream = tweepy.Stream(auth = strem_api.auth, listener=myStreamListener)
myStream.filter(track=track)#  (locations = locations)   #Use either track or locations

1470572249409220608
1470572256887640064
1470572277062201348
1470572279910191108
1470572300172890114
1470572302806884353
1470572307902984194
1470572310734114818
1470572315301662720
1470572320108720131
1470572321249558529
1470572344355672069
1470572345290989571
1470572347266453506
1470572347727880193
1470572349103611905
1470572356640796674
1470572394834055180
1470572394829860866
1470572411468726277
1470572428237500418
1470572448592531461
1470572449687162887
1470572468775522304
1470572471653134343
1470572525965168640
1470572531190943751
1470572539239829507
1470572553580150784
1470572571871514633
1470572575851876356
1470572581992349698
1470572584815058946
1470572588124364801
1470572616008171520
1470572634316308480


KeyboardInterrupt: 

## Use the REST API to Collect Tweets

Authorize the REST API 

In [4]:
rest_auth = twitter.oauth.OAuth(OAUTH_TOKEN,OATH_TOKEN_SECRET,CONSUMER_KEY,CONSUMER_SECRET)
rest_api = twitter.Twitter(auth=rest_auth)

Define the query for the REST API

In [5]:
count = 10 #number of returned tweets, default and max is 100
geocode = "38.4392897,-78.9412224,50mi"  # defin the location, in Harrisonburg, VA
q = "#ผู้ว่ากทม"                            #define the keywords, tweets contain election

The collected tweets will contain '#ผู้ว่ากทม' <span style="color:red;font-weight:bold"> AND </span> are located in Harrisonburg, VA

In [27]:
#search_results = rest_api.search.tweets( count=count,q=q, geocode=geocode)
search_results = rest_api.search.tweets( count=count,q=q) #you can use both q and geocode
statuses = search_results["statuses"]
since_id_new = statuses[-1]['id']
i=1
for statuse in statuses:
    try:
#         tweet_collection.insert_one(statuse)
        print(statuse['created_at'])# print the date of the collected tweets
        print(statuse['id_str'])
        i=i+1
    except:
        pass
print(i, "documents insert")

Tue Dec 14 15:12:36 +0000 2021
1470773727017291786
Tue Dec 14 15:12:36 +0000 2021
1470773724920037376
Tue Dec 14 15:12:35 +0000 2021
1470773721623412739
Tue Dec 14 15:12:34 +0000 2021
1470773718557532160
Tue Dec 14 15:12:32 +0000 2021
1470773710659493897
Tue Dec 14 15:12:32 +0000 2021
1470773708272906247
Tue Dec 14 15:12:31 +0000 2021
1470773705328541713
Tue Dec 14 15:12:31 +0000 2021
1470773704468619264
Tue Dec 14 15:12:30 +0000 2021
1470773701918482446
Tue Dec 14 15:12:30 +0000 2021
1470773699531915264
Tue Dec 14 15:12:28 +0000 2021
1470773693311766543
Tue Dec 14 15:12:28 +0000 2021
1470773693299240966
Tue Dec 14 15:12:28 +0000 2021
1470773693236359170
Tue Dec 14 15:12:25 +0000 2021
1470773677797036037
Tue Dec 14 15:12:24 +0000 2021
1470773676698226696
Tue Dec 14 15:12:24 +0000 2021
1470773676022902789
Tue Dec 14 15:12:22 +0000 2021
1470773668737388551
Tue Dec 14 15:12:21 +0000 2021
1470773663192567809
Tue Dec 14 15:12:20 +0000 2021
1470773660147462145
Tue Dec 14 15:12:20 +0000 2021


In [75]:
print("you have ",tweet_collection.count_documents({}),"document in collection")

you have  214 document in collection


In [53]:
print(since_id_new)

1470578305942179843


## View the Collected Tweets

Print the number of tweets and unique twitter users

In [56]:
print(tweet_collection.estimated_document_count())# number of tweets collected

user_cursor = tweet_collection.distinct("user.id")
print (len(user_cursor)) # number of unique Twitter users 

214
163


## ลองสร้าง loop เอง

Continue fetching early tweets with the same query. 
<p><span style="color:red;font-weight:bold">YOU WILL REACH YOUR RATE LIMIT VERY FAST</span></p>

In [81]:
start = tweet_collection.count_documents({}) # จำนวนคอลเลกชั่นที่มีในปัจจุบัน
count = 1000 #number of returned tweets, default and max is 1000
q = "#ผู้ว่ากทม"                            #define the keywords, tweets contain election
print("start = ",start)

start =  314


In [28]:
since_id_old = 0
since_id_new = start+count
i=1

while(since_id_new != since_id_old):
    search_results = rest_api.search.tweets( count=10,q=q) #you can use both q and geocode
    statuses = search_results["statuses"]
    since_id_new = statuses[-1]['id']
    for statuse in statuses:
        try:
            tweet_collection.insert_one(statuse)
            print(i, "documents insert")
            print(since_id_new)
            i=i+1
        except:
            pass

1 documents insert
1470775258483466245
2 documents insert
1470775258483466245
3 documents insert
1470775258483466245
4 documents insert
1470775258483466245
5 documents insert
1470775258483466245
6 documents insert
1470775258483466245
7 documents insert
1470775258483466245
8 documents insert
1470775258483466245
9 documents insert
1470775258483466245
10 documents insert
1470775258483466245
11 documents insert
1470775387378622472
12 documents insert
1470775387378622472
13 documents insert
1470775387378622472
14 documents insert
1470775387378622472
15 documents insert
1470775387378622472
16 documents insert
1470775387378622472
17 documents insert
1470775387378622472
18 documents insert
1470775387378622472
19 documents insert
1470775387378622472
20 documents insert
1470775387378622472
21 documents insert
1470775471763857410
22 documents insert
1470775471763857410
23 documents insert
1470775471763857410
24 documents insert
1470775471763857410
25 documents insert
1470775471763857410
26 docume

204 documents insert
1470777067696492563
205 documents insert
1470777067696492563
206 documents insert
1470777067696492563
207 documents insert
1470777067696492563
208 documents insert
1470777067696492563
209 documents insert
1470777067696492563
210 documents insert
1470777067696492563
211 documents insert
1470777126408400910
212 documents insert
1470777126408400910
213 documents insert
1470777126408400910
214 documents insert
1470777126408400910
215 documents insert
1470777126408400910
216 documents insert
1470777126408400910
217 documents insert
1470777126408400910
218 documents insert
1470777126408400910
219 documents insert
1470777126408400910
220 documents insert
1470777126408400910
221 documents insert
1470777242553044995
222 documents insert
1470777242553044995
223 documents insert
1470777242553044995
224 documents insert
1470777242553044995
225 documents insert
1470777242553044995
226 documents insert
1470777242553044995
227 documents insert
1470777242553044995
228 documents in

TwitterHTTPError: Twitter sent status 429 for URL: 1.1/search/tweets.json using parameters: (count=10&oauth_consumer_key=OxrKX0tyCgKnJUVa4f31j2GQI&oauth_nonce=1726837508055940163&oauth_signature_method=HMAC-SHA1&oauth_timestamp=1639495620&oauth_token=3313439336-HruZfbLk7sg1JBXfrQpHsLvulJkrTI7b1qX8J34&oauth_version=1.0&q=%23%E0%B8%9C%E0%B8%B9%E0%B9%89%E0%B8%A7%E0%B9%88%E0%B8%B2%E0%B8%81%E0%B8%97%E0%B8%A1&oauth_signature=W0B94ys9oz9g4RgIEhY1yKPF%2BHE%3D)
details: {'errors': [{'message': 'Rate limit exceeded', 'code': 88}]}

In [85]:
start = tweet_collection.count_documents({}) # จำนวนคอลเลกชั่นที่มีในปัจจุบัน
count = 1000 #number of returned tweets, default and max is 1000
q = "#ผู้ว่ากทม"                            #define the keywords, tweets contain election
print("now",start)

now 1260


In [23]:
id = 1470350066682515459
  
# fetching the status
status = strem_api.get_status(id, tweet_mode = "extended")
  
# fetching the text attribute
full_text  = status.full_text 
  
print("The text of the status is : \n\n" + full_text )

The text of the status is : 

ตั้งแต่ปี 47 จนถึงปัจจุบัน ระยะเวลากว่า 17 ปี ผู้ว่าล้วนมาจากพรรคประชาธิปัตย์ อัศวิน ก็อดีตคนของประชาธิปัตย์ ที่ถูกแต่งตั้งจากเผด็จการ 
ปัญหาต่างๆที่ #เอ้สุชัชวีร์ พูดถึง คือ ตอกย้ำความล้มเหลวของพรรคนี้อย่างชัดเจน คือ จะอาสามาแก้ปัญหาที่พรรคนี้ทำไว้ว่างั้น 
#ผู้ว่ากทม  #เราทำได้ https://t.co/04sApTF334


## อ๊ากกกกกกก

In [24]:
start = tweet_collection.count_documents({}) # จำนวนคอลเลกชั่นที่มีในปัจจุบัน
count = 10                                 # number of returned tweets
q = "#ผู้ว่ากทม"                                # define the keywords, tweets contain election
print("start = ",start)

start =  1260


In [25]:
since_id_old = 0
since_id_new = start+count
i=1

while(since_id_new != since_id_old):
    search_results = rest_api.search.tweets( count=count,q=q) #you can use both q and geocode
    statuses = search_results["statuses"]
    since_id_new = statuses[-1]['id']
    for statuse in statuses:
        try:
            #tweet_collection.insert_one(statuse)
            print(statuse.id_str)
            print(i, "documents insert")
            i=i+1
        except:
            pass

TwitterError: Incomplete JSON data collected for 1.1/search/tweets (count=1000&oauth_consumer_key=OxrKX0tyCgKnJUVa4f31j2GQI&oauth_nonce=13682801957679807979&oauth_signature_method=HMAC-SHA1&oauth_timestamp=1639492976&oauth_token=3313439336-HruZfbLk7sg1JBXfrQpHsLvulJkrTI7b1qX8J34&oauth_version=1.0&q=%23%E0%B8%9C%E0%B8%B9%E0%B9%89%E0%B8%A7%E0%B9%88%E0%B8%B2%E0%B8%81%E0%B8%97%E0%B8%A1&oauth_signature=rARDnRNQKCZGGeJoqyZOk0S70Gk%3D): Invalid \uXXXX escape: line 1 column 619385 (char 619384))