# Social Media Analytics -- Twitter Scraping

Get your Twitter API credentials

API: Application Programming Interface

An API is a set of programming instructions and protocols for accessing the data made available by online companies such as Twitter, Facebook, YouTube, Google, and many others (New York Times).
It is often called a software-to-software interface.

# Tweepy

**install the library**

!pip install tweepy

https://anaconda.org/conda-forge/tweepy

In [1]:
import tweepy
import pandas as pd

In [2]:
# Twitter API credentials: replace every placeholder below with your own
# values, and never publish them (remove them before sharing this notebook).
api_key = "Put your api key"        # consumer key issued by Twitter
api_secret = "Put your api secret"  # consumer secret issued by Twitter
access_token = "put your access token"
access_token_secret = "put your access token secret"

# Create the authentication handler from the consumer pair, then attach the
# access token pair to it.
auth = tweepy.OAuthHandler(consumer_key=api_key, consumer_secret=api_secret)
auth.set_access_token(access_token, access_token_secret)

# Client object used by every request below; wait_on_rate_limit=True asks
# Tweepy to wait when Twitter's rate limit is reached.
api = tweepy.API(auth=auth, wait_on_rate_limit=True)

## Example 1: search tweets with specific keywords or usernames

In [3]:
# Search tweets matching the keyword and show the raw objects.
# NOTE: printing whole Status objects is only an in-class illustration;
# please do not include this output in the lab assignment submission.
search_options = dict(
    q="KStateBusiness",
    until='2022-09-27',     # upper date bound; Twitter only serves (a sample of) the last 7 days of data
    result_type='recent',
    include_entities=True,
    tweet_mode='extended',  # without this the text is limited to 140 characters
    lang="en",
)

for status in tweepy.Cursor(api.search_tweets, **search_options).items(5):
    print(type(status), status)

<class 'tweepy.models.Status'> Status(_api=<tweepy.api.API object at 0x000001ED41D108E0>, _json={'created_at': 'Fri Sep 23 20:03:09 +0000 2022', 'id': 1573402612644413497, 'id_str': '1573402612644413497', 'full_text': 'RT @KStateBusiness: Shout out to @kstatecareer for bringing outstanding opportunities to our business students! At the All-University Caree…', 'truncated': False, 'display_text_range': [0, 140], 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'KStateBusiness', 'name': 'K-State Business', 'id': 162129701, 'id_str': '162129701', 'indices': [3, 18]}, {'screen_name': 'kstatecareer', 'name': 'K-State Career Center', 'id': 26574823, 'id_str': '26574823', 'indices': [33, 46]}], 'urls': []}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_r

In [4]:
# List the names of all attributes and methods available on `status`, the loop
# variable of the search cell (it still holds the last tweet that was iterated).
dir(status)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_api',
 '_json',
 'author',
 'contributors',
 'coordinates',
 'created_at',
 'destroy',
 'display_text_range',
 'entities',
 'extended_entities',
 'favorite',
 'favorite_count',
 'favorited',
 'full_text',
 'geo',
 'id',
 'id_str',
 'in_reply_to_screen_name',
 'in_reply_to_status_id',
 'in_reply_to_status_id_str',
 'in_reply_to_user_id',
 'in_reply_to_user_id_str',
 'is_quote_status',
 'lang',
 'metadata',
 'parse',
 'parse_list',
 'place',
 'possibly_sensitive',
 'retweet',
 'retweet_count',
 'retweeted',
 'retweets',
 'source',
 'source_url',
 'truncated',
 'user']

``From the above status json data``:

_json={'created_at': 'Fri Sep 23 20:03:09 +0000 2022', 'id': 1573402612644413497, 'id_str': '1573402612644413497', 'full_text': 'RT @KStateBusiness: Shout out to @kstatecareer for bringing outstanding opportunities to our business students! At the All-University Caree…', 'truncated': False, 'display_text_range': [0, 140], 


In [5]:
# Pull three fields out of each matching tweet and print them on one line.
tweet_search = tweepy.Cursor(
    api.search_tweets,
    q="KStateBusiness",
    until='2022-09-27',     # upper date bound; Twitter only serves (a sample of) the last 7 days of data
    result_type='recent',
    include_entities=True,
    tweet_mode='extended',  # without this the text is limited to 140 characters
    lang="en",
)

for status in tweet_search.items(5):
    # posting time, tweet ID (as a string), and the full tweet text
    post_time, tweet_id, tweet = status.created_at, status.id_str, status.full_text
    print(post_time, tweet_id, tweet)

2022-09-23 20:03:09+00:00 1573402612644413497 RT @KStateBusiness: Shout out to @kstatecareer for bringing outstanding opportunities to our business students! At the All-University Caree…
2022-09-23 17:49:18+00:00 1573368926716796928 RT @KStateBusiness: Shout out to @kstatecareer for bringing outstanding opportunities to our business students! At the All-University Caree…
2022-09-23 17:06:36+00:00 1573358179580350464 Shout out to @kstatecareer for bringing outstanding opportunities to our business students! At the All-University Career Fair held this week, students from all disciplines were given the chance to meet with more than 300 employers! https://t.co/hBGcsHP1pg
2022-09-22 23:05:08+00:00 1573086019897135104 RT @KStateBusiness: Congratulations and welcome to our 2022-23 Menard Family Scholars! 👏 👉 https://t.co/FEsBf3XUyK https://t.co/JABgWAleWI
2022-09-22 15:01:30+00:00 1572964310808203266 Congratulations and welcome to our 2022-23 Menard Family Scholars! 👏 👉 https://t.co/FEsBf3XUy

### Prettify the Twitter data then get more data

In [6]:
import json

In [7]:
# Pretty-print the JSON data carried by each Status object.
# NOTE: printing the full contents is only an in-class illustration;
# please do not include this output in the lab assignment submission.
json_query = {
    "q": "KStateBusiness",
    "until": '2022-09-27',     # upper date bound; Twitter only serves (a sample of) the last 7 days of data
    "result_type": 'recent',
    "include_entities": True,
    "tweet_mode": 'extended',  # without this the text is limited to 140 characters
    "lang": "en",
}

for status in tweepy.Cursor(api.search_tweets, **json_query).items(5):
    data = status._json  # keep only the JSON dictionary of the status
    pretty = json.dumps(data, indent=4)  # 4-space indented JSON text
    print(pretty)

{
    "created_at": "Fri Sep 23 20:03:09 +0000 2022",
    "id": 1573402612644413497,
    "id_str": "1573402612644413497",
    "full_text": "RT @KStateBusiness: Shout out to @kstatecareer for bringing outstanding opportunities to our business students! At the All-University Caree\u2026",
    "truncated": false,
    "display_text_range": [
        0,
        140
    ],
    "entities": {
        "hashtags": [],
        "symbols": [],
        "user_mentions": [
            {
                "screen_name": "KStateBusiness",
                "name": "K-State Business",
                "id": 162129701,
                "id_str": "162129701",
                "indices": [
                    3,
                    18
                ]
            },
            {
                "screen_name": "kstatecareer",
                "name": "K-State Career Center",
                "id": 26574823,
                "id_str": "26574823",
                "indices": [
                    33,
                  

### Store the json data into a list

In [8]:
# Collect the complete JSON dictionary of every matching tweet in tweets_data.
tweets_data = []  # one dictionary per tweet

for status in tweepy.Cursor(api.search_tweets, q="KStateBusiness",
                            until='2022-09-27',  # upper date bound; Twitter only serves (a sample of) the last 7 days of data
                            result_type='recent',
                            include_entities=True,
                            tweet_mode='extended',  # without this the text is limited to 140 characters
                            lang="en").items(5):
    # status._json is already a dictionary, so it is stored as-is: dumping it
    # to a JSON string and parsing it straight back only rebuilds an equal dict.
    tweets_data.append(status._json)

# let's see how many we got
print(len(tweets_data))

5


### Store the list into a pandas dataframe

In [9]:
# Turn the list of tweet dictionaries into a DataFrame, then show it as the
# cell output (the bare name on the last line is what the notebook displays).
alltweets = pd.DataFrame(tweets_data)
alltweets

Unnamed: 0,created_at,id,id_str,full_text,truncated,display_text_range,entities,metadata,source,in_reply_to_status_id,...,contributors,retweeted_status,is_quote_status,retweet_count,favorite_count,favorited,retweeted,lang,extended_entities,possibly_sensitive
0,Fri Sep 23 20:03:09 +0000 2022,1573402612644413497,1573402612644413497,RT @KStateBusiness: Shout out to @kstatecareer...,False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/iphone"" r...",,...,,{'created_at': 'Fri Sep 23 17:06:36 +0000 2022...,False,2,0,False,False,en,,
1,Fri Sep 23 17:49:18 +0000 2022,1573368926716796928,1573368926716796928,RT @KStateBusiness: Shout out to @kstatecareer...,False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/android"" ...",,...,,{'created_at': 'Fri Sep 23 17:06:36 +0000 2022...,False,2,0,False,False,en,,
2,Fri Sep 23 17:06:36 +0000 2022,1573358179580350464,1573358179580350464,Shout out to @kstatecareer for bringing outsta...,False,"[0, 231]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""https://mobile.twitter.com"" rel=""nofo...",,...,,,False,2,6,False,False,en,"{'media': [{'id': 1573358140590112769, 'id_str...",False
3,Thu Sep 22 23:05:08 +0000 2022,1573086019897135104,1573086019897135104,RT @KStateBusiness: Congratulations and welcom...,False,"[0, 138]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/iphone"" r...",,...,,{'created_at': 'Thu Sep 22 15:01:30 +0000 2022...,False,1,0,False,False,en,"{'media': [{'id': 1572964261302870016, 'id_str...",False
4,Thu Sep 22 15:01:30 +0000 2022,1572964310808203266,1572964310808203266,Congratulations and welcome to our 2022-23 Men...,False,"[0, 94]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""https://mobile.twitter.com"" rel=""nofo...",,...,,,False,1,5,False,False,en,"{'media': [{'id': 1572964261302870016, 'id_str...",False


In [26]:
# Raise the per-cell display width to 500 characters so that long tweet
# texts are shown in full instead of being shortened with "...".
pd.set_option("display.max_colwidth", 500)

In [11]:
# Save the tweets DataFrame to allsample1.csv; index=False leaves the row
# index out of the file.
alltweets.to_csv("allsample1.csv", index=False)

### Find more data

In [12]:
# Print profile details of the account behind each matching tweet.
user_search = tweepy.Cursor(
    api.search_tweets,
    q="KStateBusiness",
    until='2022-09-27',     # upper date bound; Twitter only serves (a sample of) the last 7 days of data
    result_type='recent',
    include_entities=True,
    tweet_mode='extended',  # without this the text is limited to 140 characters
    lang="en",
)

for status in user_search.items(5):
    author = status.user  # user object attached to this tweet

    userID, userName, screenName = author.id_str, author.name, author.screen_name
    userLocation, user_descrip, user_url = author.location, author.description, author.url
    followers, friends = author.followers_count, author.friends_count
    user_ctime = author.created_at  # time the user's account was created
    source = status.source          # posting client: iPhone, Android, or Web

    print(userID, userName, screenName, userLocation, user_descrip, user_url,
          followers, friends, user_ctime, source)

26574823 K-State Career Center kstatecareer Manhattan, KS The Career Center facilitates career readiness for K-State students by providing career exploration and employment opportunities. https://t.co/Ij6mpw4mnF 2542 409 2009-03-25 19:51:23+00:00 Twitter for iPhone
303515624 Craig Schmidling ElCraig31 Kansas  None 152 703 2011-05-23 00:50:50+00:00 Twitter for Android
162129701 K-State Business KStateBusiness Manhattan, KS The K-State College of Business Administration offers a dynamic curriculum guided by business executives throughout the country. https://t.co/0FEZ1as4wo 3185 455 2010-07-02 19:57:20+00:00 Twitter Web App
1885174638 Patrick Graham Grahamtcld   None 104 1249 2013-09-20 03:21:24+00:00 Twitter for iPhone
162129701 K-State Business KStateBusiness Manhattan, KS The K-State College of Business Administration offers a dynamic curriculum guided by business executives throughout the country. https://t.co/0FEZ1as4wo 3185 455 2010-07-02 19:57:20+00:00 Twitter Web App


### Put them together and store the data into Pandas dataframe (Full scraper for Twitter search)

In [13]:
# Full search scraper: gather user and tweet fields for every matching tweet
# and load them into a pandas DataFrame.

# Column names, in the same order as the values of each row tuple built below.
headers = ['userName', 'screenName', 'userID', 'userLocation', 'user_descrip', 'user_url', 'followers', 'friends', 'user_ctime',
           'source', 'post_time', 'tweet_id', 'tweet_text']

tweets = []  # one tuple per tweet

scraper_search = tweepy.Cursor(
    api.search_tweets,
    q="KStateBusiness",
    until='2022-09-27',     # upper date bound; Twitter only serves (a sample of) the last 7 days of data
    result_type='recent',
    include_entities=True,
    tweet_mode='extended',  # without this the text is limited to 140 characters
    lang="en",
)

for status in scraper_search.items(5):
    author = status.user  # user object attached to this tweet

    # tweet-level fields: posting time, tweet ID (string), full tweet text
    post_time, tweet_id, tweet = status.created_at, status.id_str, status.full_text

    # user-level fields
    userID, userName, screenName = author.id_str, author.name, author.screen_name
    userLocation, user_descrip, user_url = author.location, author.description, author.url
    followers, friends = author.followers_count, author.friends_count
    user_ctime = author.created_at  # time the user's account was created
    source = status.source          # posting client: iPhone or Android

    row = (userName, screenName, userID, userLocation, user_descrip, user_url,
           followers, friends, user_ctime, source, post_time, tweet_id, tweet)
    tweets.append(row)

tweet_sample = pd.DataFrame(tweets, columns=headers)
tweet_sample.head()

Unnamed: 0,userName,screenName,userID,userLocation,user_descrip,user_url,followers,friends,user_ctime,source,post_time,tweet_id,tweet_text
0,K-State Career Center,kstatecareer,26574823,"Manhattan, KS",The Career Center facilitates career readiness...,https://t.co/Ij6mpw4mnF,2542,409,2009-03-25 19:51:23+00:00,Twitter for iPhone,2022-09-23 20:03:09+00:00,1573402612644413497,RT @KStateBusiness: Shout out to @kstatecareer...
1,Craig Schmidling,ElCraig31,303515624,Kansas,,,152,703,2011-05-23 00:50:50+00:00,Twitter for Android,2022-09-23 17:49:18+00:00,1573368926716796928,RT @KStateBusiness: Shout out to @kstatecareer...
2,K-State Business,KStateBusiness,162129701,"Manhattan, KS",The K-State College of Business Administration...,https://t.co/0FEZ1as4wo,3185,455,2010-07-02 19:57:20+00:00,Twitter Web App,2022-09-23 17:06:36+00:00,1573358179580350464,Shout out to @kstatecareer for bringing outsta...
3,Patrick Graham,Grahamtcld,1885174638,,,,104,1249,2013-09-20 03:21:24+00:00,Twitter for iPhone,2022-09-22 23:05:08+00:00,1573086019897135104,RT @KStateBusiness: Congratulations and welcom...
4,K-State Business,KStateBusiness,162129701,"Manhattan, KS",The K-State College of Business Administration...,https://t.co/0FEZ1as4wo,3185,455,2010-07-02 19:57:20+00:00,Twitter Web App,2022-09-22 15:01:30+00:00,1572964310808203266,Congratulations and welcome to our 2022-23 Men...


In [24]:
# Save the sample to tweetsample.csv. index=False keeps the auto-generated row
# index out of the file, the same way allsample1.csv is written; without it the
# index is saved as an extra unnamed first column.
tweet_sample.to_csv("tweetsample.csv", index=False)

### Action 1: Collect tweets (at least 200) on any topic that interests you, following the example, and store them in a pandas DataFrame.

1. only show the first five rows (using .head())
2. add two new variables beyond those included in the example, such as verification info ('verified') or the number of retweets ('retweet_count')

## Example 2: get tweets from a specific user's page

In [14]:
# Read the first 10 entries of the authenticated account's home timeline.
# Each item (`myself`) is a tweet Status object from that timeline.
# NOTE: printing whole Status objects is only an in-class illustration;
# please do not include this output in the lab assignment submission.
home_cursor = tweepy.Cursor(api.home_timeline)

for myself in home_cursor.items(10):
    print(myself)

Status(_api=<tweepy.api.API object at 0x000001ED41D108E0>, _json={'created_at': 'Tue Sep 27 03:50:07 +0000 2022', 'id': 1574607291348709377, 'id_str': '1574607291348709377', 'text': 'A proposed $120 million Near South Side high school is moving closer to reality, with the Chicago Board of Educatio… https://t.co/V9cLn35Gpy', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/V9cLn35Gpy', 'expanded_url': 'https://twitter.com/i/web/status/1574607291348709377', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [117, 140]}]}, 'source': '<a href="https://help.twitter.com/en/using-twitter/how-to-tweet#source-labels" rel="nofollow">SocialFlow app</a>', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 7313362, 'id_str': '7313362', 'name': 'Chicago Tribune', 'screen_name': 'chicagotribune', 'location'

In [16]:
# Get data from the friends (the users we are following) and load it into a
# pandas DataFrame.
friends = []  # one dictionary per followed account

# api.get_friends: see https://github.com/tweepy/tweepy/releases/tag/v4.0.0
for friend in tweepy.Cursor(api.get_friends).items():
    # friend._json is already a dictionary, so it is stored as-is: dumping it
    # to a JSON string and parsing it straight back only rebuilds an equal dict.
    friends.append(friend._json)

fridf = pd.DataFrame(friends)
fridf.head()

Unnamed: 0,id,id_str,name,screen_name,location,description,url,entities,protected,followers_count,...,default_profile_image,following,live_following,follow_request_sent,notifications,muting,blocking,blocked_by,translator_type,withheld_in_countries
0,1493613046790787072,1493613046790787072,Onitama,OnitamaNFT,ONITAMA Land,Discord: https://t.co/83ysYq2qam \nArtist: @re...,https://t.co/MnjzbVtZNc,{'url': {'urls': [{'url': 'https://t.co/MnjzbV...,False,11522,...,False,True,False,False,False,False,False,False,none,[]
1,44196397,44196397,Elon Musk,elonmusk,,,,{'description': {'urls': []}},False,107153080,...,False,True,False,False,False,False,False,False,none,[]
2,107145333,107145333,University of Houston,UHouston,"Houston, Texas, USA","Founded in 1927, the University of Houston is ...",https://t.co/9Oaf3hKrNG,{'url': {'urls': [{'url': 'https://t.co/9Oaf3h...,False,226313,...,False,True,False,False,False,False,False,False,none,[]
3,1282418324228337665,1282418324228337665,WSB Mod,wsbmod,wsbmod.eth,💎🤲💎🤲 💎🤲 💎🤲 💎🤲 💎🤲 💎🤲,https://t.co/hAW7y0Ugmw,{'url': {'urls': [{'url': 'https://t.co/hAW7y0...,False,794926,...,False,True,False,False,False,False,False,False,none,[]
4,30354991,30354991,Kamala Harris,KamalaHarris,"Washington, DC","Fighting for the people. Wife, Momala, Auntie....",https://t.co/uBcfgtTRsi,{'url': {'urls': [{'url': 'https://t.co/uBcfgt...,False,20131214,...,False,True,False,False,False,False,False,False,none,[]


## Action 2: Get data from your friends into a pandas DataFrame (at least 20 tweets)

1. Get at least 20 variables (columns), such as the tweet text ('text'), url, and followers_count.

In [14]:
# Get data from another user's page and load the 5 latest tweets into a
# pandas DataFrame.
KSBS = []  # one dictionary per tweet

Kstate_B = api.user_timeline(screen_name='KStateBusiness',  # https://twitter.com/KStateBusiness
                             count=100,  # 200 is the maximum allowed count
                             tweet_mode='extended',  # needed to keep full_text; otherwise only the first 140 characters are returned
                             )

for KSBS_tweet in Kstate_B[:5]:  # extracting the 5 latest tweets
    # KSBS_tweet._json is already a dictionary, so it is stored as-is: dumping
    # it to a JSON string and parsing it straight back only rebuilds an equal dict.
    KSBS.append(KSBS_tweet._json)

KSBSdf = pd.DataFrame(KSBS)
KSBSdf.head()

Unnamed: 0,created_at,id,id_str,full_text,truncated,display_text_range,entities,source,in_reply_to_status_id,in_reply_to_status_id_str,...,quoted_status_permalink,quoted_status,retweet_count,favorite_count,favorited,retweeted,possibly_sensitive,lang,extended_entities,retweeted_status
0,Sun Sep 25 04:19:50 +0000 2022,1573889994099499009,1573889994099499009,#catsmeanbusiness @MartinezTheQB https://t.co/...,False,"[0, 32]","{'hashtags': [{'text': 'catsmeanbusiness', 'in...","<a href=""http://twitter.com/download/iphone"" r...",,,...,"{'url': 'https://t.co/PKlrVZvQuK', 'expanded':...",{'created_at': 'Sun Sep 25 03:20:07 +0000 2022...,0,4,False,False,False,qme,,
1,Fri Sep 23 17:06:36 +0000 2022,1573358179580350464,1573358179580350464,Shout out to @kstatecareer for bringing outsta...,False,"[0, 231]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""https://mobile.twitter.com"" rel=""nofo...",,,...,,,2,6,False,False,False,en,"{'media': [{'id': 1573358140590112769, 'id_str...",
2,Thu Sep 22 15:01:30 +0000 2022,1572964310808203266,1572964310808203266,Congratulations and welcome to our 2022-23 Men...,False,"[0, 94]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""https://mobile.twitter.com"" rel=""nofo...",,,...,,,1,5,False,False,False,en,"{'media': [{'id': 1572964261302870016, 'id_str...",
3,Wed Sep 21 12:16:23 +0000 2022,1572560371541766152,1572560371541766152,RT @KSUBizDean: Celebrating 53 College of Busi...,False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...",,,...,,,1,0,False,False,,en,,{'created_at': 'Wed Sep 21 00:17:41 +0000 2022...
4,Tue Sep 20 21:25:09 +0000 2022,1572336083928752134,1572336083928752134,"September is a busy month of networking, and m...",False,"[0, 284]","{'hashtags': [{'text': 'catsmeanbusiness', 'in...","<a href=""https://sproutsocial.com"" rel=""nofoll...",,,...,,,0,1,False,False,False,en,"{'media': [{'id': 1572336048432189440, 'id_str...",


## Action 3: Get data from a celebrity's Twitter page into a pandas DataFrame (at least 20 tweets)

1. get at least 20 variables (columns).