In [None]:
"""
Title: Twitter Data Mining and Sentinent Analysis in Python
Author: Thomas J. Weinandy
Date: 11 November 2019
Version: 1.0
Availability: https://github.com/tomweinandy/twitter_data_mining

Table of Contents:
 1) Import essential functions, credentials for accessing Twitter API
 2) Define auxiliaray functions
 3) Define process_tweet()
 4) Create csv file
 5) State search criteria
 6) Run search and save results to csv
"""

In [1]:
""" 1) Import essential functions, credentials for accessing Twitter API """

from twython import Twython  
import pandas as pd
import numpy as np
import twitter_credentials

api = Twython(twitter_credentials.CONSUMER_KEY, twitter_credentials.CONSUMER_SECRET)

In [2]:
""" 2) Define auxiliary functions """

def get_text_search(tweet):       
    # The text of tweets >140 characters is truncated, so this retreives the full text
    # Try for extended text of an original tweet, if RT'd (REST API)
    try: text = tweet['retweeted_status']['full_text']
    except:
        # Try for extended text of an original tweet (REST API)
        try: text = tweet['full_text']
        except:
            # Try for basic text of original tweet if RT'd 
            try: text = tweet['retweeted_status']['text']
            except:
                # Try for basic text of an original tweet
                try: text = tweet['text']
                except: 
                    # Nothing left to check for
                    text = ''
    return text

def get_reply_id(tweet):
    try: return tweet['in_reply_to_status_id']
    except: return ''
    
def get_rt_id(tweet):
    try: return tweet['retweeted_status']['id']
    except: return ''

sources = ['Android','iPhone','iPad','Web App','Facebook','TweetDeck','Hootsuite','Web Client','Media Studio','Ads Composer','SocialFlow','Sprout Social','Sprinklr']           
def sourcery(tweet):
    for source in sources:
        if source in tweet:
            return source
    else:
        return 'Other'

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob 

def sentiment1(tweet_text):
    return TextBlob(tweet_text).sentiment.polarity

def subjectivity1(tweet_text):
    return TextBlob(tweet_text).sentiment.subjectivity

def sentiment2(tweet_text):
    return SentimentIntensityAnalyzer().polarity_scores(tweet_text)['compound']

In [3]:
""" 3) Define process_tweet() """

def process_tweet(tweet):  
    d = {}
    text = get_text_search(tweet)
    d['id'] = tweet['id']
    d['created_at'] = tweet['created_at']
    d['screen_name'] = tweet['user']['screen_name']
    d['followers_count'] = tweet['user']['followers_count']
    d['friends_count'] = tweet['user']['friends_count']
    d['statuses_count'] = tweet['user']['statuses_count']
    d['user_created_at'] = tweet['user']['created_at']
    d['location'] = tweet['user']['location']
    d['lang'] = tweet['lang']
    d['source'] = sourcery(tweet['source'])
    d['in_reply_to_status_id'] = get_reply_id(tweet)
    d['retweeted_status'] = get_rt_id(tweet) 
    d['favorite_count'] = tweet['favorite_count']
    d['retweet_count'] = tweet['retweet_count']
    d['sentiment1'] = sentiment1(text)
    d['sentiment2'] = sentiment2(text)
    d['subjectivity1'] = subjectivity1(text)
    d['hashtags'] = [hashtag['text'] for hashtag in tweet['entities']['hashtags']]
    d['bio'] = tweet['user']['description']
    d['text'] = text
    return d

In [9]:
""" 4) Create csv file """
# Create headers (only occurs if file exists)
import os.path
import csv

my_file = 'tweet_data.csv'

# os.remove(my_file)      ### remove comment to delete old file and create a new one (then comment out to prevent accidential deletion) ###

headers = ['id','created_at','screen_name','followers_count','friends_count','statuses_count','user_created_at','location','lang','source','in_reply_to_status_id','retweeted_status','favorite_count','retweet_count','sentiment1','sentiment2','subjectivity1','hashtags','bio','text'] #,'hashtags'
if os.path.exists(my_file)==False:
    with open(my_file, 'x') as outcsv:
        writer = csv.writer(outcsv)
        writer.writerow(headers)

In [10]:
""" 5) State search criteria """

QUERY = 'Big Data OR #bigdata'
SAVE_RATE = 1
MAX_ATTEMPTS = 1000
COUNT_OF_TWEETS_TO_BE_FETCHED = 1000

In [11]:
""" 6) Run search and save results to csv """
import time
from datetime import datetime

print('Start searching according to query:',QUERY)
j = -1

for i in range(0,MAX_ATTEMPTS):
    tweets_len = len(pd.read_csv(my_file))

    if(COUNT_OF_TWEETS_TO_BE_FETCHED < tweets_len):    # check if this should be <=
        break 

    # STEP 1: Query Twitter
    if(0 == i):
        # Query twitter for data. 
        results = api.search(q=QUERY,count='100',lang='en',tweet_mode='extended')
    else:
        # After the first call we should have max_id from result of previous call. Pass it in query.
        results = api.search(q=QUERY,count='100',lang='en',tweet_mode='extended',max_id=next_max_id)

    # STEP 2: Save the returned tweets according to the SAVE_RATE
    j += 1
    if float(j*SAVE_RATE).is_integer()==True:
        for result in results['statuses']:
            tw = process_tweet(result)
            twist=[]

            for key in list(tw):
                twist.append(tw[key])

            with open(my_file, 'a') as file:
                writer = csv.writer(file)
                writer.writerow(twist)

    # STEP 3: Get the next max_id
    try:        
        # Parse the data returned to get max_id to be passed in consequent call.
        next_results_url_params = results['search_metadata']['next_results']
        next_max_id = next_results_url_params.split('max_id=')[1].split('&')[0]

        # prints status and adds time according to Twitter rate limits https://developer.twitter.com/en/docs/basics/rate-limits  
        if float(j*SAVE_RATE/10).is_integer()==True:
            print('Tweets:',tweets_len,'At:',datetime.now(),'Search id:',next_max_id,'From:',tw['created_at'],'')
        time.sleep(5)

    except:
        # No more next pages.
        break

print('Stopped at',datetime.now(),'on next_max_id',next_max_id,'with tweet',tw['id'],'from',tw['created_at'])
beep = lambda x: os.system("echo -n '\a';sleep 0.2;" * x)
beep(3)   # Play alert upon completion

Start searching according to query: Big Data OR #bigdata
Tweets: 0 At: 2019-11-11 07:52:50.639640 Search id: 1193912806468411393 From: Mon Nov 11 15:26:09 +0000 2019 
Tweets: 998 At: 2019-11-11 07:54:02.292344 Search id: 1193848643805073407 From: Mon Nov 11 11:11:11 +0000 2019 
Stopped at 2019-11-11 07:54:07.315924 on next_max_id 1193848643805073407 with tweet 1193848643805073408 from Mon Nov 11 11:11:11 +0000 2019


0

In [8]:
# some sample tweets for testing

t = {'created_at': 'Fri Oct 18 00:59:13 +0000 2019', 'id': 1184997328572342272, 'id_str': '1184997328572342272', 'text': '$spy bears start to realize that they made a huge mistake this year', 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 856607164533809155, 'id_str': '856607164533809155', 'name': 'WallSt Prophet', 'screen_name': 'TheProphetGod', 'location': None, 'url': None, 'description': None, 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 428, 'friends_count': 241, 'listed_count': 6, 'favourites_count': 2509, 'statuses_count': 6115, 'created_at': 'Mon Apr 24 20:33:57 +0000 2017', 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': 'F5F8FA', 'profile_background_image_url': '', 'profile_background_image_url_https': '', 'profile_background_tile': False, 'profile_link_color': '1DA1F2', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1013926572946948096/x6mzBdV1_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1013926572946948096/x6mzBdV1_normal.jpg', 'default_profile': True, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'quote_count': 0, 'reply_count': 0, 'retweet_count': 0, 'favorite_count': 0, 'entities': {'hashtags': [], 'urls': [], 'user_mentions': [], 'symbols': [{'text': 'spy', 'indices': [0, 4]}]}, 'favorited': False, 'retweeted': False, 'filter_level': 'low', 'lang': 'en', 'timestamp_ms': '1571360353974'}
t_rt = {'created_at': 'Mon Oct 28 17:50:01 +0000 2019', 'id': 1188875583461253123, 'id_str': '1188875583461253123', 'full_text': "RT @NatashaBertrand: Gen. Milley, chairman of the Joint Chiefs of Staff, tells reporters he doesn't know where Trump got his information th…", 'truncated': False, 'display_text_range': [0, 140], 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'NatashaBertrand', 'name': 'Natasha Bertrand', 'id': 372536101, 'id_str': '372536101', 'indices': [3, 19]}], 'urls': []}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 1008894580622876672, 'id_str': '1008894580622876672', 'name': 'Isi Esca', 'screen_name': 'EscaIsi', 'location': '', 'description': '', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 21, 'friends_count': 382, 'listed_count': 0, 'created_at': 'Tue Jun 19 02:09:47 +0000 2018', 'favourites_count': 404, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 216, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'F5F8FA', 'profile_background_image_url': None, 'profile_background_image_url_https': None, 'profile_background_tile': False, 'profile_image_url': 'http://abs.twimg.com/sticky/default_profile_images/default_profile_normal.png', 'profile_image_url_https': 'https://abs.twimg.com/sticky/default_profile_images/default_profile_normal.png', 'profile_link_color': '1DA1F2', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': False, 'default_profile': True, 'default_profile_image': True, 'following': None, 'follow_request_sent': None, 'notifications': None, 'translator_type': 'none'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'retweeted_status': {'created_at': 'Mon Oct 28 17:25:59 +0000 2019', 'id': 1188869533076992000, 'id_str': '1188869533076992000', 'full_text': 'Gen. Milley, chairman of the Joint Chiefs of Staff, tells reporters he doesn\'t know where Trump got his information that Baghdadi was "whimpering and crying" when he died. "I don\'t know what the source of that was."', 'truncated': False, 'display_text_range': [0, 215], 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': []}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 372536101, 'id_str': '372536101', 'name': 'Natasha Bertrand', 'screen_name': 'NatashaBertrand', 'location': 'Washington, DC', 'description': 'National security correspondent @politico. @MSNBC contributor. Send tips: nbertrand@politico.com nbertrand@protonmail.com. DM for Signal.', 'url': 'https://t.co/e0ItgsD6Co', 'entities': {'url': {'urls': [{'url': 'https://t.co/e0ItgsD6Co', 'expanded_url': 'https://www.politico.com/staff/natasha-bertrand', 'display_url': 'politico.com/staff/natasha-…', 'indices': [0, 23]}]}, 'description': {'urls': []}}, 'protected': False, 'followers_count': 542153, 'friends_count': 4809, 'listed_count': 5303, 'created_at': 'Mon Sep 12 23:21:14 +0000 2011', 'favourites_count': 32855, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': True, 'statuses_count': 21721, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'C6E2EE', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme2/bg.gif', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme2/bg.gif', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1156185168916492293/yDeuvV4S_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1156185168916492293/yDeuvV4S_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/372536101/1566394716', 'profile_link_color': '1F97C7', 'profile_sidebar_border_color': 'C6E2EE', 'profile_sidebar_fill_color': 'DAECF4', 'profile_text_color': '663B12', 'profile_use_background_image': True, 'has_extended_profile': False, 'default_profile': False, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None, 'translator_type': 'regular'}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 1426, 'favorite_count': 3424, 'favorited': False, 'retweeted': False, 'lang': 'en'}, 'is_quote_status': False, 'retweet_count': 1426, 'favorite_count': 0, 'favorited': False, 'retweeted': False, 'lang': 'en'}
t_query = {'statuses': [{'created_at': 'Wed Oct 23 00:14:01 +0000 2019',    'id': 1186797891257397248,    'id_str': '1186797891257397248',    'text': '@shwood I picked the ace because I saw the movie "The Princess Bride"😅',    'truncated': False,    'entities': {'hashtags': [{'text': 'india', 'indices': [95, 101]}, {'text': 'english', 'indices': [119, 127]}, {'text': 'online', 'indices': [181, 188]}, {'text': 'altbajali', 'indices': [195, 205]}, {'text': 'zee5', 'indices': [206, 211]}],     'symbols': [],     'user_mentions': [{'screen_name': 'shwood',       'name': 'Brian Brushwood',       'id': 14645160,       'id_str': '14645160',       'indices': [0, 7]}],     'urls': []},    'metadata': {'iso_language_code': 'en', 'result_type': 'recent'},    'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>',    'in_reply_to_status_id': 1186793271571431426,    'in_reply_to_status_id_str': '1186793271571431426',    'in_reply_to_user_id': 14645160,    'in_reply_to_user_id_str': '14645160',    'in_reply_to_screen_name': 'shwood',    'user': {'id': 14481210,     'id_str': '14481210',     'name': 'Theoiv',     'screen_name': 'theoiv',     'location': 'Austin, TX',     'description': 'loving life after the military. loving each day i earned through service to my country while in the US NAVY. Proudly Retired. A husband of 20+ years and a Dad',     'url': None,     'entities': {'description': {'urls': []}},     'protected': False,     'followers_count': 112,     'friends_count': 386,     'listed_count': 3,     'created_at': 'Tue Apr 22 22:01:43 +0000 2008',     'favourites_count': 3152,     'utc_offset': None,     'time_zone': None,     'geo_enabled': True,     'verified': False,     'statuses_count': 926,     'lang': None,     'contributors_enabled': False,     'is_translator': False,     'is_translation_enabled': False,     'profile_background_color': '000000',     'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme5/bg.gif',     'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme5/bg.gif',     'profile_background_tile': False,     'profile_image_url': 'http://pbs.twimg.com/profile_images/1084638901791346688/errTiEjB_normal.jpg',     'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1084638901791346688/errTiEjB_normal.jpg',     'profile_banner_url': 'https://pbs.twimg.com/profile_banners/14481210/1487056356',     'profile_link_color': '3B94D9',     'profile_sidebar_border_color': '000000',     'profile_sidebar_fill_color': '000000',     'profile_text_color': '000000',     'profile_use_background_image': False,     'has_extended_profile': True,     'default_profile': False,     'default_profile_image': False,     'following': None,     'follow_request_sent': None,     'notifications': None,     'translator_type': 'none'},    'geo': None,    'coordinates': None,    'place': None,    'contributors': None,    'is_quote_status': False,    'retweet_count': 0,    'favorite_count': 0,    'favorited': False,    'retweeted': False,    'lang': 'en'},   {'created_at': 'Wed Oct 23 00:13:40 +0000 2019',    'id': 1186797804817006592,    'id_str': '1186797804817006592',    'text': 'Hey, @BuckSexton, Bernie sounds like the bishop in The Princess Bride in his little "If you are prepared..." tirade… https://t.co/uCC9d0UlNG',    'truncated': True,    'entities': {'hashtags': [],     'symbols': [],     'user_mentions': [{'screen_name': 'BuckSexton',       'name': 'Buck Sexton',       'id': 334715818,       'id_str': '334715818',       'indices': [5, 16]}],     'urls': [{'url': 'https://t.co/uCC9d0UlNG',       'expanded_url': 'https://twitter.com/i/web/status/1186797804817006592',       'display_url': 'twitter.com/i/web/status/1…',       'indices': [117, 140]}]},    'metadata': {'iso_language_code': 'en', 'result_type': 'recent'},    'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>',    'in_reply_to_status_id': None,    'in_reply_to_status_id_str': None,    'in_reply_to_user_id': None,    'in_reply_to_user_id_str': None,    'in_reply_to_screen_name': None,    'user': {'id': 1482860016,     'id_str': '1482860016',     'name': 'J.J. Sawyer Phillips',     'screen_name': 'JJSawyerPhillip',     'location': '',     'description': "Writing is not an occupation, it's not a job; writing is a calling, an identity, a beautiful madness that you never want to let go.",     'url': None,     'entities': {'description': {'urls': []}},     'protected': False,     'followers_count': 102,     'friends_count': 412,     'listed_count': 7,     'created_at': 'Tue Jun 04 18:01:44 +0000 2013',     'favourites_count': 13840,     'utc_offset': None,     'time_zone': None,     'geo_enabled': False,     'verified': False,     'statuses_count': 3987,     'lang': None,     'contributors_enabled': False,     'is_translator': False,     'is_translation_enabled': False,     'profile_background_color': 'C0DEED',     'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png',     'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png',     'profile_background_tile': False,     'profile_image_url': 'http://pbs.twimg.com/profile_images/378800000088508694/477f6cee5ccdf9c85b8029aed8882fe6_normal.jpeg',     'profile_image_url_https': 'https://pbs.twimg.com/profile_images/378800000088508694/477f6cee5ccdf9c85b8029aed8882fe6_normal.jpeg',     'profile_banner_url': 'https://pbs.twimg.com/profile_banners/1482860016/1398790435',     'profile_link_color': '1DA1F2',     'profile_sidebar_border_color': 'C0DEED',     'profile_sidebar_fill_color': 'DDEEF6',     'profile_text_color': '333333',     'profile_use_background_image': True,     'has_extended_profile': False,     'default_profile': True,     'default_profile_image': False,     'following': None,     'follow_request_sent': None,     'notifications': None,     'translator_type': 'none'},    'geo': None,    'coordinates': None,    'place': None,    'contributors': None,    'is_quote_status': False,    'retweet_count': 0,    'favorite_count': 0,    'favorited': False,    'retweeted': False,    'lang': 'en'}],  'search_metadata': {'completed_in': 0.016,   'max_id': 1186797891257397248,   'max_id_str': '1186797891257397248',   'next_results': '?max_id=1186797804817006591&q=The%20Princess%20Bride&count=2&include_entities=1',   'query': 'The+Princess+Bride',   'refresh_url': '?since_id=1186797891257397248&q=The%20Princess%20Bride&include_entities=1',   'count': 2,   'since_id': 0,   'since_id_str': '0'}}