## Create Twitter Dataset
Query Twitter for terms of interest and download tweets for further analysis using NLP tools (AWS Comprehend).

In [1]:
import os

import pandas as pd
import tweepy #https://github.com/tweepy/tweepy

# Twitter API credentials.
# Each value is read from an environment variable so real secrets never have to
# be saved in the notebook; if the variable is not set, the original placeholder
# string is used (replace it by hand or export the variable before running).
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "insert_your_key")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "insert_your_key")
access_key = os.environ.get("TWITTER_ACCESS_TOKEN", "insert_your_key")
access_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "insert_your_key")


In [2]:
# Build the OAuth handler from the consumer key/secret, then attach the
# user-level access token and secret defined in the credentials cell.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)

# API client used by the search cell below.
# NOTE(review): wait_on_rate_limit=True presumably makes tweepy pause until the
# rate-limit window resets instead of raising — confirm for the installed version.
api = tweepy.API(auth, wait_on_rate_limit=True)

Get tweets that mention Kroger and are not retweets.

In [3]:
# The search endpoint used here covers only the last 7 days of tweets.

# Search term, with retweets filtered out by the query itself.
query = "kroger -filter:retweets"

# Paginated search: 100 English tweets per request, untruncated ("extended") text.
search_cursor = tweepy.Cursor(
    api.search,
    q=query,
    count=100,
    lang="en",
    tweet_mode="extended",
)

# Keep the raw JSON payload of each status, stopping after at most 10,000 tweets.
tweets = [status._json for status in search_cursor.items(10000)]


In [4]:
# Number of tweets actually collected.
# This can be lower than the 10,000 requested when a topic has fewer matching tweets.
len(tweets)

10000

In [5]:
# Sample tweet: full text of the tweet at position 100
# (requires that at least 101 tweets were collected).
tweets[100]['full_text']

'Who cheaper Kroger or Walmart?'

Convert the list of tweet JSON objects into a DataFrame of selected tweet fields.

In [6]:
# Columns of the resulting frame, in output order.
TWEET_COLUMNS = ['user', 'id_str', 'created_at', 'source',
                 'in_reply_to_screen_name', 'full_text', 'rt', 'rt_name']


def _tweet_to_record(tweet):
    """Flatten one tweet JSON dict into the fields kept for analysis.

    When the tweet carries a complete ``retweeted_status`` payload, the text and
    author screen name of the original tweet are used and ``rt`` is True.
    Otherwise the tweet's own ``full_text`` is used, ``rt`` is False and
    ``rt_name`` is an empty string.

    Raises KeyError if a required top-level field is missing from ``tweet``.
    """
    try:
        original = tweet['retweeted_status']
        full_text = original['full_text']
        rt_name = original['user']['screen_name']
        is_retweet = True
    except KeyError:
        # Not a retweet (or the retweet payload is incomplete): use the tweet itself.
        full_text = tweet['full_text']
        rt_name = ''
        is_retweet = False

    return {'user': tweet['user']['screen_name'],
            'id_str': tweet['id_str'],
            'created_at': tweet['created_at'],
            'source': tweet['source'],
            'in_reply_to_screen_name': tweet['in_reply_to_screen_name'],
            'full_text': full_text,
            'rt': is_retweet,
            'rt_name': rt_name}


# Build the frame in one step from a list of records. Appending one-row frames
# in a loop is quadratic, and DataFrame.append no longer exists in pandas 2.x.
# Passing the column list keeps the later steps working when `tweets` is empty.
tweet_df = pd.DataFrame([_tweet_to_record(tweet) for tweet in tweets],
                        columns=TWEET_COLUMNS)

# Shift the timestamps back by five hours, i.e. to UTC-5 (not "GMT +5").
# NOTE(review): this is a fixed offset, so daylight saving time is ignored, and it
# presumes Twitter reports created_at in UTC — confirm before relying on local times.
tweet_df['created_at'] = pd.to_datetime(tweet_df['created_at']) - pd.Timedelta(5, unit = 'h')
tweet_df['id_str'] = 'id = ' + tweet_df['id_str'] # helps with CSV to Excel transition

# Amazon Comprehend prep
# Replace every line break (\n or \r) in the tweet text with a single space.
tweet_df['full_text'] = tweet_df['full_text'].str.replace(r'[\r\n]', ' ', regex=True)

Export the tweet text to CSV (one tweet per row, no header) for Amazon Comprehend.

In [7]:
# Destination of the exported file (relative to the notebook's working directory).
path = '../NLP/Comprehend/'
name = 'kroger_tweets.csv'
output_name = path + name

# Create the target directory if it is missing so the export does not fail
# with FileNotFoundError on a fresh checkout.
os.makedirs(path, exist_ok=True)

# Write only the tweet text: one row per tweet, no index column, no header row.
tweet_df['full_text'].to_csv(output_name, encoding = 'utf-8', index = False, header = False)

In [8]:
# Display the path the CSV export was written to.
output_name

'../NLP/Comprehend/kroger_tweets.csv'