# Using Tweepy for Twitter Analysis

http://docs.tweepy.org/en/v3.9.0/getting_started.html

In [None]:
!pip install textblob

In [None]:
!pip install tweepy

In [None]:
!pip install wordcloud

## Bring in the libraries

In [None]:
# import os
# the regulars
import pandas as pd
import numpy as np
import plotly.express as px
import osmnx as ox

# to get tweets
import tweepy as tw

# for sentiment analysis
from textblob import TextBlob
import re

# word clouds
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# Twitter with tweepy

In order to use Twitter's API, you will need a developer account. You will then be able to generate the keys and tokens needed to use the API.

- http://docs.tweepy.org/en/latest/


In [None]:
# your twitter keys/secrets/tokens
consumer_key= ''
consumer_secret= ''
access_token= ''
access_token_secret= ''

In [None]:
# authenticate thyself with twitter
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tw.API(auth, wait_on_rate_limit=True)

## Tweets by username
- http://docs.tweepy.org/en/latest/api.html#API.user_timeline

In [None]:
# Creation of query method using parameters
tweets = tw.Cursor(api.user_timeline, id='BillGates', tweet_mode='extended').items(150)

In [None]:
for index, tweet in enumerate(tweets):
    print(index, tweet.full_text)

## Tweets by keyword

* search parameters: http://docs.tweepy.org/en/latest/api.html#search-methods

In [None]:
# search query
q = 'covid'

# filter out retweets (optional of course)
q = q + " -filter:retweets"

# how many?
max_tweets = 500
 
# Creation of query method using parameters
tweets = tw.Cursor(api.search,q=q, tweet_mode='extended').items(max_tweets)

In [None]:
for index, tweet in enumerate(tweets):
    print(index, tweet.full_text)

## Tweets by keyword and place

In [None]:
tweets = tw.Cursor(api.search,q=q,geocode='34.068921,-118.4473751,50km', tweet_mode='extended').items(max_tweets)

In [None]:
for index, tweet in enumerate(tweets):
    print(index, tweet.full_text)

# The tweet object

The object returned by `tw.Cursor(...).items()` is an *iterator*: it fetches tweets lazily and can only be looped over once, after which it is exhausted (to reuse the results, store them first, e.g. `tweets = list(tweets)`). For that reason, run the search again.

In [None]:
# search for covid tweets in LA
tweets = tw.Cursor(api.search,q=q,geocode='34.068921,-118.4473751,50km', tweet_mode='extended').items(max_tweets)

The tweet object is in *json* format. We can convert it into a dataframe for easier access:

In [None]:
json_data = [tweet._json for tweet in tweets]
df = pd.json_normalize(json_data)
df.head()

In [None]:
df.info(verbose=True)

That's a lot of columns! Twitter saves a ton of metadata for each tweet... Let's clean this up.

In [None]:
# keep only the columns we need; take an explicit copy so that adding columns
# to `df` later does not trigger pandas' SettingWithCopyWarning
df = df[['created_at','full_text','user.screen_name','user.profile_image_url_https']].copy()

In [None]:
df.columns = ['created_at','text','screen_name','profile_image']

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
df.sample(5)

Just for fun, let's convert the profile image URLs into actual images. This is somewhat of a hack, and only works with the applied code below (i.e., it's not baked into the dataframe).

In [None]:
from IPython.display import Image, HTML

# df = pd.DataFrame(['./image01.png', './image02.png'], columns = ['Image'])

def path_to_image_html(path):
    """Wrap an image location string in an HTML ``<img>`` tag."""
    pieces = ('<img src="', path, '"/>')
    return ''.join(pieces)

pd.set_option('display.max_colwidth', None)

HTML(df.to_html(escape=False ,formatters=dict(profile_image=path_to_image_html)))

<div class="alert alert-info">
Now it's your turn! Get creative and create your own twitter search queries on matters that interest you.
</div>

## Word Cloud
Word clouds are a great way to visually display word frequencies. The algorithm is simple: the more frequently a word appears, the larger its font size; less frequent words get smaller font sizes.

We will use the [word_cloud library](https://github.com/amueller/word_cloud).

First though, we need to clean up the tweets. Tweets are notorious for having strange characters and emojis!

In [None]:
# function to clean tweets using regular expressions
def clean_tweet(tweet):
    """Return ``tweet`` stripped of mentions, URLs and non-alphanumeric characters.

    Three kinds of text are removed:

    * ``@mentions`` (letters, digits and underscores after the ``@``),
    * any character that is not an ASCII letter, digit, space or tab,
    * URLs of the form ``scheme://...``.

    Runs of whitespace in what remains are collapsed to single spaces.
    """
    # Raw string so the regex escapes are not treated as (invalid) string
    # escapes. The mention pattern includes "_" so that handles such as
    # "@user_name" are removed completely instead of leaving "name" behind.
    pattern = r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+://\S+)"
    return ' '.join(re.sub(pattern, "", tweet).split())

In [None]:
# an example of cleaning a tweet
tweet = df.sample().text.values[0]
print(tweet)
clean_tweet(tweet)

In [None]:
# create a new column for the clean text
df['clean_text'] = ''

In [None]:
# run every tweet through clean_tweet and store the result in the new column
df['clean_text'] = df['text'].apply(clean_tweet)

In [None]:
df.sample(5)[['text','clean_text']]

Great. Every tweet has been cleaned up. In order to create a word cloud, we need to create a single variable that has every word in every tweet from our twitter dataframe. We then feed that to the word cloud factory that will generate the word cloud for us.

In [None]:
# now put every word into a single variable
all_text = ' '.join(df['clean_text'])
all_text

In [None]:
# create the word cloud
wordcloud = WordCloud(width=1200, 
                      height=800,
                      background_color="white").generate(all_text)

# Display the WordCloud                    
plt.figure(figsize=(12,12))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

## Sentiment Analysis

For sentiment analysis, we will use the [textblob](https://textblob.readthedocs.io/en/dev/) python library.

The sentiment property returns a tuple of the form `Sentiment(polarity, subjectivity)`. The polarity score ranges from -1 (most negative) to +1 (most positive). The subjectivity ranges from 0 to 1, where 0.0 is very objective and 1.0 is very subjective.

Let's test this out on a random tweet.

In [None]:
# get a random tweet
tweet = df.sample().clean_text.values[0]
print(tweet)

# analyze the tweet
a = TextBlob(tweet)

# results
a.sentiment

In [None]:
# create a new (empty) column for polarity
df['polarity']=''

In [None]:
# compute the polarity of every tweet and store it in our new column;
# assigning the whole column at once gives it a numeric dtype, so the
# comparisons, histogram and mean used later operate on numbers
df['polarity'] = df['text'].apply(lambda text: TextBlob(text).polarity)

In [None]:
df[['clean_text','polarity']].sample(5)

Let's quantify the results. Tweets are either positive, neutral, or negative, so let's give them categorical values.

Numpy has a convenient function `.select` that allows you to generate a categorical ranking based on conditional arguments on a given column. In other words, we can assign tweets to be "positive" or "negative" based on their polarity values.

In [None]:
# create a list of our conditions
conditions = [
    (df['polarity'] < 0), # negative
    (df['polarity'] == 0), # neutral
    (df['polarity'] > 0) # positive
    ]

# create a list of the values we want to assign for each condition
values = [
    'negative', 
    'neutral', 
    'positive'
    ]

# create a new column and use np.select to assign values to it using our lists as arguments
# (np.select's implicit default is the integer 0, which newer NumPy releases
# refuse to combine with string choices, so give it a string default instead)
df['sentiment'] = np.select(conditions, values, default='neutral')

In [None]:
# display updated DataFrame
df.sample(5)[['clean_text','polarity','sentiment']]

In [None]:
fig = px.pie(df, 
             names='sentiment',
             width=600,
             title='Sentiment analysis for '+q,
             color='sentiment',
             color_discrete_map={'positive':'#91cf60','neutral':'#ffffbf','negative':'#d73027'}
            )
fig.update_traces(textinfo='value')
fig.show()

In [None]:
num_bins = 50
plt.figure(figsize=(10,6))
n, bins, patches = plt.hist(df.polarity, num_bins, facecolor='blue', alpha=0.5)
plt.xlabel('Polarity')
plt.ylabel('Count')
plt.title('Histogram of polarity')
plt.show();

# Let's make a function!

In [None]:
def find_tweets(q,place,distance='50km',count=500):
    """Search tweets around a place and summarise their sentiment.

    Geocodes ``place``, searches for tweets matching ``q`` (retweets are
    filtered out) within ``distance`` of that point, then displays a word
    cloud, a sentiment pie chart and a polarity histogram.

    Parameters
    ----------
    q : str
        Search query.
    place : str
        Place description passed to ``ox.geocoder.geocode``.
    distance : str
        Search radius including its unit, e.g. ``'50km'``.
    count : int
        Maximum number of tweets to retrieve.

    Returns
    -------
    pandas.DataFrame
        A random sample of up to five rows with the columns ``clean_text``
        and ``polarity``. Empty when the search returns no tweets.
    """

    #
    # geocode the place to get coordinates
    #

    g = ox.geocoder.geocode(place)

    # build a plain 'lat,lng,radius' string (e.g. '34.068921,-118.4473751,50km');
    # no quote characters are embedded in the value
    geocode = str(g[0])+','+str(g[1])+','+distance

    #
    # grab the tweets
    #

    tweets = tw.Cursor(api.search,
                       q=q+' -filter:retweets', # no retweets
                       geocode=geocode,
                       tweet_mode='extended').items(count)

    #
    # create a dataframe
    #

    json_data = [tweet._json for tweet in tweets]

    # nothing came back: there are no columns to select and nothing to plot
    if not json_data:
        print('No tweets found for '+q)
        return pd.DataFrame(columns=['clean_text','polarity'])

    df = pd.json_normalize(json_data)

    # keep the columns we need; copy so new columns can be added without
    # a SettingWithCopyWarning
    df = df[['created_at','full_text']].copy()

    # clean the text
    df['clean_text'] = df['full_text'].apply(clean_tweet)

    #
    # word cloud
    #

    all_text = ' '.join(df['clean_text'])

    # skip the word cloud when cleaning left no words to draw
    if all_text.strip():
        wordcloud = WordCloud(width=1200,
                              height=800,
                              background_color="white").generate(all_text)

        # Display the WordCloud
        plt.figure(figsize=(12,12))
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        plt.show()

    #
    # sentiment analysis
    #

    # assigning the whole column at once keeps polarity numeric
    df['polarity'] = df['full_text'].apply(lambda text: TextBlob(text).polarity)

    # create a list of our conditions
    conditions = [
        (df['polarity'] < 0), # negative
        (df['polarity'] == 0), # neutral
        (df['polarity'] > 0) # positive
        ]

    # create a list of the values we want to assign for each condition
    values = [
        'negative',
        'neutral',
        'positive'
        ]

    # string default: np.select's implicit integer 0 cannot be combined with
    # string choices on newer NumPy releases
    df['sentiment'] = np.select(conditions, values, default='neutral')

    fig = px.pie(df,
                 names='sentiment',
                 width=600,
                 title='Sentiment analysis for '+q,
                 color='sentiment',
                 color_discrete_map={'positive':'#91cf60','neutral':'#ffffbf','negative':'#d73027'}
                )
    fig.update_traces(textinfo='value')
    fig.show()

    num_bins = 50
    plt.figure(figsize=(10,6))
    plt.hist(df.polarity, num_bins, facecolor='blue', alpha=0.5)
    plt.xlabel('Polarity')
    plt.ylabel('Count')
    plt.title('Average polarity: '+str(df.polarity.mean()))
    plt.show()

    # sample at most as many rows as exist so small result sets do not raise
    return df.sample(min(5, len(df)))[['clean_text','polarity']]

In [None]:
find_tweets(q='trump',place='90095')

In [None]:
find_tweets(q='biden',place='90095')

## By user

In [None]:
def find_tweets_user(u,count=500):
    """Fetch a user's timeline and summarise the sentiment of their tweets.

    Retrieves up to ``count`` tweets from the timeline of ``u``, then
    displays a word cloud, a sentiment pie chart and a polarity histogram.

    Parameters
    ----------
    u : str
        User identifier passed as ``id`` to ``api.user_timeline``.
    count : int
        Maximum number of tweets to retrieve.

    Returns
    -------
    pandas.DataFrame
        A random sample of up to five rows with the columns ``clean_text``
        and ``polarity``. Empty when the timeline returns no tweets.
    """

    #
    # grab the tweets
    #
    tweets = tw.Cursor(api.user_timeline,
                       id=u,
                       tweet_mode='extended').items(count)

    #
    # create a dataframe
    #

    json_data = [tweet._json for tweet in tweets]

    # nothing came back: there are no columns to select and nothing to plot
    if not json_data:
        print('No tweets found for '+str(u))
        return pd.DataFrame(columns=['clean_text','polarity'])

    df = pd.json_normalize(json_data)

    # keep the columns we need; copy so new columns can be added without
    # a SettingWithCopyWarning
    df = df[['created_at','full_text']].copy()

    # clean the text
    df['clean_text'] = df['full_text'].apply(clean_tweet)

    #
    # word cloud
    #

    all_text = ' '.join(df['clean_text'])

    # skip the word cloud when cleaning left no words to draw
    if all_text.strip():
        wordcloud = WordCloud(width=1200,
                              height=800,
                              background_color="white").generate(all_text)

        # Display the WordCloud
        plt.figure(figsize=(12,12))
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        plt.show()

    #
    # sentiment analysis
    #

    # assigning the whole column at once keeps polarity numeric
    df['polarity'] = df['full_text'].apply(lambda text: TextBlob(text).polarity)

    # create a list of our conditions
    conditions = [
        (df['polarity'] < 0), # negative
        (df['polarity'] == 0), # neutral
        (df['polarity'] > 0) # positive
        ]

    # create a list of the values we want to assign for each condition
    values = [
        'negative',
        'neutral',
        'positive'
        ]

    # string default: np.select's implicit integer 0 cannot be combined with
    # string choices on newer NumPy releases
    df['sentiment'] = np.select(conditions, values, default='neutral')

    # title with the user being analysed (the parameter `u`), not the
    # unrelated notebook-level search query `q`
    fig = px.pie(df,
                 names='sentiment',
                 width=600,
                 title='Sentiment analysis for '+str(u),
                 color='sentiment',
                 color_discrete_map={'positive':'#91cf60','neutral':'#ffffbf','negative':'#d73027'}
                )
    fig.update_traces(textinfo='value')
    fig.show()

    num_bins = 50
    plt.figure(figsize=(10,6))
    plt.hist(df.polarity, num_bins, facecolor='blue', alpha=0.5)
    plt.xlabel('Polarity')
    plt.ylabel('Count')
    plt.title('Average polarity: '+str(df.polarity.mean()))
    plt.show()

    # sample at most as many rows as exist so small result sets do not raise
    return df.sample(min(5, len(df)))[['clean_text','polarity']]

In [None]:
find_tweets_user(u='realDonaldTrump')

In [None]:
find_tweets_user(u='JoeBiden')