### Twitter API Scraping and Data Wrangling Case Study
Timothy Short

In [None]:
import pandas as pd
import requests
import json
import tweepy #pip install tweepy
import time
import matplotlib.pyplot as plt
%matplotlib inline

#### Step 1: Load the datasets

In [None]:
# Read the archived tweets from the local CSV export.
df_tweets = pd.read_csv('twitter-archive-enhanced.csv')
# Print the number of rows loaded, then preview the first record.
print(len(df_tweets))
df_tweets.head(1)

In [None]:
#download dataset for images
images_url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
# Use a timeout so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error so an error page is never saved and parsed as TSV.
images = requests.get(images_url, timeout=30)
images.raise_for_status()
# Persist the raw bytes so the file on disk matches what the server sent.
with open('image_predictions.tsv', mode='wb') as file:
    file.write(images.content)
# The file is tab-separated.
df_predictions = pd.read_csv('image_predictions.tsv', delimiter='\t')
df_predictions.head(1)

<hr>
#### Step 2: Investigate and Observe the Tweets Dataset

In [None]:
# Preview the first five archived tweets.
df_tweets.head(n=5)

In [None]:
# Summarise column dtypes and non-null counts of the raw archive.
df_tweets.info()

In [None]:
# Number of tweets whose rating denominator is anything other than 10.
int((df_tweets['rating_denominator'] != 10).sum())

In [None]:
# Frequency of each rating numerator above 15.
large_numerators = df_tweets.loc[df_tweets['rating_numerator'] > 15, 'rating_numerator']
large_numerators.value_counts()

In [None]:
# Number of tweets whose dog name is the placeholder string 'None'.
int((df_tweets['name'] == 'None').sum())

In [None]:
# Count tweets where none of the four dog-stage columns was filled in.
# All conditions must live inside ONE query string: joining separate Python
# string literals with `and` evaluates to just the last literal, so only the
# `puppo` condition would reach DataFrame.query.
df_tweets.query(
    'doggo == "None" and floofer == "None" and pupper == "None" and puppo == "None"'
        ).shape[0]

In [None]:
#Check for different values of dog stages
# Each stage column is named after the stage it flags, so one loop covers all four.
for stage in ('doggo', 'floofer', 'pupper', 'puppo'):
    labelled = df_tweets.loc[df_tweets[stage] != "None", stage]
    print(labelled.value_counts())

**Observations**
- 181 of the tweets were "retweeted" and 78 of the tweets were "replies"
- `tweet_id` is stored as an Integer; it should be a String
- `timestamp` field is a String (not in DateTime), same is true for `retweeted_status_timestamp`
- `expanded_urls` (the URL of the photo) is missing in 59 tweets
- `text` field contains multiple data points (actual text, rating, url, and dog name)
- `rating_denominator` has 23 records with a value other than 10
- `rating_numerator` has widely ranging values
- `name` has 745 records with 'None' as the dog name
- There are 2326 records where the dog stage was not identified (as `doggo, floofer, pupper, puppo`)

<hr>
#### Step 3: Clean the Tweets dataset

In [None]:
#copy the tweets dataframe
# Work on an independent (deep) copy so the raw archive frame stays untouched.
df_tweets_copy = df_tweets.copy(deep=True)

In [None]:
#change tweet_id to String
df_tweets_copy['tweet_id'] = df_tweets_copy['tweet_id'].astype(str)

# Keep only original tweets that have a photo URL:
#  - retweets carry a retweeted_status_id
#  - replies carry an in_reply_to_status_id
#  - tweets with no photos have no expanded_urls
keep_row = (
    df_tweets_copy['retweeted_status_id'].isnull()
    & df_tweets_copy['in_reply_to_status_id'].isnull()
    & df_tweets_copy['expanded_urls'].notnull()
)
# .copy() makes the result an independent frame, so the column assignment
# below does not write into a slice (avoids SettingWithCopyWarning).
df_tweets_copy = df_tweets_copy[keep_row].copy()

#change timestamp to DateTime
df_tweets_copy['timestamp'] = pd.to_datetime(df_tweets_copy['timestamp'])

# Renumber the surviving rows 0..n-1; later cells address rows by these labels.
# drop=True stops the old index from being inserted as a spurious 'index' column.
df_tweets_copy = df_tweets_copy.reset_index(drop=True)

In [None]:
#clean extracted data - rating
# Widen the column display so the tweet text is readable next to its
# extracted rating, then list rows whose denominator is not 10.
pd.set_option('display.max_colwidth', 150)
rating_columns = ['text', 'rating_numerator', 'rating_denominator']
df_tweets_copy.loc[df_tweets_copy['rating_denominator'] != 10, rating_columns]

In [None]:
#manually update errors - based on denominator
# DataFrame.set_value was deprecated in pandas 0.21 and removed in 1.0, so the
# corrections use label-based .at / .loc instead.
# Ratings become fractional once rescaled below, so make the column float up front.
df_tweets_copy['rating_numerator'] = df_tweets_copy['rating_numerator'].astype(float)

# row label -> corrected numerator (each of these rows gets denominator 10)
corrected_numerators = {851: 9, 946: 13, 1423: 10, 2073: 9}
for row_label, numerator in corrected_numerators.items():
    df_tweets_copy.at[row_label, 'rating_numerator'] = numerator
df_tweets_copy.loc[list(corrected_numerators), 'rating_denominator'] = 10

# Remove row 402 entirely.
df_tweets_copy = df_tweets_copy.drop(index=402)
# NOTE: the index is deliberately NOT reset here. The original call
# `df_tweets_copy.reset_index()` discarded its result (a no-op), and later
# cells address rows by the same hard-coded labels.

#update ratings for photos of multiple dogs - look for denominators greater than 10 in multiples of 10
# Rescale every remaining non-10 rating to an equivalent "out of 10" value.
needs_rescale = df_tweets_copy['rating_denominator'] != 10
divisor = df_tweets_copy.loc[needs_rescale, 'rating_denominator'] / 10
df_tweets_copy.loc[needs_rescale, 'rating_numerator'] = (
    df_tweets_copy.loc[needs_rescale, 'rating_numerator'] / divisor
)
df_tweets_copy.loc[needs_rescale, 'rating_denominator'] = 10

In [None]:
#look for large numerators - greater than 15
rating_columns = ['text', 'rating_numerator', 'rating_denominator']
df_tweets_copy.loc[df_tweets_copy['rating_numerator'] > 15, rating_columns]

In [None]:
#manually update errors
# DataFrame.set_value was deprecated in pandas 0.21 and removed in 1.0; use .at.
# The corrected ratings are fractional, so the column must be float.
df_tweets_copy['rating_numerator'] = df_tweets_copy['rating_numerator'].astype(float)

# row label -> corrected numerator
decimal_ratings = {527: 9.75, 584: 11.27, 1471: 11.26}
for row_label, numerator in decimal_ratings.items():
    df_tweets_copy.at[row_label, 'rating_numerator'] = numerator

#remove errors
df_tweets_copy = df_tweets_copy.drop(index=[768, 1818])
# The original trailing `df_tweets_copy.reset_index();` discarded its result,
# so it never changed the frame; it is omitted here.

<hr>
#### Step 4: Download Twitter Data through API and Merge with Tweet Dataset
Download data from Twitter using Tweepy API

In [None]:
#twitter API
# keys.txt is read as one credential per line, in this order:
# consumer key, consumer secret, access token, access secret.
# The context manager closes the file even if reading fails.
with open('keys.txt', 'r') as f_keys:
    keys = {
        key_name: f_keys.readline().rstrip()
        for key_name in ('consumer_key', 'consumer_secret', 'access_token', 'access_secret')
    }

auth = tweepy.OAuthHandler(keys['consumer_key'], keys['consumer_secret'])
auth.set_access_token(keys['access_token'], keys['access_secret'])

# wait_on_rate_limit makes tweepy pause until the rate-limit window resets
# instead of raising. Without it, a long download loop that catches errors
# would record rate-limited tweets as missing.
api = tweepy.API(auth, wait_on_rate_limit=True)

In [None]:
#read tweets via tweepy API
# One record per archived tweet: its id plus the retweet and favorite counts
# reported by the API. Tweets that cannot be fetched keep NaN counts so the
# row is still present; their ids are collected and reported.
tweet_records = []
failed_ids = []
start = time.time()
for twitters in df_tweets_copy.itertuples():
    try:
        tweet = api.get_status(twitters.tweet_id)
        retweets = tweet.retweet_count
        favorites = tweet.favorite_count
    except Exception:
        # Best effort: any failure for a single tweet is recorded, not fatal.
        failed_ids.append(twitters.tweet_id)
        retweets = float('NaN')
        favorites = float('NaN')

    tweet_records.append({
                    'retweets' : retweets,
                    'favorites' : favorites,
                    'tweet_id' : twitters.tweet_id
                      })

end = time.time()
print('elapsed %f' %(end - start))
print('%d of %d tweets could not be fetched' % (len(failed_ids), len(tweet_records)))

#write json info to tweet_json.txt
# NaN counts are written as the token NaN, which json.load reads back as a float.
data = {'tweets': tweet_records}
with open('tweet_json.txt', 'w') as outfile:
    json.dump(data, outfile)

In [None]:
#read json file
# Load the saved API results and turn the list stored under 'tweets'
# into one DataFrame row per record.
with open('tweet_json.txt') as json_file:
    data = json.load(json_file)
    tweet_info = list(data['tweets'])

df_twitter_api = pd.DataFrame(tweet_info)

Merge dataframes from Twitter archives, Tweepy API downloads, and predicted images

In [None]:
# Column summary of the API download; the trailing semicolon keeps the
# one-row preview from being rendered as cell output.
df_twitter_api.info();
df_twitter_api.head(1);

In [None]:
# Column summary of the cleaned archive; the trailing semicolon keeps the
# one-row preview from being rendered as cell output.
df_tweets_copy.info()
df_tweets_copy.head(1);

In [None]:
# tweet_id must be a string here too, so it matches the key type used by the
# other two frames when they are merged on tweet_id.
df_predictions['tweet_id'] = df_predictions['tweet_id'].astype(str)
df_predictions.info()
df_predictions.head(1);

In [None]:
#merge dataframes
# Join API counts with the cleaned archive, then with the image predictions.
# Both joins use the default merge type on tweet_id.
df_master = pd.merge(
    pd.merge(df_twitter_api, df_tweets_copy, on='tweet_id'),
    df_predictions,
    on='tweet_id',
)
df_master['timestamp'] = pd.to_datetime(df_master['timestamp'])
df_master.info()

In [None]:
#export to twitter_archive_master.csv
# index=False: the row index carries no information, and writing it would add
# an extra 'Unnamed: 0' column when the file is read back with read_csv.
df_master.to_csv('twitter_archive_master.csv', index=False)

<hr>
#### Step 5: Analyze Data & Visualizations

In [None]:
# Reload the merged dataset from disk. CSV stores no dtypes, so tweet_id is
# read explicitly as a string; otherwise it would be parsed back as an
# integer, undoing the conversion made during cleaning.
df_master = pd.read_csv('twitter_archive_master.csv', dtype={'tweet_id': str})

In [None]:
# The frame was just reloaded from CSV, so timestamp is text again; parse it
# so the .dt accessor can be used in the time-of-day plots.
df_master['timestamp'] = pd.to_datetime(df_master['timestamp'])

In [None]:
# Histogram of favorite counts, limited to 0-50,000 in 20 bins.
fig, ax = plt.subplots()
ax.hist(df_master['favorites'], range=[0,50000], bins=20)
ax.set_title('Distribution of Tweets "Favorited"')
ax.set_xlabel('Favorites');

In [None]:
# Histogram of retweet counts, limited to 0-25,000 in 20 bins.
fig, ax = plt.subplots()
ax.hist(df_master['retweets'], range=[0,25000], bins=20)
ax.set_title('Distribution of Tweets "Retweeted"')
ax.set_xlabel('Retweets');

In [None]:
# Histogram of the cleaned rating numerators (default binning).
fig, ax = plt.subplots()
ax.hist(df_master['rating_numerator'])
ax.set_title('Distribution of Ratings')
ax.set_xlabel('Ratings');

In [None]:
# Rating against favorite count; low alpha lets dense regions show through.
fig, ax = plt.subplots()
ax.scatter(df_master['favorites'], df_master['rating_numerator'], alpha=.2)
ax.set_title('Rating vs Favorites')
ax.set_ylabel('Rating')
ax.set_xlabel('Favorites');

In [None]:
# Mean rating per dog stage. Each stage column holds its own name when the
# stage applies, so the column name doubles as the value to match.
xlabels = ('doggo', 'floofer', 'pupper', 'puppo')
heights = [
    df_master.loc[df_master[stage] == stage, 'rating_numerator'].mean()
    for stage in xlabels
]
plt.bar([1,2,3,4], heights, tick_label=xlabels);
plt.title('Average Rating by Dog Stage');

In [None]:
# Tweets per hour of day. Explicit edges 0..24 give every hour its own bin.
# With bins=24 the edges are spread between the smallest and largest hour
# present in the data, so bins would not line up with whole hours.
plt.hist(df_master['timestamp'].dt.hour, bins=list(range(25)));
plt.xlabel('Hour (24)');
plt.ylabel('Number of Tweets');
plt.xticks((0,3,7,11,15,19,23));
plt.title('Distribution of Tweets Across Time of Day');

In [None]:
# Favorite count of each tweet against the hour it was posted.
fig, ax = plt.subplots()
ax.scatter(df_master['timestamp'].dt.hour, df_master['favorites'], alpha=.2)
ax.set_xlabel('Hour (24)')
ax.set_ylabel('Number of "Favorites"')
ax.set_xticks((0,3,7,11,15,19,23))
ax.set_title('Distribution of "Favorited" Tweets Across Time of Day');

In [None]:
# Mean rating grouped by which prediction (p1/p2/p3) was flagged as a dog,
# plus a final bar for tweets where none of the three was.
xlabels = ('first (p1)', 'second (p2)', 'third (p3)', 'none')
heights = [
    df_master[df_master[flag] == True]['rating_numerator'].mean()
    for flag in ('p1_dog', 'p2_dog', 'p3_dog')
]
# The three conditions must be in ONE query string: chaining separate string
# literals with Python `and` passes only the last one ('p3_dog == False').
heights.append(
    df_master.query('p1_dog == False and p2_dog == False and p3_dog == False')['rating_numerator'].mean()
)
plt.bar([1,2,3,4], heights, tick_label=xlabels);
plt.title('Rating by Algorithm Effectiveness');
plt.xlabel('Algorithm Succesfully Picked');

In [None]:
# Rating against the confidence of the first prediction (p1).
fig, ax = plt.subplots()
ax.scatter(df_master['p1_conf'], df_master['rating_numerator'], alpha=.1, s=30)
ax.set_title('Rating by Predictive Algorithm\n(Based on confidence of p1)')
ax.set_xlabel('Confidence of Algorithm (p1)')
ax.set_ylabel('Rating');

In [None]:
#determine the effectiveness of the algorithm
# For each threshold 0.1 .. 0.9, take the predictions whose p1 confidence is
# BELOW the threshold and measure the share of them where p1 is a dog.
thresholds = []
accuracy = []
for i in range(10,100, 10):
    threshold = i / 100
    df_sample = df_master[df_master['p1_conf'] < threshold]
    # Share of True values. Unlike 1 - value_counts()[0] / n, this does not
    # depend on how an integer key resolves against a boolean index, and does
    # not raise when a sample contains no False rows.
    score = (df_sample['p1_dog'] == True).mean()
    thresholds.append(threshold)
    accuracy.append(score)
    print('Accuracy at %s%% ' %i, score)

# Scale to percentages with plain Python; numpy is not imported in this notebook.
plt.scatter(thresholds, [100 * score for score in accuracy]);
plt.title('Accuracy of Algorithm');
plt.xlabel('Confidence of Algorithm (p1)');
plt.ylabel('Accuracy Percentage');