### Twitter API Scraping and Data Wrangling Case Study
Timothy Short

In [299]:
import pandas as pd
import requests
from io import StringIO
import json
import tweepy #pip install tweepy
import time
import matplotlib.pyplot as plt
%matplotlib inline

#### Step 1: Load the datasets

In [254]:
#load archived tweets
df_tweets = pd.read_csv('twitter-archive-enhanced.csv')
print(df_tweets.shape[0])
df_tweets.head(1)

2356


Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,


In [255]:
#download dataset for images
images_url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
images = requests.get(images_url)
with open('image_predictions.tsv', mode='wb') as file:
    file.write(images.content)
df_predictions = pd.read_csv('image_predictions.tsv', delimiter='\t')

<hr>
#### Step 2: Investigate and Observe the Tweets Dataset

In [256]:
df_tweets.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [257]:
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [258]:
df_tweets[df_tweets['rating_denominator']!=10].shape[0]

23

In [266]:
df_tweets[df_tweets['rating_numerator'] > 15]['rating_numerator'].value_counts()

420     2
75      2
60      1
27      1
99      1
165     1
80      1
144     1
44      1
204     1
45      1
88      1
143     1
1776    1
17      1
50      1
26      1
84      1
182     1
20      1
24      1
121     1
666     1
960     1
Name: rating_numerator, dtype: int64

In [53]:
df_tweets[df_tweets['name'] == 'None'].shape[0]

745

In [13]:
df_tweets.query(
    'doggo == "None"'  and 'floofer == "None"' and 'pupper == "None"' and 'puppo == "None"'
        ).shape[0]

2326

In [237]:
#Check for different values of dog stages
print(df_tweets[df_tweets['doggo'] != "None"]['doggo'].value_counts())
print(df_tweets[df_tweets['floofer'] != "None"]['floofer'].value_counts())
print(df_tweets[df_tweets['pupper'] != "None"]['pupper'].value_counts())
print(df_tweets[df_tweets['puppo'] != "None"]['puppo'].value_counts())

doggo    97
Name: doggo, dtype: int64
floofer    10
Name: floofer, dtype: int64
pupper    257
Name: pupper, dtype: int64
puppo    30
Name: puppo, dtype: int64


**Observations**
- 181 of the tweets were "retweeted" and 78 of the tweets were "replies"
- `tweet_id` is as an Integer; should be a String
- `timestamp` field is a String (not in DateTime), same is true for `retweeted_status_timestamp`
- `expanded_urls` (the URL of the photo) is missing in 59 tweets
- `text` field contains multiple data points (actual text, rating, url, and dog name)
- `rating_denominator` has 23 records with a value other than 10
- `rating_numerator` has widely ranging values
- `name` has 745 records with 'None' as the dog name
- There are 2326 records where the dog stage was not identified (as `doggo, floofer, pupper, puppo`)

In [14]:
df_predictions.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


<hr>
#### Step 3: Clean the Tweets dataset

In [285]:
#copy the tweets dataframe
df_tweets_copy = df_tweets.copy()

In [286]:
#remove retweets
df_tweets_copy = df_tweets_copy[df_tweets_copy['retweeted_status_id'].isnull()]

#remove replies
df_tweets_copy = df_tweets_copy[df_tweets_copy['in_reply_to_status_id'].isnull()]

#remove tweets with no photos
df_tweets_copy = df_tweets_copy[df_tweets_copy['expanded_urls'].notnull()]

df_tweets_copy.reset_index(inplace=True)

In [287]:
#clean extracted data - rating
#look for denominators other than 10
pd.options.display.max_colwidth = 150
df_tweets_copy[df_tweets_copy['rating_denominator'] != 10][['text', 'rating_numerator', 'rating_denominator']]

Unnamed: 0,text,rating_numerator,rating_denominator
338,The floofs have been released I repeat the floofs have been released. 84/70 https://t.co/NIYC820tmd,84,70
402,Meet Sam. She smiles 24/7 &amp; secretly aspires to be a reindeer. \nKeep Sam smiling by clicking and sharing this link:\nhttps://t.co/98tB8y7y7t ...,24,7
698,Why does this never happen at my front door... 165/150 https://t.co/HmwrdfEfUE,165,150
851,"After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https://t.co/XAVDNDaVgQ",9,11
902,Say hello to this unbelievably well behaved squad of doggos. 204/170 would try to pet all at once https://t.co/yGQI3He3xv,204,170
946,Happy 4/20 from the squad! 13/10 for all https://t.co/eV1diwds8a,4,20
983,This is Bluebert. He just saw that both #FinalFur match ups are split 50/50. Amazed af. 11/10 https://t.co/Kky1DPG4iq,50,50
1009,Happy Saturday here's 9 puppers on a bench. 99/90 good work everybody https://t.co/mpvaVxKmc1,99,90
1034,Here's a brigade of puppers. All look very prepared for whatever happens next. 80/80 https://t.co/0eb7R1Om12,80,80
1054,"From left to right:\nCletus, Jerome, Alejandro, Burp, &amp; Titson\nNone know where camera is. 45/50 would hug all at once https://t.co/sedre1ivTK",45,50


In [288]:
#manually update errors - based on denominator
df_tweets_copy.set_value(851, 'rating_numerator', 9);
df_tweets_copy.set_value(946, 'rating_numerator', 13);
df_tweets_copy.set_value(1423, 'rating_numerator', 10);
df_tweets_copy.set_value(2073, 'rating_numerator', 9);
df_tweets_copy.set_value([851, 946, 1423, 2073], 'rating_denominator', 10);

df_tweets_copy.drop(402, inplace=True)
df_tweets_copy.reset_index()

#update ratings for photos of multiple dogs - look for deniminators greater than 10 in multiples of 10
for row in df_tweets_copy[df_tweets_copy['rating_denominator'] != 10][['text', 'rating_numerator', 'rating_denominator']].itertuples():
    divisor = row.rating_denominator / 10
    new_rating = row.rating_numerator / divisor
    index = row[0]
    df_tweets_copy.set_value(index, 'rating_numerator', new_rating)
    df_tweets_copy.set_value(index, 'rating_denominator', 10)

In [289]:
#look for large denominators - greater than 15
df_tweets_copy[df_tweets_copy['rating_numerator'] > 15][['text', 'rating_numerator', 'rating_denominator']]

Unnamed: 0,text,rating_numerator,rating_denominator
527,"This is Logan, the Chow who lived. He solemnly swears he's up to lots of good. H*ckin magical af 9.75/10 https://t.co/yBO5wuqaPS",75,10
584,This is Sophie. She's a Jubilant Bush Pupper. Super h*ckin rare. Appears at random just to smile at the locals. 11.27/10 would smile back https://...,27,10
768,This is Atticus. He's quite simply America af. 1776/10 https://t.co/GRXwMxLBkh,1776,10
1471,Here we have uncovered an entire battalion of holiday puppers. Average of 11.26/10 https://t.co/eNm2S6p9BD,26,10
1818,After so many requests... here you go.\n\nGood dogg. 420/10 https://t.co/yfAAo1gdeY,420,10


In [290]:
#manually update errors
df_tweets_copy.set_value(527, 'rating_numerator', 9.75);
df_tweets_copy.set_value(584, 'rating_numerator', 11.27);
df_tweets_copy.set_value(1471, 'rating_numerator', 11.26);

#remove errors
df_tweets_copy.drop(768, inplace=True)
df_tweets_copy.drop(1818, inplace=True)
df_tweets_copy.reset_index();

<hr>
#### Step 4: Download Twitter Data through API and Merge with Tweet Dataset

In [292]:
#twitter API
consumer_key = ''
consumer_secret = ''
access_token = ''
access_secret = ''

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth)

In [312]:
#read tweets via tweepy API
data = {}  
data['tweets'] = []
start = time.time()
    
for twitters in df_tweets_copy.iloc[:].itertuples():
    try:
        tweet = api.get_status(twitters.tweet_id, tweet_mode='extended')
        retweets = tweet.retweet_count
        favorites = tweet.favorite_count
    except Exception:
        tweet = twitters.tweet_id
        retweets = float('NaN')
        favorites = float('NaN')
    
    data['tweets'].append({'tweet_id' : str(twitters.tweet_id),
                     'retweets' : str(retweets),
                     'favorites' : str(favorites)
                      })

end = time.time()
print('elapsed %f' %(end - start))
    
#write json info to tweet_json.txt
with open('tweet_json.txt', 'w') as outfile:  
    json.dump(data, outfile)

elapsed 650.580002


In [313]:
#read json file
tweet_info = []
with open('tweet_json.txt') as json_file:  
    data = json.load(json_file)
    for p in data['tweets']:
        tweet_info.append(p)

df_twitter_api = pd.DataFrame(tweet_info)

In [322]:
print(df_twitter_api.shape[0])
df_twitter_api.head(1)

2356


Unnamed: 0,favorites,retweets,tweet_id
0,38940,8628,892420643555336193


In [321]:
print(df_tweets_copy.shape[0])
df_tweets_copy.head(1)

2091


Unnamed: 0,index,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU,,,,https://twitter.com/dog_rates/status/892420643555336193/photo/1,13,10,Phineas,,,,


In [319]:
print(df_predictions.shape[0])
df_predictions.head(1)

2075


Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True


In [None]:
#merge dataframes

In [None]:
#export to twitter_archive_master.csv