In [1]:
import io
import json
import os

import pandas as pd
import requests
import tweepy

# Gathering

### Twitter archive

In [13]:
# Load the Twitter archive CSV from the local data/ directory.
df_twitter_arc = pd.read_csv('data/twitter-archive-enhanced.csv')
# Preview two randomly chosen rows (no random_state, so the rows differ per run).
df_twitter_arc.sample(2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
268,841439858740625411,,,2017-03-14 00:04:30 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have some incredible doggos for #K9Vet...,,,,https://twitter.com/dog_rates/status/841439858...,14,10,,,,,
1461,694925794720792577,,,2016-02-03 16:49:55 +0000,"<a href=""http://vine.co"" rel=""nofollow"">Vine -...",Please only send in dogs. This t-rex is very s...,,,,https://vine.co/v/iJvUqWQ166L,5,10,,,,,


### Image prediction

In [15]:
# Download the image-prediction TSV and parse it into a DataFrame.
# df_prediction stays None when the request does not return HTTP 200; the
# failure is reported with a printed message instead of an exception.
df_prediction = None

r = requests.get('https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv')

# Compare by value: `is` tests object identity, which matches small ints only
# through interpreter caching and triggers a SyntaxWarning on Python 3.8+.
if r.status_code == 200:
    # io.StringIO wraps the response text as a file-like object for read_csv;
    # it replaces pd.compat.StringIO, which newer pandas releases no longer ship.
    df_prediction = pd.read_csv(io.StringIO(r.text), sep='\t')
else:
    print('ERROR: Image prediction request returned {status_code} status code.'.format(status_code = r.status_code))

In [16]:
# Preview two random prediction rows. df_prediction is None when the download
# did not return HTTP 200, in which case this line raises AttributeError.
df_prediction.sample(2)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
1299,752519690950500352,https://pbs.twimg.com/media/CnF8qVDWYAAh0g1.jpg,3,swing,0.999984,False,Labrador_retriever,1e-05,True,Eskimo_dog,1e-06,True
1895,850019790995546112,https://pbs.twimg.com/media/C8vgfTsXgAA561h.jpg,3,Shetland_sheepdog,0.759907,True,collie,0.107405,True,Pembroke,0.052335,True


### Twitter API

In [8]:
# Twitter APP Config
# Read the raw JSON text first, then decode it into the config mapping.
with open('twitter-config.json', mode='r', encoding='utf-8') as file:
    raw_config = file.read()

app_config = json.loads(raw_config)

In [10]:
# Twitter API settings
# Consumer (application) credential pair.
api_key, api_secret = app_config['api_key'], app_config['api_secret']
# Access (user) credential pair.
access_token, access_secret = app_config['access_token'], app_config['access_secret']

In [11]:
# Connect to Twitter API
# Build an OAuth handler from the app credentials, then attach the access token pair.
auth = tweepy.OAuthHandler(api_key, api_secret)
auth.set_access_token(access_token, access_secret)

# wait_on_rate_limit makes the client sleep when the rate limit is hit instead of
# failing; the notify flag prints a "Rate limit reached. Sleeping for: N" message.
# NOTE(review): wait_on_rate_limit_notify appears to exist only in Tweepy 3.x —
# confirm the installed Tweepy version before upgrading.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [40]:
json_path = 'data/tweet_json.txt'
error_log = 'data/tweet_error.log'

# Save tweets from Twitter API
# The whole download is skipped when json_path already exists, so the API loop
# only runs when the file is missing; delete the file to force a re-download.
if not os.path.isfile(json_path):
    for tweet_id in df_twitter_arc.tweet_id:
        try:
            status = api.get_status(tweet_id)

            # Append one JSON document per line. Reopening the file for each
            # tweet keeps everything fetched so far on disk if the run stops.
            with open(json_path, 'a', newline='\n') as file:
                file.write(f'{json.dumps(status._json)}\n')

        except Exception as err:
            # Best-effort download: record the failure and move on to the next
            # tweet. Not every exception carries args, so fall back to
            # repr(err) rather than raising IndexError inside this handler.
            reason = str(err.args[0]) if err.args else repr(err)

            # Explicit UTF-8 so a non-ASCII error message cannot fail on a
            # platform-default codec while logging.
            with open(error_log, 'a', newline='\n', encoding='utf-8') as log:
                log.write(f'{tweet_id}: {reason}\n')

            print(f'{tweet_id}: {reason}')
        

888202515573088257: [{'code': 144, 'message': 'No status found with that ID.'}]

873697596434513921: [{'code': 144, 'message': 'No status found with that ID.'}]

872668790621863937: [{'code': 144, 'message': 'No status found with that ID.'}]

872261713294495745: [{'code': 144, 'message': 'No status found with that ID.'}]

869988702071779329: [{'code': 144, 'message': 'No status found with that ID.'}]

866816280283807744: [{'code': 144, 'message': 'No status found with that ID.'}]

861769973181624320: [{'code': 144, 'message': 'No status found with that ID.'}]

845459076796616705: [{'code': 144, 'message': 'No status found with that ID.'}]

842892208864923648: [{'code': 144, 'message': 'No status found with that ID.'}]

837012587749474308: [{'code': 144, 'message': 'No status found with that ID.'}]

827228250799742977: [{'code': 144, 'message': 'No status found with that ID.'}]

812747805718642688: [{'code': 144, 'message': 'No status found with that ID.'}]

802247111496568832: [{'code'

Rate limit reached. Sleeping for: 619


693095443459342336: Failed to send request: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')",))

680055455951884288: [{'code': 144, 'message': 'No status found with that ID.'}]



# Assess

Quality dimensions:
1. Completeness
2. Validity
3. Accuracy
4. Consistency

### Assess: Twitter data archive

In [41]:
 df_twitter_arccwitter_arcwitter_arcwitter_arc.sample(10)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
2041,671542985629241344,,,2015-12-01 04:14:59 +0000,"<a href=""http://twitter.com/download/iphone"" r...","This is JD (stands for ""just dog""). He's like ...",,,,https://twitter.com/dog_rates/status/671542985...,10,10,JD,,,,
1018,746818907684614144,6.914169e+17,4196984000.0,2016-06-25 21:34:37 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Guys... Dog Jesus 2.0\n13/10 buoyant af https:...,,,,https://twitter.com/dog_rates/status/746818907...,13,10,,,,,
540,806542213899489280,,,2016-12-07 16:53:43 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Waffles. He's concerned that the dandr...,,,,https://twitter.com/dog_rates/status/806542213...,11,10,Waffles,,,,
1409,699060279947165696,,,2016-02-15 02:38:53 +0000,"<a href=""http://vine.co"" rel=""nofollow"">Vine -...",This is Yukon. He pukes rainbows. 12/10 magica...,,,,https://vine.co/v/inlmMHxtqDD,12,10,Yukon,,,,
998,748307329658011649,,,2016-06-30 00:09:04 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This sherk must've leapt out of the water and ...,,,,https://twitter.com/dog_rates/status/748307329...,7,10,,,,,
966,750383411068534784,,,2016-07-05 17:38:41 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Zoe. She was trying to stealthily take...,,,,https://twitter.com/dog_rates/status/750383411...,9,10,Zoe,,,pupper,
1664,682788441537560576,,,2016-01-01 05:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Happy New Year from your fav holiday squad! 🎉 ...,,,,https://twitter.com/dog_rates/status/682788441...,12,10,,,,pupper,
1707,680801747103793152,,,2015-12-26 17:25:59 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Great picture here. Dog on the right panicked ...,,,,https://twitter.com/dog_rates/status/680801747...,10,10,,,,,
787,774639387460112384,,,2016-09-10 16:03:16 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Sprinkles. He's trapped in light jail....,,,,https://twitter.com/dog_rates/status/774639387...,10,10,Sprinkles,,,,
1532,690005060500217858,,,2016-01-21 02:56:40 +0000,"<a href=""http://twitter.com/download/iphone"" r...","""I'm the only one that ever does anything in t...",,,,https://twitter.com/dog_rates/status/690005060...,10,10,,,,,


In [42]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df_twitter_arc.describe()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,retweeted_status_id,retweeted_status_user_id,rating_numerator,rating_denominator
count,2356.0,78.0,78.0,181.0,181.0,2356.0,2356.0
mean,7.427716e+17,7.455079e+17,2.014171e+16,7.7204e+17,1.241698e+16,13.126486,10.455433
std,6.856705e+16,7.582492e+16,1.252797e+17,6.236928e+16,9.599254e+16,45.876648,6.745237
min,6.660209e+17,6.658147e+17,11856340.0,6.661041e+17,783214.0,0.0,0.0
25%,6.783989e+17,6.757419e+17,308637400.0,7.186315e+17,4196984000.0,10.0,10.0
50%,7.196279e+17,7.038708e+17,4196984000.0,7.804657e+17,4196984000.0,11.0,10.0
75%,7.993373e+17,8.257804e+17,4196984000.0,8.203146e+17,4196984000.0,12.0,10.0
max,8.924206e+17,8.862664e+17,8.405479e+17,8.87474e+17,7.874618e+17,1776.0,170.0


In [43]:
# Per-column dtypes and non-null counts, used to spot missing values and wrong types.
df_twitter_arc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

### Quality
#### 

# Clean

# Store

# Reports

* Data wrangling efforts
* Analyses and visualizations