# Import Modules
---

In [1]:
import pandas as pd
import numpy as np
import requests
import tweepy
import json
import os

from tqdm.notebook import tqdm

# I. Gathering Data 
---

In [2]:
# Folder (relative path) that holds every input file loaded in this section.
data_dir = "./data"

### a. WeRateDogs Twitter Archive

In [3]:
# Load the WeRateDogs Twitter archive CSV that sits in data_dir.
wrd_data_filepath = 'twitter-archive-enhanced.csv'
wrd_archive_path = os.path.join(data_dir, wrd_data_filepath)
df_wrd_twitter_old = pd.read_csv(wrd_archive_path)

### b. Tweet Image Predictions

In [4]:
# Location of the image-predictions TSV inside data_dir; it is read with pandas further down.
image_predictions_filepath = os.path.join(data_dir, 'image-predictions.tsv')

In [5]:
# Download image-predictions.tsv only when it is not already on disk, so a
# fresh "Restart & Run All" works without anyone having to uncomment code.
if not os.path.exists(image_predictions_filepath):
    url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'

    # A timeout keeps the notebook from hanging forever on a dead connection.
    response = requests.get(url, allow_redirects=True, timeout=30)
    # Fail loudly on an HTTP error instead of saving an error page as the TSV.
    response.raise_for_status()

    with open(image_predictions_filepath, 'wb') as f:
        f.write(response.content)

In [6]:
# The predictions file is tab-separated, so the delimiter is passed explicitly.
df_image_predictions_old = pd.read_csv(image_predictions_filepath, sep="\t")

### c. Any Additional Data

In [7]:
# File inside data_dir that holds the tweet JSON gathered through the Twitter API.
json_filepath = os.path.join(data_dir, "tweet_json.txt")

In [8]:
'''
Uncomment the following code to download data if file(tweet_json.txt) does not exist
'''

# (Disabled) Read the Twitter API credentials from a local JSON file so that
# no secret is hardcoded in the notebook.
# with open('twitter-credential.json') as f:
#     credentials = json.load(f)

'\nUncomment the following code to download data if file(tweet_json.txt) does not exist\n'

In [9]:
'''
Uncomment the following code to download data if file(tweet_json.txt) does not exist
'''

# (Disabled) Build an authenticated Tweepy client from the loaded credentials.
# wait_on_rate_limit=True asks the client to wait when the rate limit is hit.
# NOTE(review): wait_on_rate_limit_notify was removed in Tweepy 4.x — confirm
# the installed Tweepy version before uncommenting.
# auth = tweepy.OAuthHandler(credentials['consumer_key'], credentials['consumer_secret'])

# auth.set_access_token(credentials['access_token'], credentials['access_token_secret'])

# api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

'\nUncomment the following code to download data if file(tweet_json.txt) does not exist\n'

In [10]:
'''
Uncomment the following code to download data if file(tweet_json.txt) does not exist
'''

# (Disabled) Request the extended status of every tweet id in the archive and
# keep the raw JSON payload of each one that could be fetched.
# NOTE(review): the bare `except:` skips every failure silently (it would also
# swallow KeyboardInterrupt), so there is no record of which ids were not
# retrieved. Consider catching Tweepy's own error class and collecting the
# failed ids.
# tweet_ids = df_wrd_twitter_old['tweet_id']

# statuses = []

# for tweet_id in tqdm(tweet_ids):

#     try:

#         status = api.get_status(tweet_id, tweet_mode='extended')

#         statuses.append(status._json)

#     except:

#         continue

'\nUncomment the following code to download data if file(tweet_json.txt) does not exist\n'

In [11]:
'''
Uncomment the following code to download data if file(tweet_json.txt) does not exist
'''

# (Disabled) Persist the fetched statuses as one JSON document per line, which
# is the layout the later pd.read_json(..., lines=True) call expects.
# with open(json_filepath, 'w') as outfile:
#     for status in statuses:
#         json.dump(status, outfile)
#         outfile.write("\n")

'\nUncomment the following code to download data if file(tweet_json.txt) does not exist\n'

In [12]:
# lines=True: the file is parsed as one JSON object per line rather than as a single JSON document.
df_tweet_json_old = pd.read_json(json_filepath, lines=True)

# II. Assessing Data
---

In [13]:
# Column names, dtypes and non-null counts of the Twitter archive.
df_wrd_twitter_old.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [14]:
# Distinct rating denominators that occur in the archive.
df_wrd_twitter_old['rating_denominator'].unique()

array([ 10,   0,  15,  70,   7,  11, 150, 170,  20,  50,  90,  80,  40,
       130, 110,  16, 120,   2])

In [15]:
# Numerators recorded on the rows whose denominator is zero.
df_wrd_twitter_old.loc[
    df_wrd_twitter_old['rating_denominator'].eq(0),
    'rating_numerator',
].unique()

array([960])

In [16]:
# Row labels of the entries whose denominator is zero.
df_wrd_twitter_old.loc[df_wrd_twitter_old['rating_denominator'].eq(0)].index

Int64Index([313], dtype='int64')

In [17]:
# Every distinct value of the name column, to spot entries that are not dog names.
df_wrd_twitter_old['name'].unique()

array(['Phineas', 'Tilly', 'Archie', 'Darla', 'Franklin', 'None', 'Jax',
       'Zoey', 'Cassie', 'Koda', 'Bruno', 'Ted', 'Stuart', 'Oliver',
       'Jim', 'Zeke', 'Ralphus', 'Canela', 'Gerald', 'Jeffrey', 'such',
       'Maya', 'Mingus', 'Derek', 'Roscoe', 'Waffles', 'Jimbo', 'Maisey',
       'Lilly', 'Earl', 'Lola', 'Kevin', 'Yogi', 'Noah', 'Bella',
       'Grizzwald', 'Rusty', 'Gus', 'Stanley', 'Alfy', 'Koko', 'Rey',
       'Gary', 'a', 'Elliot', 'Louis', 'Jesse', 'Romeo', 'Bailey',
       'Duddles', 'Jack', 'Emmy', 'Steven', 'Beau', 'Snoopy', 'Shadow',
       'Terrance', 'Aja', 'Penny', 'Dante', 'Nelly', 'Ginger', 'Benedict',
       'Venti', 'Goose', 'Nugget', 'Cash', 'Coco', 'Jed', 'Sebastian',
       'Walter', 'Sierra', 'Monkey', 'Harry', 'Kody', 'Lassie', 'Rover',
       'Napolean', 'Dawn', 'Boomer', 'Cody', 'Rumble', 'Clifford',
       'quite', 'Dewey', 'Scout', 'Gizmo', 'Cooper', 'Harold', 'Shikha',
       'Jamesy', 'Lili', 'Sammy', 'Meatball', 'Paisley', 'Albus',
       'Nept

In [18]:
# Distinct all-lowercase values in the name column.
df_wrd_twitter_old.loc[
    df_wrd_twitter_old['name'].str.islower(),
    'name',
].unique()

array(['such', 'a', 'quite', 'not', 'one', 'incredibly', 'mad', 'an',
       'very', 'just', 'my', 'his', 'actually', 'getting', 'this',
       'unacceptable', 'all', 'old', 'infuriating', 'the', 'by',
       'officially', 'life', 'light', 'space'], dtype=object)

In [19]:
# Column names, dtypes and non-null counts of the image predictions.
df_image_predictions_old.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [20]:
# Distinct values of p1, to check capitalisation and separator consistency.
df_image_predictions_old['p1'].unique()

array(['Welsh_springer_spaniel', 'redbone', 'German_shepherd',
       'Rhodesian_ridgeback', 'miniature_pinscher',
       'Bernese_mountain_dog', 'box_turtle', 'chow', 'shopping_cart',
       'miniature_poodle', 'golden_retriever', 'Gordon_setter',
       'Walker_hound', 'pug', 'bloodhound', 'Lhasa', 'English_setter',
       'hen', 'desktop_computer', 'Italian_greyhound', 'Maltese_dog',
       'three-toed_sloth', 'ox', 'malamute', 'guinea_pig',
       'soft-coated_wheaten_terrier', 'Chihuahua',
       'black-and-tan_coonhound', 'coho', 'toy_terrier',
       'Blenheim_spaniel', 'Pembroke', 'llama',
       'Chesapeake_Bay_retriever', 'curly-coated_retriever', 'dalmatian',
       'Ibizan_hound', 'Border_collie', 'Labrador_retriever', 'seat_belt',
       'snail', 'miniature_schnauzer', 'Airedale', 'triceratops', 'swab',
       'hay', 'hyena', 'jigsaw_puzzle', 'West_Highland_white_terrier',
       'toy_poodle', 'giant_schnauzer', 'vizsla', 'vacuum', 'Rottweiler',
       'Siberian_husky', 't

In [21]:
# Distinct values of p2, to check capitalisation and separator consistency.
df_image_predictions_old['p2'].unique()

array(['collie', 'miniature_pinscher', 'malinois', 'redbone',
       'Rottweiler', 'English_springer', 'mud_turtle', 'Tibetan_mastiff',
       'shopping_basket', 'komondor', 'Yorkshire_terrier',
       'English_foxhound', 'bull_mastiff', 'German_shepherd', 'Shih-Tzu',
       'Newfoundland', 'cock', 'desk', 'toy_terrier', 'toy_poodle',
       'otter', 'Chesapeake_Bay_retriever', 'Siberian_husky', 'skunk',
       'Afghan_hound', 'bloodhound', 'barracouta', 'papillon',
       'cocker_spaniel', 'chow', 'Irish_terrier', 'chain_saw', 'beagle',
       'giant_schnauzer', 'Labrador_retriever', 'Pembroke', 'Chihuahua',
       'Weimaraner', 'slug', 'Brittany_spaniel', 'standard_schnauzer',
       'teddy', 'armadillo', 'African_hunting_dog', 'vizsla', 'doormat',
       'pug', 'Italian_greyhound', 'Samoyed', 'Pomeranian',
       'miniature_poodle', 'Lakeland_terrier', 'Irish_setter', 'swab',
       'malamute', 'bath_towel', 'Border_collie', 'Leonberg', 'drake',
       'French_bulldog', 'ice_bear', 

In [22]:
# Distinct values of p3, to check capitalisation and separator consistency.
df_image_predictions_old['p3'].unique()

array(['Shetland_sheepdog', 'Rhodesian_ridgeback', 'bloodhound',
       'miniature_pinscher', 'Doberman', 'Greater_Swiss_Mountain_dog',
       'terrapin', 'fur_coat', 'golden_retriever',
       'soft-coated_wheaten_terrier', 'Labrador_retriever', 'Pekinese',
       'Ibizan_hound', 'French_bulldog', 'malinois', 'Dandie_Dinmont',
       'borzoi', 'partridge', 'bookcase', 'basenji', 'miniature_poodle',
       'great_grey_owl', 'groenendael', 'Eskimo_dog', 'hamster', 'briard',
       'papillon', 'flat-coated_retriever', 'gar', 'Chihuahua',
       'Shih-Tzu', 'Pomeranian', 'dingo', 'power_drill', 'Saluki',
       'Great_Pyrenees', 'West_Highland_white_terrier', 'collie',
       'toy_poodle', 'vizsla', 'acorn', 'giant_schnauzer', 'teddy',
       'common_iguana', 'wig', 'water_buffalo', 'coyote', 'seat_belt',
       'kelpie', 'space_heater', 'Brabancon_griffon', 'standard_poodle',
       'beagle', 'Irish_water_spaniel', 'bluetick', 'Weimaraner',
       'Chesapeake_Bay_retriever', 'toilet_tiss

In [23]:
# Column names, dtypes and non-null counts of the tweet JSON data.
df_tweet_json_old.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2331 entries, 0 to 2330
Data columns (total 32 columns):
 #   Column                         Non-Null Count  Dtype              
---  ------                         --------------  -----              
 0   created_at                     2331 non-null   datetime64[ns, UTC]
 1   id                             2331 non-null   int64              
 2   id_str                         2331 non-null   int64              
 3   full_text                      2331 non-null   object             
 4   truncated                      2331 non-null   bool               
 5   display_text_range             2331 non-null   object             
 6   entities                       2331 non-null   object             
 7   extended_entities              2059 non-null   object             
 8   source                         2331 non-null   object             
 9   in_reply_to_status_id          77 non-null     float64            
 10  in_reply_to_status_id_st

In [24]:
# Number of distinct tweet ids whose retweeted flag is True.
df_tweet_json_old.loc[df_tweet_json_old['retweeted'].eq(True), 'id'].nunique()

0

In [25]:
# Number of distinct tweet ids with a positive retweet_count.
df_tweet_json_old.loc[df_tweet_json_old['retweet_count'].gt(0), 'id'].nunique()

2331

In [26]:
# Number of distinct tweet ids whose favorited flag is True.
df_tweet_json_old.loc[df_tweet_json_old['favorited'].eq(True), 'id'].nunique()

0

In [27]:
# Number of distinct tweet ids with a positive favorite_count.
df_tweet_json_old.loc[df_tweet_json_old['favorite_count'].gt(0), 'id'].nunique()

2168

> **Twitter Reference**:
>
> * Tweet objects: https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object

> **Key Points:**
> * Only original ratings (no retweets) that have images are wanted
> * The tweets beyond August 1st, 2017 do not need to be gathered.
> * Cleaning includes merging individual pieces of data according to the rules of tidy data
> * Rating numerators that are greater than their denominators do not need to be cleaned

> **Quality Issues:**
> * IDs are found to have wrong types
> * Data contains retweets
> * Variable **retweeted** indicates whether this Tweet has been Retweeted by the authenticating user. Variable **retweet_count** indicates the number of times this Tweet has been retweeted. The maximum of retweet_count reaches 77154, but the only value that occurs in retweeted is False
> * The above issue is also found between **favorited** and **favorite_count**
> * Misleading column names in df_image_predictions_old: p1, p1_conf, p1_dog, p2, p2_conf, p2_dog, p3, p3_conf, p3_dog
> * Column name **text** is not descriptive
> * Issues with columns p1, p2, p3
>  * Not all words start with capitalized characters
>  * Some use "_", and some use "-"
> * **timestamp** is not of type DateTime. Same as **retweeted_status_timestamp**
> * The **name** column in **df_wrd_twitter_old** contains words other than dog names
> * The **rating_denominator** contains 0, but its corresponding **rating_numerator** is 960

# III. Cleaning Data 
---

# IV. Analyzing Data 
---

# V. Visualizing Data 
---