In [1]:
import pandas as pd
import numpy as np
import requests
import tweepy
import json
import os
import re
import logging

from tqdm.notebook import tqdm

In [2]:
# Logging configuration.
# Records that fail the rating sanity checks are written to a log file instead
# of flooding the notebook output.

logs_dir = "./logs"

# logging's file handler does not create missing parent directories, so make
# sure the log directory exists before basicConfig opens the log file.
os.makedirs(logs_dir, exist_ok=True)

log_filepath = os.path.join(logs_dir, 'df_wrd_error.log')

# filemode='w' truncates the log on every run so it only reflects the latest execution.
logging.basicConfig(filename=log_filepath, filemode='w', format='%(levelname)s - %(message)s')

<a id='toc'></a>
# Table of Contents
---

* [I. Gathering](#gathering)
* [II. Assessing](#assessing)
* [III. Cleaning](#cleaning)
* [IV. Analyzing](#analyzing)
* [V. Visualizing](#visualizing)

<a id='gathering'></a>
# I. Gathering Data 
---

In [3]:
data_dir = "./data"

### a. WeRateDogs Twitter Archive

In [4]:
wrd_data_filepath = 'twitter-archive-enhanced.csv'

df_wrd_twitter_old = pd.read_csv(os.path.join(data_dir, wrd_data_filepath))

### b. Tweet Image Predictions

In [5]:
image_predictions_filepath = os.path.join(data_dir, 'image-predictions.tsv')

In [6]:
# Download image-predictions.tsv only when it is not already cached locally,
# so the notebook runs top to bottom on a fresh checkout without hand-editing
# this cell, and re-runs do not hit the network again.
if not os.path.exists(image_predictions_filepath):
    url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'

    response = requests.get(url, allow_redirects=True)
    # Fail loudly on an HTTP error instead of saving an error page as the dataset.
    response.raise_for_status()

    # Make sure the target directory exists before writing the file.
    os.makedirs(os.path.dirname(image_predictions_filepath) or ".", exist_ok=True)

    with open(image_predictions_filepath, 'wb') as f:
        f.write(response.content)


'\nUncomment the following code to download data if file(image-predictions.tsv) does not exist\n'

In [7]:
df_image_predictions_old = pd.read_csv(image_predictions_filepath, sep="\t")

### c. Any Additional Data

**Via Twitter API**

In [8]:
json_filepath = os.path.join(data_dir, "tweet_json.txt")

In [9]:
'''
Uncomment the following code to download data if file(tweet_json.txt) does not exist
'''

# What the commented-out code below does once enabled:
#   1. reads the API keys from 'twitter-credential.json' (kept out of the notebook),
#   2. requests every tweet id found in df_wrd_twitter_old through tweepy,
#   3. writes one JSON document per line to json_filepath.
# NOTE(review): the bare `except:` skips ANY failure silently (not only deleted
# tweets), which is presumably why fewer statuses are loaded later than there
# are ids in the archive -- consider catching tweepy's own error type and
# logging the skipped ids.
# NOTE(review): `wait_on_rate_limit_notify` looks specific to older tweepy
# releases -- confirm against the installed version before re-enabling.

# with open('twitter-credential.json') as f:
#     credentials = json.load(f)

# auth = tweepy.OAuthHandler(credentials['consumer_key'], credentials['consumer_secret'])

# auth.set_access_token(credentials['access_token'], credentials['access_token_secret'])

# api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# tweet_ids = df_wrd_twitter_old['tweet_id']

# statuses = []

# for tweet_id in tqdm(tweet_ids):

#     try:

#         status = api.get_status(tweet_id, tweet_mode='extended')

#         statuses.append(status._json)

#     except:

#         continue

# with open(json_filepath, 'w') as outfile:
#     for status in statuses:
#         json.dump(status, outfile)
#         outfile.write("\n")

'\nUncomment the following code to download data if file(tweet_json.txt) does not exist\n'

In [10]:
df_tweet_json_old = pd.read_json(json_filepath, lines=True)

**Via Kaggle** [For all available dog breeds]

Reference: 
    [https://www.kaggle.com/c/dog-breed-identification/data](https://www.kaggle.com/c/dog-breed-identification/data)

In [11]:
df_dog_breeds_old = pd.read_csv(os.path.join(data_dir, "labels.csv"))

<a id='assessing'></a>
# II. Assessing Data
---

In [12]:
'''
Uncomment to view data
'''

# df_wrd_twitter_old

'\nUncomment to view data\n'

> **Summary**
> * name, doggo, floofer, pupper and puppo columns contain "None"
> * name contains "a" as a value. It may contain other nonsense names
> * tweet information and dog information in the same table
> * name (refers to dog name) is not very descriptive since tweet information and dog information are in the same table
> * text (refers to status update) is not very descriptive since tweet information and dog information are in the same table

In [13]:
'''
Uncomment to view data
'''

# df_image_predictions_old

'\nUncomment to view data\n'

> **Summary**
> * img_num, p1, p1_conf, p1_dog, etc. are not very descriptive
> * Values of p1, p2, p3 have formatting issues. Some start with uppercase characters, but others start with lowercase characters

In [14]:
'''
Uncomment to view data
'''

# df_tweet_json_old

'\nUncomment to view data\n'

> **Summary**
> * For each type of id, there are two columns holding the same piece of information; they differ only in their data types

In [15]:
'''
Uncomment to view data
'''

# df_dog_breeds_old

'\nUncomment to view data\n'

In [16]:
df_wrd_twitter_old.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

> **Summary**
> * All ids (tweet_id, in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id, retweeted_status_user_id)
>   have a numeric data type instead of string
> * timestamp is of string type
> * Even though some columns have null values, it is reasonable
> * doggo, floofer, pupper and puppo can be combined into a single column
> * Columns may be ignored: 
>   in_reply_to_status_id, in_reply_to_user_id, source, expanded_urls

In [17]:
df_image_predictions_old.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


> **Summary**
> * The data type of tweet_id is integer
> * No missing values in this table

In [18]:
df_tweet_json_old.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2331 entries, 0 to 2330
Data columns (total 32 columns):
 #   Column                         Non-Null Count  Dtype              
---  ------                         --------------  -----              
 0   created_at                     2331 non-null   datetime64[ns, UTC]
 1   id                             2331 non-null   int64              
 2   id_str                         2331 non-null   int64              
 3   full_text                      2331 non-null   object             
 4   truncated                      2331 non-null   bool               
 5   display_text_range             2331 non-null   object             
 6   entities                       2331 non-null   object             
 7   extended_entities              2059 non-null   object             
 8   source                         2331 non-null   object             
 9   in_reply_to_status_id          77 non-null     float64            
 10  in_reply_to_status_id_st

> **Summary**
> * Columns may be ignored: 
>   * created_at, truncated, in_reply_to_status_id, in_reply_to_status_id_str, in_reply_to_user_id
>   * in_reply_to_user_id_str, in_reply_to_screen_name, is_quote_status, possibly_sensitive
>   * possibly_sensitive_appealable, lang, quoted_status_id, quoted_status_id_str, quoted_status_permalink
>   * quoted_status, geo, coordinates, place
> * id is of numeric type

In [19]:
df_dog_breeds_old.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10222 entries, 0 to 10221
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10222 non-null  object
 1   breed   10222 non-null  object
dtypes: object(2)
memory usage: 159.8+ KB


#### ****************************** Further Investigation On Data  ******************************

In [20]:
df_wrd_twitter_old.duplicated(['tweet_id']).sum()

0

In [21]:
df_wrd_twitter_old.duplicated(['name']).sum()

1399

In [22]:
df_wrd_twitter_old['name'].value_counts().to_dict()

{'None': 745,
 'a': 55,
 'Charlie': 12,
 'Lucy': 11,
 'Cooper': 11,
 'Oliver': 11,
 'Tucker': 10,
 'Penny': 10,
 'Lola': 10,
 'Bo': 9,
 'Winston': 9,
 'Sadie': 8,
 'the': 8,
 'Bailey': 7,
 'Buddy': 7,
 'Toby': 7,
 'an': 7,
 'Daisy': 7,
 'Jax': 6,
 'Rusty': 6,
 'Jack': 6,
 'Scout': 6,
 'Oscar': 6,
 'Dave': 6,
 'Milo': 6,
 'Leo': 6,
 'Bella': 6,
 'Stanley': 6,
 'Koda': 6,
 'Alfie': 5,
 'Chester': 5,
 'Bentley': 5,
 'Sunny': 5,
 'Louis': 5,
 'Oakley': 5,
 'Gus': 5,
 'Finn': 5,
 'Sammy': 5,
 'Phil': 5,
 'very': 5,
 'George': 5,
 'Larry': 5,
 'Jerry': 4,
 'Clarence': 4,
 'one': 4,
 'Bruce': 4,
 'Duke': 4,
 'Reginald': 4,
 'Archie': 4,
 'Ruby': 4,
 'Bear': 4,
 'Gary': 4,
 'Luna': 4,
 'Jeffrey': 4,
 'Boomer': 4,
 'just': 4,
 'Beau': 4,
 'Scooter': 4,
 'Derek': 4,
 'Maximus': 4,
 'Gerald': 4,
 'Loki': 4,
 'quite': 4,
 'Clark': 4,
 'Dexter': 4,
 'Maggie': 4,
 'Moose': 4,
 'Winnie': 4,
 'Hank': 4,
 'Sophie': 4,
 'Sampson': 4,
 'Maddie': 4,
 'Walter': 4,
 'Riley': 4,
 'Reggie': 4,
 'Cassie': 4,
 

> **Summary**
> * There are invalid names other than 'None', e.g. a, an, the, one. All of them start with lowercase characters.
> * Name column has duplicated entries

In [23]:
# `p` finds a "<number>/10" rating in the tweet text; `p_invalid` finds three
# slash-separated numbers (date-like strings such as 11/15/15).
p = re.compile('[0-9]+/10')
p_invalid = re.compile('[0-9]+/[0-9]+/[0-9]+')

bad_count = none_count = good_count = 0

rating_rows = zip(
    df_wrd_twitter_old['text'].values,
    df_wrd_twitter_old['rating_denominator'].values,
    df_wrd_twitter_old['rating_numerator'].values,
)

for text, denominator, numerator in rating_rows:
    result = p.search(text)

    if result is not None:
        if denominator == 10:
            # The text holds an x/10 rating and the stored denominator agrees.
            good_count += 1
        else:
            # The text holds an x/10 rating but a different denominator was stored.
            message = "{} {} {} ************************ {}".format(result.group(), numerator, denominator, text)
            logging.error(message)
            logging.error("\n")
            bad_count += 1
    elif p_invalid.search(text) is None:
        # Neither an x/10 rating nor a date-like pattern appears in the text.
        none_count += 1
    else:
        # No x/10 rating, but a date-like pattern that was parsed as a rating.
        message = "{} {} ************************ {}".format(numerator, denominator, text)
        logging.error(message)
        logging.error("\n")


print("Number of true positive ratings: ", good_count)
print("Number of wrong ratings: ", bad_count)
print("Please check for logs for wrong ratings")


Number of true positive ratings:  2333
Number of wrong ratings:  7
Please check for logs for wrong ratings


In [24]:
# Texts that contain an x/10 rating while the stored denominator is not 10.
rows_with_other_denominator = df_wrd_twitter_old[df_wrd_twitter_old['rating_denominator'] != 10]

text_to_remove = [
    text
    for text in rows_with_other_denominator['text'].values
    if p.search(text) is not None
]

# Show every row whose text is among the flagged texts.
df_wrd_twitter_old.loc[
    df_wrd_twitter_old['text'].isin(text_to_remove),
    ['text', 'rating_denominator', 'rating_numerator'],
]

Unnamed: 0,text,rating_denominator,rating_numerator
313,@jonnysun @Lin_Manuel ok jomny I know you're e...,0,960
784,"RT @dog_rates: After so many requests, this is...",11,9
1068,"After so many requests, this is Bretagne. She ...",11,9
1165,Happy 4/20 from the squad! 13/10 for all https...,20,4
1202,This is Bluebert. He just saw that both #Final...,50,50
1662,This is Darrel. He just robbed a 7/11 and is i...,11,7
2335,This is an Albanian 3 1/2 legged Episcopalian...,2,1


> **Summary**
> * There are 7 rows in df_wrd_twitter_old with wrong denominators and numerators
> * There are 2 rows where a time or date is parsed as a rating: 1) 24/7 2) 11/15/15

In [25]:
df_wrd_twitter_old.isnull().sum()

tweet_id                         0
in_reply_to_status_id         2278
in_reply_to_user_id           2278
timestamp                        0
source                           0
text                             0
retweeted_status_id           2175
retweeted_status_user_id      2175
retweeted_status_timestamp    2175
expanded_urls                   59
rating_numerator                 0
rating_denominator               0
name                             0
doggo                            0
floofer                          0
pupper                           0
puppo                            0
dtype: int64

In [26]:
df_wrd_twitter_old['rating_denominator'].unique()

array([ 10,   0,  15,  70,   7,  11, 150, 170,  20,  50,  90,  80,  40,
       130, 110,  16, 120,   2])

In [27]:
df_image_predictions_old.duplicated().sum()

0

In [28]:
df_image_predictions_old.isnull().sum()

tweet_id    0
jpg_url     0
img_num     0
p1          0
p1_conf     0
p1_dog      0
p2          0
p2_conf     0
p2_dog      0
p3          0
p3_conf     0
p3_dog      0
dtype: int64

In [29]:
# df_tweet_json_old.duplicated().sum()

'''
Have TypeError after executing the above statement
- TypeError: unhashable type: 'list'
'''

"\nHave TypeError after executing the above statement\n- TypeError: unhashable type: 'list'\n"

In [30]:
df_tweet_json_old.isnull().sum()

created_at                          0
id                                  0
id_str                              0
full_text                           0
truncated                           0
display_text_range                  0
entities                            0
extended_entities                 272
source                              0
in_reply_to_status_id            2254
in_reply_to_status_id_str        2254
in_reply_to_user_id              2254
in_reply_to_user_id_str          2254
in_reply_to_screen_name          2254
user                                0
geo                              2331
coordinates                      2331
place                            2330
contributors                     2331
is_quote_status                     0
retweet_count                       0
favorite_count                      0
favorited                           0
retweeted                           0
possibly_sensitive                135
possibly_sensitive_appealable     135
lang        

In [31]:
df_dog_breeds_old.duplicated(['breed']).sum()

10102

In [32]:
df_dog_breeds_old.isnull().sum()

id       0
breed    0
dtype: int64

<a id='cleaning'></a>
# III. Cleaning Data 
---

In [33]:
df_wrd_twitter = df_wrd_twitter_old.copy()

In [34]:
df_image_predictions = df_image_predictions_old.copy()

In [35]:
df_tweet_json = df_tweet_json_old.copy()

In [36]:
df_dog_breeds = df_dog_breeds_old.copy()

---
---
---

> Filter out retweets by removing the ones with retweeted_status_id

In [37]:
# Rows with a non-null retweeted_status_id are the ones treated as retweets.
is_retweet = df_wrd_twitter['retweeted_status_id'].notna()
indices_with_retweets = df_wrd_twitter.loc[is_retweet].index

# Remove those rows from the working copy in place.
df_wrd_twitter.drop(index=indices_with_retweets, inplace=True)

In [38]:
df_wrd_twitter['retweeted_status_id'].unique()

array([nan])

> Remove rows with inaccurate ratings
> * Get all texts where denominator and numerator are wrong
> * Get all rows in the dataframe by matching texts
> * Remove these rows in the dataframe by indices

In [39]:
# Flag a text when it contains an x/10 rating but the stored denominator is not
# 10, or when it contains one of the two known non-rating fractions.
text_to_remove = [
    text
    for text, denominator in zip(
        df_wrd_twitter['text'].values,
        df_wrd_twitter['rating_denominator'].values)
    if (p.search(text) is not None and denominator != 10)
    or '24/7' in text
    or '11/15/15' in text
]

len(text_to_remove)

8

In [40]:
# Locate every row whose text was flagged, then drop those rows in place.
has_flagged_text = df_wrd_twitter['text'].isin(text_to_remove)
indices_with_wrong_ratings = df_wrd_twitter.loc[has_flagged_text].index

df_wrd_twitter.drop(index=indices_with_wrong_ratings, inplace=True)

In [41]:
# Re-run the same check on the cleaned frame; the resulting list should be empty.
def _is_wrong_rating(text, denominator):
    """Return True when the text has an x/10 rating with a stored denominator
    other than 10, or contains '24/7' or '11/15/15'."""
    if p.search(text) is not None and denominator != 10:
        return True
    return '24/7' in text or '11/15/15' in text


text_to_remove = [
    text
    for text, denominator in zip(
        df_wrd_twitter['text'].values,
        df_wrd_twitter['rating_denominator'].values)
    if _is_wrong_rating(text, denominator)
]

len(text_to_remove)

0

> 