In [1]:
import pandas as pd
import requests
import tweepy
import json
import os

------
<a id="indice"></a>

# Índice

1. **[Coleta](#coleta)**
    * [Twitter Archive](#coleta:twitter-archive)
    * [Image Prediction](#coleta:image-prediction)
    * [Twitter API](#coleta:twitter-api)
    
* **[Avaliação](#avaliacao)**
    * [Twitter Archive](#avaliacao:twitter-archive)
    * [Image Prediction](#avaliacao:image-prediction)
    * [Twitter API](#avaliacao:twitter-api)


------
<a id="coleta"></a>

# Coleta

<a id="coleta:twitter-archive"></a>

## Coleta: Twitter archive

In [2]:
df_twitter_arc = pd.read_csv('data/twitter-archive-enhanced.csv')
df_twitter_arc.sample(2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
1349,704134088924532736,,,2016-02-29 02:40:23 +0000,"<a href=""http://vine.co"" rel=""nofollow"">Vine -...",This sneezy pupper is just adorable af. 12/10 ...,,,,https://vine.co/v/igW2OEwu9vg,12,10,,,,pupper,
619,796149749086875649,,,2016-11-09 00:37:46 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Ruby. She just turned on the news. Off...,,,,https://twitter.com/dog_rates/status/796149749...,11,10,Ruby,,,,


<a id="coleta:image-prediction"></a>

## Coleta: Image prediction

In [3]:
df_prediction = None

r = requests.get('https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv')

if r.status_code is 200:
    df_prediction = pd.read_csv(pd.compat.StringIO(r.text), sep='\t')    
else:    
    print('ERROR: Image prediction request returned {status_code} status code.'.format(status_code = r.status_code))

In [4]:
df_prediction.sample(2)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
682,683834909291606017,https://pbs.twimg.com/ext_tw_video_thumb/68383...,1,Maltese_dog,0.738449,True,toy_poodle,0.102992,True,Samoyed,0.023247,True
409,673715861853720576,https://pbs.twimg.com/media/CVmE_fAWIAAlDhU.jpg,1,suit,0.404115,False,bow_tie,0.294683,False,Windsor_tie,0.132701,False


<a id="coleta:twitter-api"></a>

## Coleta: Twitter API

In [5]:
# Twitter APP Config
with open('twitter_config.json', 'r', encoding='utf-8') as file:
    app_config = json.load(file)

In [6]:
# Twitter API settings
api_key = app_config['api_key']
api_secret = app_config['api_secret']
access_token = app_config['access_token']
access_secret = app_config['access_secret']

In [7]:
# Connect to Twitter API
auth = tweepy.OAuthHandler(api_key, api_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [8]:
json_path = 'data/tweet_json.txt'
error_log = 'data/tweet_error.log'

# Save tweets from Twitter API
if(not os.path.isfile(json_path)):
    for tweet_id in df_twitter_arc.tweet_id:
        try:
            status = api.get_status(tweet_id)

            with open(json_path, 'a', newline='\n') as file:
                file.write(f'{json.dumps(status._json)}\n')

        except Exception as err:
            with open(error_log, 'a', newline='\n') as log:
                log.write(f'{str(tweet_id)}: {str(err.args[0])}\n')

            print(f'{str(tweet_id)}: {str(err.args[0])}')
else:
    print('Dados já salvos em disco, não serão executadas novas requisições à API.')
        

Dados já salvos em disco, não serão executadas novas requisições à API.


In [9]:
# Convert JSON data into a Dataframe
with open(json_path) as file:
    pd.read_json

------
<a id="avaliacao"></a>

# Avaliação


<a id="avaliacao:twitter-archive"></a>

## Avaliação: Twitter data archive

In [10]:
 df_twitter_arc.sample(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
509,812466873996607488,,,2016-12-24 01:16:12 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Mary. She's desperately trying to recr...,,,,https://twitter.com/dog_rates/status/812466873...,12,10,Mary,,,,
343,832040443403784192,,,2017-02-16 01:34:34 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is Klein. These pics were ...,7.699404e+17,4196984000.0,2016-08-28 16:51:16 +0000,https://twitter.com/dog_rates/status/769940425...,12,10,Klein,,,,
682,788552643979468800,,,2016-10-19 01:29:35 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: Say hello to mad pupper. You kn...,7.363926e+17,4196984000.0,2016-05-28 03:04:00 +0000,"https://vine.co/v/iEggaEOiLO3,https://vine.co/...",13,10,mad,,,pupper,
1700,680959110691590145,,,2015-12-27 03:51:18 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Ozzie. He was doing fine until he lost...,,,,https://twitter.com/dog_rates/status/680959110...,9,10,Ozzie,,,,
1557,688804835492233216,,,2016-01-17 19:27:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",When you stumble but recover quickly cause you...,,,,https://twitter.com/dog_rates/status/688804835...,12,10,,,,,


In [11]:
df_twitter_arc.describe()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,retweeted_status_id,retweeted_status_user_id,rating_numerator,rating_denominator
count,2356.0,78.0,78.0,181.0,181.0,2356.0,2356.0
mean,7.427716e+17,7.455079e+17,2.014171e+16,7.7204e+17,1.241698e+16,13.126486,10.455433
std,6.856705e+16,7.582492e+16,1.252797e+17,6.236928e+16,9.599254e+16,45.876648,6.745237
min,6.660209e+17,6.658147e+17,11856340.0,6.661041e+17,783214.0,0.0,0.0
25%,6.783989e+17,6.757419e+17,308637400.0,7.186315e+17,4196984000.0,10.0,10.0
50%,7.196279e+17,7.038708e+17,4196984000.0,7.804657e+17,4196984000.0,11.0,10.0
75%,7.993373e+17,8.257804e+17,4196984000.0,8.203146e+17,4196984000.0,12.0,10.0
max,8.924206e+17,8.862664e+17,8.405479e+17,8.87474e+17,7.874618e+17,1776.0,170.0


<a id="twitter-arc:info"></a>
* Dados incompletos, contém apenas 2356 registros dos 5000 anunciados
* `in_reply_to_status_id`, `in_reply_to_user_id`, `retweeted_status_id`, `retweeted_status_user_id`, `retweeted_status_timestamp` em `float`

In [33]:
df_twitter_arc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [13]:
df_twitter_arc.tweet_id.duplicated().value_counts()

False    2356
Name: tweet_id, dtype: int64

In [14]:
df_twitter_arc.text.duplicated().value_counts()

False    2356
Name: text, dtype: int64

<a id="twitter-arc:source"></a>
Variável `source` com valores repetidos e sem valor para a unidade de observação

In [15]:
df_twitter_arc.source.sample(5)

876     <a href="http://twitter.com/download/iphone" r...
2184    <a href="http://twitter.com/download/iphone" r...
1105    <a href="http://twitter.com/download/iphone" r...
815     <a href="http://twitter.com/download/iphone" r...
726     <a href="http://twitter.com/download/iphone" r...
Name: source, dtype: object

<a id="twitter-arc:category"></id>

Uma variável em várias colunas

In [16]:
df_twitter_arc.loc[:,'doggo':].nunique()

doggo      2
floofer    2
pupper     2
puppo      2
dtype: int64

In [17]:
df_twitter_arc[df_twitter_arc.duplicated()]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo


In [18]:
df_twitter_arc[df_twitter_arc.loc[:, ['in_reply_to_status_id', 'in_reply_to_user_id']].notnull()].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      0 non-null float64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     0 non-null object
source                        0 non-null object
text                          0 non-null object
retweeted_status_id           0 non-null float64
retweeted_status_user_id      0 non-null float64
retweeted_status_timestamp    0 non-null object
expanded_urls                 0 non-null object
rating_numerator              0 non-null float64
rating_denominator            0 non-null float64
name                          0 non-null object
doggo                         0 non-null object
floofer                       0 non-null object
pupper                        0 non-null object
puppo                         0 non-null object
dtypes: float64(7), object(10)
memory usage: 313.0+ KB


In [19]:
df_twitter_arc.loc[:, ['in_reply_to_status_id', 'in_reply_to_user_id']].nunique()

in_reply_to_status_id    77
in_reply_to_user_id      31
dtype: int64

<a id="twitter-arc:expanded_urls"></a>

Registros de `expanded_urls` com URLs inválidas, não existentes

In [37]:
# URLs sem conteúdo
df_twitter_arc.expanded_urls.sample(10)

1402    https://twitter.com/dog_rates/status/699423671...
1543    https://twitter.com/dog_rates/status/689557536...
266     https://twitter.com/dog_rates/status/817423860...
1787    https://twitter.com/dog_rates/status/677565715...
413     https://twitter.com/dog_rates/status/822872901...
399     https://twitter.com/dog_rates/status/795076730...
2064    https://twitter.com/dog_rates/status/671154572...
516     https://www.gofundme.com/sams-smile,https://tw...
418     https://twitter.com/dog_rates/status/822462944...
1583    https://twitter.com/dog_rates/status/687102708...
Name: expanded_urls, dtype: object

In [38]:
df_twitter_arc.expanded_urls.isnull().value_counts()

False    2297
True       59
Name: expanded_urls, dtype: int64

<a id="twitter-arc:name"></id>

Nomes não preenchidos com valor `None`

In [39]:
df_twitter_arc.name.isnull().value_counts()

False    2356
Name: name, dtype: int64

In [32]:
df_twitter_arc[df_twitter_arc.name == 'None']

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
5,891087950875897856,,,2017-07-29 00:08:17 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a majestic great white breaching ...,,,,https://twitter.com/dog_rates/status/891087950...,13,10,,,,,
7,890729181411237888,,,2017-07-28 00:22:40 +0000,"<a href=""http://twitter.com/download/iphone"" r...",When you watch your owner call another dog a g...,,,,https://twitter.com/dog_rates/status/890729181...,13,10,,,,,
12,889665388333682689,,,2017-07-25 01:55:32 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here's a puppo that seems to be on the fence a...,,,,https://twitter.com/dog_rates/status/889665388...,13,10,,,,,puppo
24,887343217045368832,,,2017-07-18 16:08:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",You may not have known you needed to see this ...,,,,https://twitter.com/dog_rates/status/887343217...,13,10,,,,,
25,887101392804085760,,,2017-07-18 00:07:08 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This... is a Jubilant Antarctic House Bear. We...,,,,https://twitter.com/dog_rates/status/887101392...,12,10,,,,,
30,886267009285017600,8.862664e+17,2.281182e+09,2017-07-15 16:51:35 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@NonWhiteHat @MayhewMayhem omg hello tanner yo...,,,,,12,10,,,,,
32,886054160059072513,,,2017-07-15 02:45:48 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @Athletics: 12/10 #BATP https://t.co/WxwJmv...,8.860537e+17,19607400.0,2017-07-15 02:44:07 +0000,https://twitter.com/dog_rates/status/886053434...,12,10,,,,,
35,885518971528720385,,,2017-07-13 15:19:09 +0000,"<a href=""http://twitter.com/download/iphone"" r...",I have a new hero and his name is Howard. 14/1...,,,,https://twitter.com/4bonds2carbon/status/88551...,14,10,,,,,
37,885167619883638784,,,2017-07-12 16:03:00 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a corgi undercover as a malamute....,,,,https://twitter.com/dog_rates/status/885167619...,13,10,,,,,
41,884441805382717440,,,2017-07-10 15:58:53 +0000,"<a href=""http://twitter.com/download/iphone"" r...","I present to you, Pup in Hat. Pup in Hat is gr...",,,,https://twitter.com/dog_rates/status/884441805...,14,10,,,,,


------
<a id="avaliacao:image-prediction"></a>

## Avaliação: Image prediction

<a id="image-prediction:p-values"></id>

Valores não padronizados para as variáveis `p1`, `p2` e `p3`

In [20]:
df_prediction.sample(10)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
1589,798628517273620480,https://pbs.twimg.com/media/CUN4Or5UAAAa5K4.jpg,1,beagle,0.636169,True,Labrador_retriever,0.119256,True,golden_retriever,0.082549,True
1234,746369468511756288,https://pbs.twimg.com/media/ClujESVXEAA4uH8.jpg,1,German_shepherd,0.622957,True,malinois,0.338884,True,wallaby,0.024161,False
1835,837012587749474308,https://pbs.twimg.com/media/C52pYJXWgAA2BEf.jpg,1,toilet_tissue,0.186387,False,cowboy_hat,0.158555,False,sombrero,0.14947,False
724,686034024800862208,https://pbs.twimg.com/media/CYVIToGWQAAEZ_y.jpg,1,Great_Dane,0.23692,True,Irish_wolfhound,0.117608,True,Greater_Swiss_Mountain_dog,0.1039,True
1884,847606175596138505,https://pbs.twimg.com/media/C8NNUDBUMAE0XxJ.jpg,1,Cardigan,0.413688,True,Boston_bull,0.381836,True,doormat,0.065868,False
946,704499785726889984,https://pbs.twimg.com/media/Ccbi0UGWoAA4fwg.jpg,1,Chihuahua,0.376541,True,Siamese_cat,0.098057,False,Labrador_retriever,0.085211,True
1950,863079547188785154,https://pbs.twimg.com/media/C_pGRInUwAAmTY_.jpg,1,Lakeland_terrier,0.275242,True,Airedale,0.190569,True,teddy,0.102595,False
384,673343217010679808,https://pbs.twimg.com/media/CVgyFSyU4AA9p1e.jpg,1,Chihuahua,0.541408,True,Italian_greyhound,0.156891,True,miniature_pinscher,0.069556,True
1539,790946055508652032,https://pbs.twimg.com/media/CvoBPWRWgAA4het.jpg,1,dishwasher,0.700466,False,golden_retriever,0.245773,True,chow,0.039012,True
1247,747600769478692864,https://pbs.twimg.com/media/CmAC7ehXEAAqSuW.jpg,1,Chesapeake_Bay_retriever,0.804363,True,Weimaraner,0.054431,True,Labrador_retriever,0.043268,True


In [21]:
df_prediction.describe()

Unnamed: 0,tweet_id,img_num,p1_conf,p2_conf,p3_conf
count,2075.0,2075.0,2075.0,2075.0,2075.0
mean,7.384514e+17,1.203855,0.594548,0.1345886,0.06032417
std,6.785203e+16,0.561875,0.271174,0.1006657,0.05090593
min,6.660209e+17,1.0,0.044333,1.0113e-08,1.74017e-10
25%,6.764835e+17,1.0,0.364412,0.05388625,0.0162224
50%,7.119988e+17,1.0,0.58823,0.118181,0.0494438
75%,7.932034e+17,1.0,0.843855,0.1955655,0.09180755
max,8.924206e+17,4.0,1.0,0.488014,0.273419


In [22]:
df_prediction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [23]:
df_prediction.tweet_id.nunique()

2075

<a id="image-prediction:duplicated-urls"></a>

Imagens duplicadadas

In [24]:
df_prediction.jpg_url.duplicated().value_counts()

False    2009
True       66
Name: jpg_url, dtype: int64

In [25]:
df_prediction[df_prediction.tweet_id.duplicated()].info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 12 columns):
tweet_id    0 non-null int64
jpg_url     0 non-null object
img_num     0 non-null int64
p1          0 non-null object
p1_conf     0 non-null float64
p1_dog      0 non-null bool
p2          0 non-null object
p2_conf     0 non-null float64
p2_dog      0 non-null bool
p3          0 non-null object
p3_conf     0 non-null float64
p3_dog      0 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 0.0+ bytes


In [26]:
df_prediction.loc[:, ['p1', 'p2', 'p3']].nunique()

p1    378
p2    405
p3    408
dtype: int64

------
<a id="avaliacao:twitter-api"></a>

## Avaliação Twitter API Requests

------
<a id="assses-notes"></a>

### Notas de avaliação
#### Qualidade
##### `df_twitter_arc` - Twitter archive
* [Base incompleta, apenas **2356** registros de **5000** foram disponibilizados](#twitter-arc:info)
* [Coluna `source` apresenta tags HTML](#twitter-arc:source)
* [Coluna `source` não acrescenta valor para a unidade de observação](#twitter-arc:source)
* [Nomes (`name`) não preenchidos com valor literal `'None'`](#twitter-arc:name)
* [Coluna `expanded_urls` com linhas não preenchidas](#twitter-arc:expanded_urls)
* [Endereços inválidos em `expanded_urls`](#twitter-arc:expanded_urls)


##### `df_prediction` - Image prediction
* [Previsões [`p1`, `p2`, `p3`] com nomes não padronizados: Nomes capitalizados, espaços substituídos por `'_'`](#image-prediction:p-values)
* [Previsões repetidas sobre a mesma imagem](#image-prediction:duplicated-urls)


#### Organização
##### `df_twitter_arc` - Twitter archive
* [As colunas `in_reply_to_status_id`, `in_reply_to_user_id`, `retweet_status_id`, `retweet_status_user_id` estão em `float64` e deveriam ser em `int64`](#twitter-arc:info)
* [As colunas `timestamp` e `retweet_status_timestamp` estão em `string` e deveriam ser do tipo `datetime`](#twitter-arc:info)
* [Uma variável em várias colunas, `doggo`, `floofer`, `pupper` e `puppo`](#twitter-arc:category)


* -- Uma variável em várias colunas em `p1`, `p2`, `p3` e respectivas variáveis relacionadas, `p#_conf`, `p#_dog`.



# Limpeza

# Armazenamento

# Relatórios

* Data wrangling efforts
* Analyses and visualizations