In [2]:
import pandas as pd
import requests
import tweepy
import json
import os

------
<a id="indice"></a>

# Índice

1. [Coleta](#coleta)
    * [Twitter Archive](#coleta:twitter-archive)
    * [Image Prediction](#coleta:image-prediction)
    * [Twitter API](#coleta:twitter-api)
* [Avaliação](#avaliacao)

------
<a id="coleta"></a>

# Coleta

<a id="coleta:twitter-archive"></a>

## Twitter archive

In [3]:
df_twitter_arc = pd.read_csv('data/twitter-archive-enhanced.csv')
df_twitter_arc.sample(2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
1606,685667379192414208,,,2016-01-09 03:40:16 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Marty. He has no idea what happened he...,,,,https://twitter.com/dog_rates/status/685667379...,9,10,Marty,,,pupper,
1631,684481074559381504,,,2016-01-05 21:06:19 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Pippa. She's an Elfin High Feta. Compact ...,,,,https://twitter.com/dog_rates/status/684481074...,10,10,Pippa,,,,


<a id="coleta:image-prediction"></a>

## Image prediction

In [4]:
df_prediction = None

r = requests.get('https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv')

if r.status_code is 200:
    df_prediction = pd.read_csv(pd.compat.StringIO(r.text), sep='\t')    
else:    
    print('ERROR: Image prediction request returned {status_code} status code.'.format(status_code = r.status_code))

In [5]:
df_prediction.sample(2)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
726,686358356425093120,https://pbs.twimg.com/media/CYZvRttWYAE_RXc.jpg,1,pug,0.985237,True,bull_mastiff,0.008841,True,boxer,0.002321,True
2042,885528943205470208,https://pbs.twimg.com/media/DEoH3yvXgAAzQtS.jpg,1,pug,0.369275,True,Labrador_retriever,0.265835,True,kuvasz,0.134697,True


<a id="coleta:twitter-api"></a>

## Twitter API

In [6]:
# Twitter APP Config
with open('twitter_config.json', 'r', encoding='utf-8') as file:
    app_config = json.load(file)

In [7]:
# Twitter API settings
api_key = app_config['api_key']
api_secret = app_config['api_secret']
access_token = app_config['access_token']
access_secret = app_config['access_secret']

In [8]:
# Connect to Twitter API
auth = tweepy.OAuthHandler(api_key, api_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [19]:
json_path = 'data/tweet_json.txt'
error_log = 'data/tweet_error.log'

# Save tweets from Twitter API
if(not os.path.isfile(json_path)):
    for tweet_id in df_twitter_arc.tweet_id:
        try:
            status = api.get_status(tweet_id)

            with open(json_path, 'a', newline='\n') as file:
                file.write(f'{json.dumps(status._json)}\n')

        except Exception as err:
            with open(error_log, 'a', newline='\n') as log:
                log.write(f'{str(tweet_id)}: {str(err.args[0])}\n')

            print(f'{str(tweet_id)}: {str(err.args[0])}')
else:
    print('Dados já salvos em disco, não serão executadas novas requisições à API.')
        

Dados já salvos em disco, não serão executadas novas requisições à API.


In [None]:
# Convert JSON data into a Dataframe
with open(json_path) as file:
    pd.read_json

------
<a id="avaliacao"></a>
# Avaliação


### Avaliação: Twitter data archive

In [53]:
 df_twitter_arc.sample(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
743,780459368902959104,,,2016-09-26 17:29:48 +0000,"<a href=""http://twitter.com/download/iphone"" r...","This is Bear. Don't worry, he's not a real bea...",,,,https://twitter.com/dog_rates/status/780459368...,11,10,Bear,,,,
447,819015331746349057,,,2017-01-11 02:57:26 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is Sunny. She was also a v...,8.190064e+17,4196984000.0,2017-01-11 02:21:57 +0000,https://twitter.com/dog_rates/status/819006400...,14,10,Sunny,doggo,,,
1688,681523177663676416,,,2015-12-28 17:12:42 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Carly. She's actually 2 dogs fused tog...,,,,https://twitter.com/dog_rates/status/681523177...,12,10,Carly,,,,
1849,675781562965868544,,,2015-12-12 20:57:34 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Say hello to Buddah. He was Waldo for Hallowee...,,,,https://twitter.com/dog_rates/status/675781562...,11,10,Buddah,,,,
1181,719332531645071360,,,2016-04-11 01:13:34 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Pippa. She managed to start the car bu...,,,,https://twitter.com/dog_rates/status/719332531...,11,10,Pippa,,,,


In [11]:
df_twitter_arc.describe()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,retweeted_status_id,retweeted_status_user_id,rating_numerator,rating_denominator
count,2356.0,78.0,78.0,181.0,181.0,2356.0,2356.0
mean,7.427716e+17,7.455079e+17,2.014171e+16,7.7204e+17,1.241698e+16,13.126486,10.455433
std,6.856705e+16,7.582492e+16,1.252797e+17,6.236928e+16,9.599254e+16,45.876648,6.745237
min,6.660209e+17,6.658147e+17,11856340.0,6.661041e+17,783214.0,0.0,0.0
25%,6.783989e+17,6.757419e+17,308637400.0,7.186315e+17,4196984000.0,10.0,10.0
50%,7.196279e+17,7.038708e+17,4196984000.0,7.804657e+17,4196984000.0,11.0,10.0
75%,7.993373e+17,8.257804e+17,4196984000.0,8.203146e+17,4196984000.0,12.0,10.0
max,8.924206e+17,8.862664e+17,8.405479e+17,8.87474e+17,7.874618e+17,1776.0,170.0


<a id="twitter-arc:incompleto"></a>
Dados incompletos, contém apenas 2356 registros dos 5000 anunciados

In [12]:
df_twitter_arc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [13]:
df_twitter_arc.tweet_id.duplicated().value_counts()

False    2356
Name: tweet_id, dtype: int64

In [39]:
df_twitter_arc.text.duplicated().value_counts()

False    2356
Name: text, dtype: int64

<a id="twitter-arc:source"></a>
Variável `source` com valores repetidos e sem valor para a unidade de observação

In [66]:
df_twitter_arc.source.sample(5)

361     <a href="http://twitter.com/download/iphone" r...
913     <a href="http://twitter.com/download/iphone" r...
1024    <a href="http://twitter.com/download/iphone" r...
34      <a href="http://twitter.com/download/iphone" r...
2222    <a href="http://twitter.com/download/iphone" r...
Name: source, dtype: object

In [14]:
df_twitter_arc.loc[:,'doggo':].nunique()

doggo      2
floofer    2
pupper     2
puppo      2
dtype: int64

In [23]:
df_twitter_arc[df_twitter_arc.duplicated()]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo


In [28]:
df_twitter_arc[df_twitter_arc.loc[:, ['in_reply_to_status_id', 'in_reply_to_user_id']].notnull()].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      0 non-null float64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     0 non-null object
source                        0 non-null object
text                          0 non-null object
retweeted_status_id           0 non-null float64
retweeted_status_user_id      0 non-null float64
retweeted_status_timestamp    0 non-null object
expanded_urls                 0 non-null object
rating_numerator              0 non-null float64
rating_denominator            0 non-null float64
name                          0 non-null object
doggo                         0 non-null object
floofer                       0 non-null object
pupper                        0 non-null object
puppo                         0 non-null object
dtypes: float64(7), object(10)
memory usage: 313.0+ KB


In [37]:
df_twitter_arc.loc[:, ['in_reply_to_status_id', 'in_reply_to_user_id']].nunique()

in_reply_to_status_id    77
in_reply_to_user_id      31
dtype: int64

------

### Avaliação: Image prediction

In [16]:
df_prediction.sample(10)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
282,671134062904504320,https://pbs.twimg.com/media/CVBY3e7XIAAAE4Y.jpg,1,Shih-Tzu,0.18038,True,golden_retriever,0.180194,True,Labrador_retriever,0.173656,True
1315,754874841593970688,https://pbs.twimg.com/media/CWza7kpWcAAdYLc.jpg,1,pug,0.272205,True,bull_mastiff,0.25153,True,bath_towel,0.116806,False
2005,877556246731214848,https://pbs.twimg.com/media/DC20wEcW0AAf59m.jpg,1,basset,0.995368,True,Welsh_springer_spaniel,0.001936,True,bathtub,0.000468,False
1139,729463711119904772,https://pbs.twimg.com/media/Ch-TXpFXAAAwPGf.jpg,1,German_shepherd,0.829307,True,Doberman,0.0225,True,basenji,0.02119,True
1295,751937170840121344,https://pbs.twimg.com/media/Cm9q2d3XEAAqO2m.jpg,1,Lakeland_terrier,0.424168,True,teddy,0.260562,False,golden_retriever,0.127432,True
1020,710153181850935296,https://pbs.twimg.com/media/Cdr4jO2UAAAIo6W.jpg,2,cowboy_hat,0.979053,False,sombrero,0.010682,False,cocker_spaniel,0.002713,True
1305,753375668877008896,https://pbs.twimg.com/media/CnSHLFeWgAAwV-I.jpg,1,bluetick,0.360071,True,crutch,0.134816,False,tripod,0.098207,False
672,683142553609318400,https://pbs.twimg.com/media/CXsChyjW8AQJ16C.jpg,1,Leonberg,0.605851,True,chow,0.18347,True,German_shepherd,0.079662,True
284,671141549288370177,https://pbs.twimg.com/media/CVBfrU9WUAApDeV.jpg,1,guinea_pig,0.387728,False,wood_rabbit,0.171681,False,borzoi,0.075358,True
60,667138269671505920,https://pbs.twimg.com/media/CUImtzEVAAAZNJo.jpg,1,West_Highland_white_terrier,0.747713,True,Samoyed,0.243629,True,toy_poodle,0.001804,True


In [64]:
df_prediction.describe()

Unnamed: 0,tweet_id,img_num,p1_conf,p2_conf,p3_conf
count,2075.0,2075.0,2075.0,2075.0,2075.0
mean,7.384514e+17,1.203855,0.594548,0.1345886,0.06032417
std,6.785203e+16,0.561875,0.271174,0.1006657,0.05090593
min,6.660209e+17,1.0,0.044333,1.0113e-08,1.74017e-10
25%,6.764835e+17,1.0,0.364412,0.05388625,0.0162224
50%,7.119988e+17,1.0,0.58823,0.118181,0.0494438
75%,7.932034e+17,1.0,0.843855,0.1955655,0.09180755
max,8.924206e+17,4.0,1.0,0.488014,0.273419


In [15]:
df_prediction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [20]:
df_prediction.tweet_id.nunique()

2075

In [63]:
df_prediction.jpg_url.duplicated().value_counts()

False    2009
True       66
Name: jpg_url, dtype: int64

In [55]:
df_prediction[df_prediction.tweet_id.duplicated()].info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 12 columns):
tweet_id    0 non-null int64
jpg_url     0 non-null object
img_num     0 non-null int64
p1          0 non-null object
p1_conf     0 non-null float64
p1_dog      0 non-null bool
p2          0 non-null object
p2_conf     0 non-null float64
p2_dog      0 non-null bool
p3          0 non-null object
p3_conf     0 non-null float64
p3_dog      0 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 0.0+ bytes


In [18]:
df_prediction.loc[:, ['p1', 'p2', 'p3']].nunique()

p1    378
p2    405
p3    408
dtype: int64

------

### Notas de avaliação
#### Qualidade
##### `df_twitter_arc` - Twitter archive
* [Base incompleta, apenas **2356** registros de **5000** foram disponibilizados](#twitter-arc:incompleto)
* Coluna `source` apresenta tags HTML
* Coluna `source` com valores repetidos e sem valor para a unidade de observação
* Há nomes não preenchidos
* Coluna `expanded_urls` com linhas não preenchidas


##### `df_prediction` - Image prediction
* Previsões [`p1`, `p2`, `p3`] com nomes não padronizados - Nomes capitalizados, espaços substituídos por '_'
* Previsões repetidas sobre a mesma imagem


#### Organização
##### `df_twitter_arc` - Twitter archive
* As colunas `in_reply_to_status_id`, `in_reply_to_user_id`, `retweet_status_id`, `retweet_status_user_id` estão em `float64` e deveriam ser em `int64`
* As colunas `timestamp` e `retweet_status_timestamp` estão em `string` e deveriam ser do tipo `datetime`
* Uma variável em três colunas. As colunas `doggo`, `floofer`, `pupper` e `puppo` deveriam ser apenas `category`.


* -- Uma variável em várias colunas em `p1`, `p2`, `p3` e respectivas variáveis relacionadas, `p#_conf`, `p#_dog`.



# Limpeza

# Armazenamento

# Relatórios

* Data wrangling efforts
* Analyses and visualizations