In [1]:
import pandas as pd
import numpy as np
import requests
import tweepy
import json
import os
from IPython.display import clear_output

------
<a id="indice"></a>

# Índice

1. **[Coleta](#coleta)**
    * [Twitter Archive](#coleta:twitter-archive)
    * [Image Prediction](#coleta:image-prediction)
    * [Twitter API](#coleta:twitter-api)
    
* **[Avaliação](#avaliacao)**
    * [Twitter Archive](#avaliacao:twitter-archive)
    * [Image Prediction](#avaliacao:image-prediction)
    * [Twitter API](#avaliacao:twitter-api)


------
<a id="coleta"></a>

# Coleta

<a id="coleta:twitter-archive"></a>

## Coleta: Twitter archive

In [2]:
df_twitter_arc = pd.read_csv('data/twitter-archive-enhanced.csv')
df_twitter_arc.sample(2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
622,796080075804475393,,,2016-11-08 20:00:55 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Yogi. He's 98% floof. Snuggable af. 12...,,,,https://twitter.com/dog_rates/status/796080075...,12,10,Yogi,,,,
1200,716730379797970944,,,2016-04-03 20:53:33 +0000,"<a href=""http://twitter.com/download/iphone"" r...",There has clearly been a mistake. Pup did noth...,,,,https://twitter.com/chpsanfrancisco/status/716...,12,10,,,,,


<a id="coleta:image-prediction"></a>

## Coleta: Image prediction

In [3]:
df_prediction = None

r = requests.get('https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv')

if r.status_code is 200:
    df_prediction = pd.read_csv(pd.compat.StringIO(r.text), sep='\t')    
else:    
    print('ERROR: Image prediction request returned {status_code} status code.'.format(status_code = r.status_code))

In [4]:
df_prediction.sample(2)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
1159,733828123016450049,https://pbs.twimg.com/media/Ci8UxxcW0AYgHDh.jpg,2,beagle,0.472324,True,Walker_hound,0.121779,True,Saint_Bernard,0.11464,True
1428,772826264096874500,https://pbs.twimg.com/media/CrmhYYIXEAEcyYY.jpg,1,basset,0.915351,True,Walker_hound,0.072416,True,beagle,0.008229,True


<a id="coleta:twitter-api"></a>

## Coleta: Twitter API

In [5]:
# Twitter APP Config
with open('twitter_config.json', 'r', encoding='utf-8') as file:
    app_config = json.load(file)

In [6]:
# Twitter API settings
api_key = app_config['api_key']
api_secret = app_config['api_secret']
access_token = app_config['access_token']
access_secret = app_config['access_secret']

In [7]:
# Connect to Twitter API
auth = tweepy.OAuthHandler(api_key, api_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [8]:
json_path = 'data/tweet_json.txt'
error_log = 'data/tweet_error.log'

# Save tweets from Twitter API
if(not os.path.isfile(json_path)):
    print('Aguarde. Dado o volume de requisições, a coleta de tweets pode demorar alguns minutos.')
    tweet_count = 0
    for tweet_id in df_twitter_arc.tweet_id:
        print('{percent}%'.format(percent=int((++tweet_count/df_twitter_arc.tweet_id.size)*100)))
        try:
            status = api.get_status(tweet_id)
            with open(json_path, 'a', newline='\n') as file:
                file.write(f'{json.dumps(status._json)}\n')

        except Exception as err:
            with open(error_log, 'a', newline='\n') as log:
                log.write(f'{str(tweet_id)}: {str(err.args[0])}\n')
            print(f'{str(tweet_id)}: {str(err.args[0])}')
        
        clear_output(wait=True)
else:
    print('Dados já salvos em disco, não serão executadas novas requisições à API do Twitter.')
        

Dados já salvos em disco, não serão executadas novas requisições à API do Twitter.


In [9]:
# Convert JSON data into dict list
tweets = []

with open(json_path, 'r') as file:
    for line in file:
        try:
            tweet = json.loads(line)
            
            if(tweet.get('entities', False)):
                if(tweet['entities'].get('media', False) and tweet['entities']['media'][0].get('media_url', False)):
                    tweets.append({
                        'id': tweet['id'],
                        'created_at': tweet['created_at'],
                        'in_reply_to_status_id': tweet['in_reply_to_status_id'],
                        'in_reply_to_status_id': tweet['in_reply_to_status_id'],
                        'is_quote_status': tweet['is_quote_status'],
                        'retweet_count': tweet['retweet_count'],
                        'favorite_count': tweet['favorite_count'],
                        'media_url': tweet['entities']['media'][0]['media_url'],
                        'retweeted': tweet['retweeted'],
                        'favorited': tweet['favorited']
                    })
            
            
        except Exception as e:
            print(e) 
            
        

In [10]:
# Criar dataframe de Tweets consultados na API
columns = tweets[0].keys()
df_tweets_api = pd.DataFrame(tweets, columns = columns)

------
<a id="avaliacao"></a>

# Avaliação

<a id="avaliacao:twitter-archive"></a>

## Avaliação: Twitter data archive

In [11]:
 df_twitter_arc.sample(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
1021,746726898085036033,,,2016-06-25 15:29:00 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Abby. She's incredibly distracting. Just ...,,,,https://twitter.com/dog_rates/status/746726898...,12,10,Abby,,,,
1569,687807801670897665,,,2016-01-15 01:25:33 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Trooper &amp; Maya. Trooper protects Maya...,,,,https://twitter.com/dog_rates/status/687807801...,11,10,Trooper,,,,
1065,740711788199743490,,,2016-06-09 01:07:06 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we are witnessing the touchdown of a pupn...,,,,https://twitter.com/dog_rates/status/740711788...,9,10,,,,,
309,835536468978302976,,,2017-02-25 17:06:32 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: Meet Lola. Her hobbies include ...,8.352641e+17,4196984000.0,2017-02-24 23:04:14 +0000,https://www.gofundme.com/lolas-life-saving-sur...,12,10,Lola,,,,
807,771908950375665664,,,2016-09-03 03:13:29 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Doggo will persevere. 13/10\nhttps://t.co/yOVz...,,,,https://twitter.com/yahoonews/status/771905568...,13,10,,doggo,,,


In [12]:
df_twitter_arc.describe()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,retweeted_status_id,retweeted_status_user_id,rating_numerator,rating_denominator
count,2356.0,78.0,78.0,181.0,181.0,2356.0,2356.0
mean,7.427716e+17,7.455079e+17,2.014171e+16,7.7204e+17,1.241698e+16,13.126486,10.455433
std,6.856705e+16,7.582492e+16,1.252797e+17,6.236928e+16,9.599254e+16,45.876648,6.745237
min,6.660209e+17,6.658147e+17,11856340.0,6.661041e+17,783214.0,0.0,0.0
25%,6.783989e+17,6.757419e+17,308637400.0,7.186315e+17,4196984000.0,10.0,10.0
50%,7.196279e+17,7.038708e+17,4196984000.0,7.804657e+17,4196984000.0,11.0,10.0
75%,7.993373e+17,8.257804e+17,4196984000.0,8.203146e+17,4196984000.0,12.0,10.0
max,8.924206e+17,8.862664e+17,8.405479e+17,8.87474e+17,7.874618e+17,1776.0,170.0


In [13]:
df_twitter_arc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

<a id="twitter-arc:info"></a>
* Dados incompletos, contém apenas 2356 registros dos 5000 anunciados
* `in_reply_to_status_id`, `in_reply_to_user_id`, `retweeted_status_id` e `retweeted_status_user_id` em `float`
* `timestamp` e `retweeted_status_timestamp` em `string`

In [14]:
# Verificar se há IDs duplicados
df_twitter_arc.tweet_id.duplicated().value_counts()

False    2356
Name: tweet_id, dtype: int64

<a id="twitter-arc:source"></a>

In [15]:
# Visualizar exemplos de dados da coluna `source`
df_twitter_arc.source.sample(5)

2031    <a href="http://twitter.com/download/iphone" r...
1777    <a href="http://twitter.com/download/iphone" r...
2290    <a href="http://twitter.com/download/iphone" r...
1117    <a href="http://twitter.com/download/iphone" r...
97      <a href="http://twitter.com/download/iphone" r...
Name: source, dtype: object

Variável `source` sem valor para a unidade de observação

<a id="twitter-arc:name"></a>

In [16]:
# Visualizar exemplos da coluna `name`
df_twitter_arc.name.sample(10)

2351       None
517      Hunter
1083     Bayley
75       Shadow
324      Lipton
1026     Gustav
1973     Dexter
744        None
1945    Raymond
258        Hank
Name: name, dtype: object

In [17]:
df_twitter_arc[df_twitter_arc.name == 'None'].name.count()

745

* Nomes (`name`) preenchidos com valor literal igual a `'None'`, vazio

<a id="twitter-arc:category"></id>

In [18]:
# Visualizar exemplo de dados a partir da coluna `doggo`
df_twitter_arc.loc[:,'doggo':].sample(10)

Unnamed: 0,doggo,floofer,pupper,puppo
1479,,,,
1641,,,,
1117,doggo,,,
1219,,,,
2009,,,pupper,
587,doggo,,,
1697,,,pupper,
167,,,,
231,,,,
2307,,,,


Uma variável em várias colunas

In [19]:
# Verificar padrão no preenchimento das classificações
df_twitter_arc.loc[:,'doggo':].nunique()

doggo      2
floofer    2
pupper     2
puppo      2
dtype: int64

<a id="twitter-arc:expanded_urls"></a>

Registros de `expanded_urls` com URLs inválidas e não existentes

In [20]:
# URLs sem conteúdo
df_twitter_arc.expanded_urls.sample(10)

1265    http://goo.gl/ArWZfi,https://twitter.com/dog_r...
98      https://www.gofundme.com/help-my-baby-sierra-g...
1740    https://twitter.com/dog_rates/status/679503373...
2298                                                  NaN
562                         https://vine.co/v/5FwUWjYaW0Y
903     https://twitter.com/dog_rates/status/758405701...
1447    https://twitter.com/dog_rates/status/696488710...
1444    https://twitter.com/dog_rates/status/696713835...
165     https://twitter.com/dog_rates/status/761672994...
1655    https://twitter.com/dog_rates/status/683391852...
Name: expanded_urls, dtype: object

In [21]:
df_twitter_arc.expanded_urls.isnull().value_counts()

False    2297
True       59
Name: expanded_urls, dtype: int64

------
<a id="avaliacao:image-prediction"></a>

## Avaliação: Image prediction

In [22]:
df_prediction.describe()

Unnamed: 0,tweet_id,img_num,p1_conf,p2_conf,p3_conf
count,2075.0,2075.0,2075.0,2075.0,2075.0
mean,7.384514e+17,1.203855,0.594548,0.1345886,0.06032417
std,6.785203e+16,0.561875,0.271174,0.1006657,0.05090593
min,6.660209e+17,1.0,0.044333,1.0113e-08,1.74017e-10
25%,6.764835e+17,1.0,0.364412,0.05388625,0.0162224
50%,7.119988e+17,1.0,0.58823,0.118181,0.0494438
75%,7.932034e+17,1.0,0.843855,0.1955655,0.09180755
max,8.924206e+17,4.0,1.0,0.488014,0.273419


In [23]:
df_prediction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [24]:
df_prediction.sample(10)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
1995,874296783580663808,https://pbs.twimg.com/media/DCIgSR0XgAANEOY.jpg,1,cocker_spaniel,0.437216,True,miniature_poodle,0.277191,True,toy_poodle,0.157402,True
1593,798694562394996736,https://pbs.twimg.com/media/Cbs3DOAXIAAp3Bd.jpg,1,Chihuahua,0.615163,True,Pembroke,0.159509,True,basenji,0.084466,True
271,670823764196741120,https://pbs.twimg.com/media/CU8-puBWwAAR8Xl.jpg,1,Labrador_retriever,0.947453,True,German_short-haired_pointer,0.017001,True,Weimaraner,0.015432,True
1155,733109485275860992,https://pbs.twimg.com/media/CiyHLocU4AI2pJu.jpg,1,golden_retriever,0.945523,True,Labrador_retriever,0.042319,True,doormat,0.003956,False
1584,797236660651966464,https://pbs.twimg.com/media/CxBafisWQAAtJ1X.jpg,2,collie,0.767005,True,Border_collie,0.100844,True,kelpie,0.048368,True
1539,790946055508652032,https://pbs.twimg.com/media/CvoBPWRWgAA4het.jpg,1,dishwasher,0.700466,False,golden_retriever,0.245773,True,chow,0.039012,True
473,675146535592706048,https://pbs.twimg.com/media/CV6aMToXIAA7kH4.jpg,1,dingo,0.288447,False,Cardigan,0.229944,True,Pembroke,0.190407,True
46,666804364988780544,https://pbs.twimg.com/media/CUD3A7YWoAA82N0.jpg,1,English_setter,0.328792,True,Brittany_spaniel,0.283545,True,Ibizan_hound,0.057461,True
212,670037189829525505,https://pbs.twimg.com/media/CUxzQ-nWIAAgJUm.jpg,1,pot,0.273767,False,tray,0.092888,False,doormat,0.050728,False
226,670361874861563904,https://pbs.twimg.com/media/CU2akCQWsAIbaOV.jpg,1,platypus,0.974075,False,spotted_salamander,0.011068,False,bison,0.003897,False


<a id="image-prediction:p-values"></a>

Valores não padronizados para as variáveis `p1`, `p2` e `p3`

In [25]:
# Visualização de exemplos de valores nas colunas `p1`, `p2` e `p3`
df_prediction.loc[:, ['p1', 'p2', 'p3']].sample(5)

Unnamed: 0,p1,p2,p3
1183,chow,gibbon,Pembroke
818,soft-coated_wheaten_terrier,cocker_spaniel,golden_retriever
1679,Samoyed,Pomeranian,West_Highland_white_terrier
959,basenji,Italian_greyhound,miniature_pinscher
2066,Irish_terrier,Irish_setter,Chesapeake_Bay_retriever


<a id="image-prediction:duplicated-urls"></a>

Verificar a duplicidade de Tweet ID (`tweet_id`) e Imagens (`jpg_url`)

In [26]:
# Verificar a duplicidade de IDs
df_prediction.tweet_id.nunique()

2075

In [27]:
# Verificar a duplicidade de Imagens analizadas
df_prediction.jpg_url.nunique()

2009

In [28]:
df_prediction.jpg_url.duplicated().value_counts()

False    2009
True       66
Name: jpg_url, dtype: int64

Há 66 imagens duplicadas

------
<a id="avaliacao:twitter-api"></a>

## Avaliação: Twitter API Requests

In [29]:
df_tweets_api.describe()

Unnamed: 0,id,in_reply_to_status_id,retweet_count,favorite_count
count,1820.0,22.0,1820.0,1820.0
mean,7.23727e+17,6.992047e+17,2512.144505,6820.872527
std,5.777841e+16,4.409222e+16,4891.076169,11900.984651
min,6.660209e+17,6.671522e+17,11.0,0.0
25%,6.747671e+17,6.747625e+17,536.0,1363.5
50%,7.008223e+17,6.799651e+17,1118.0,3153.0
75%,7.617466e+17,7.032024e+17,2526.25,7463.25
max,8.924206e+17,8.558181e+17,82905.0,163034.0


<a id="tweets-api:info"></a>

In [30]:
df_tweets_api.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1820 entries, 0 to 1819
Data columns (total 9 columns):
id                       1820 non-null int64
created_at               1820 non-null object
in_reply_to_status_id    22 non-null float64
is_quote_status          1820 non-null bool
retweet_count            1820 non-null int64
favorite_count           1820 non-null int64
media_url                1820 non-null object
retweeted                1820 non-null bool
favorited                1820 non-null bool
dtypes: bool(3), float64(1), int64(3), object(2)
memory usage: 90.7+ KB


In [31]:
df_tweets_api.sample(5)

Unnamed: 0,id,created_at,in_reply_to_status_id,is_quote_status,retweet_count,favorite_count,media_url,retweeted,favorited
1450,672970152493887488,Sat Dec 05 02:46:02 +0000 2015,,False,365,956,http://pbs.twimg.com/media/CVbeyGUU8AEq300.jpg,False,False
463,760893934457552897,Wed Aug 03 17:43:45 +0000 2016,,False,1040,4017,http://pbs.twimg.com/media/Co88_ujWEAErCg7.jpg,False,False
1323,675781562965868544,Sat Dec 12 20:57:34 +0000 2015,,False,492,1661,http://pbs.twimg.com/media/CWDbv2yU4AARfeH.jpg,False,False
1054,689154315265683456,Mon Jan 18 18:36:07 +0000 2016,,False,1058,3190,http://pbs.twimg.com/media/CZBeMMVUwAEdVqI.jpg,False,False
608,743222593470234624,Wed Jun 15 23:24:09 +0000 2016,,False,2012,6446,http://pbs.twimg.com/media/ClB09z0WYAAA1jz.jpg,False,False


* `created_at` como `string`
* `in_reply_to_status_id` como `Float`

<a id="tweets-api:duplicated"></a>
Verificar a duplicidade de valores

In [32]:
# Verificar se há IDs duplicados
df_tweets_api.id.duplicated().value_counts()

False    1820
Name: id, dtype: int64

In [33]:
# Verificar se há imagens duplicadas
df_tweets_api.media_url.duplicated().value_counts()

False    1759
True       61
Name: media_url, dtype: int64

Há tweets referenciando a mesma imagem

------
<a id="assses-notes"></a>

## Notas de avaliação
### Qualidade

#### `df_twitter_arc` - Twitter archive
1. [Base incompleta, contém apenas **2356** registros dos **5000** anunciados](#twitter-arc:info)
* [Coluna `source` não acrescenta valor à unidade de obsevação](#twitter-arc:source)
* [Nomes (`name`) preenchidos com valor literal igual a `'None'`, vazio](#twitter-arc:name)
* [Coluna `expanded_urls` com linhas não preenchidas e sem valor para a unidade em observação](#twitter-arc:expanded_urls)


#### `df_prediction` - Image prediction
1. [Previsões [`p1`, `p2`, `p3`] com nomes não padronizados](#image-prediction:p-values)
* [Previsões repetidas sobre a mesma imagem](#image-prediction:duplicated-urls)


#### `df_tweets_api` - Remaining data from Tweets from Twitter API
1. [Data de criação (`created_at`) como `string`](#tweets-api:info)
* [Valor do id na coluna `in_reply_to_status_id` como `float`](#tweets-api:info)
* [Diferentes Tweets referenciando a mesma imagem, duplicidade](#tweets-api:duplicated)


### Organização

#### `df_twitter_arc` - Twitter archive
1. [As colunas `in_reply_to_status_id`, `in_reply_to_user_id`, `retweeted_status_id`, `retweeted_status_user_id` estão em `float64`](#twitter-arc:info)
* [As colunas `timestamp` e `retweet_status_timestamp` estão em `string`](#twitter-arc:info)
* [Uma variável em várias colunas, `doggo`, `floofer`, `pupper` e `puppo`](#twitter-arc:category)


#### `df_prediction` - Image prediction
-- *Nada a declarar* --


#### `df_tweets_api` - Remaining data from Tweets from Twitter API
1. [As colunas `in_reply_to_status_id`, `in_reply_to_user_id`, `retweeted_status_id`, `retweeted_status_user_id` estão em `float64`](#tweets-api:info)

------
<a id="limpeza"></a>

# Limpeza

<a id="assses-notes"></a>

## Limpeza: Definição 

#### `df_twitter_arc` - Twitter archive
1. [Base incompleta, de **5000**, apenas **2356** estão disponíveis](#twitter-arc:info)
    * OK - Nada a ser feito
* [Coluna `source` apresenta tags HTML](#twitter-arc:source)
* [Coluna `source` não acrescenta valor à unidade de observação](#twitter-arc:source)
    * As tags HTML poderiam ser removidas com a utilização de expressões regulares. O conteúdo apresenta um link para download do App Twitter, portanto não agrega valor à unidade de observação, a coluna pode ser removida
* [Nomes (`name`) não preenchidos com valor literal igual a `'None'`](#twitter-arc:name)
    * Aplicar valor '' para nomes iguais a 'None'
* [Coluna `expanded_urls` com linhas não preenchidas](#twitter-arc:expanded_urls)
* [Endereços inválidos em `expanded_urls`](#twitter-arc:expanded_urls)
    * Alguns endereços retornaram erro 404 e outros não preenchidos. Utilizar apenas tweets com referências válidas na API (`df_tweets_api`)
* [As colunas `in_reply_to_status_id`, `in_reply_to_user_id`, `retweeted_status_id`, `retweeted_status_user_id` estão em `float64`](#twitter-arc:info)
    * A coluna, `retweeted_status_id` ou `retweeted_status_user_id`, podem ser convertidas para `Boolean` como auxílio a identificação de tweets originais
    * Após seleção de tweets originais, as colunas de retweet, `retweeted_status_id`, `retweeted_status_user_id`, `retweeted_status_timestamp`, podem ser descartadas
    * As colunas `in_reply_to_status_id` e `in_reply_to_user_id` serão retratadas no dataframe `df_tweets_api`
* [As colunas `timestamp` e `retweet_status_timestamp` estão em `string` e deveriam ser do tipo `datetime`](#twitter-arc:info)
    * Converter valores para `datetime`
* [Uma variável em várias colunas, `doggo`, `floofer`, `pupper` e `puppo`](#twitter-arc:category)
    * Reduzir a variável para uma coluna, `category`


#### `df_prediction` - Image prediction
1. [Previsões [`p1`, `p2`, `p3`] com nomes não padronizados
    * Converter os valores para 'lower case' e substituir '_' por espaços
* [Previsões repetidas sobre a mesma imagem](#image-prediction:duplicated-urls)
    * Descartar previsões repetidas


#### `df_tweets_api` - Remaining data from Tweets from Twitter API
1. [Data de criação (`created_at`) como `string`](#tweets-api:info)
    * Converter datas em string para o formato `datetime`
* [Valor do id na coluna `in_reply_to_status_id` e `in_reply_to_user_id` como `float`](#tweets-api:info)
    * Converter para Inteiro as colunas `in_reply_to_status_id` e `in_reply_to_user_id` e atribuir 0 as que estiverem sem valor


## Limpeza: Twitter archive

In [34]:
# Criar cópia do dataframe para a limpeza
df_arch_clean = df_twitter_arc.copy()

2. [Coluna `source` apresenta tags HTML](#twitter-arc:source)
* [Coluna `source` não acrescenta valor à unidade de observação](#twitter-arc:source)
    * As tags HTML poderiam ser removidas com a utilização de expressões regulares. O conteúdo apresenta um link para download do App Twitter e sem valor à unidade de observação, a coluna pode ser removida

In [35]:
# Remoção da coluna `source`
df_arch_clean.drop('source', axis='columns', inplace=True)

#### Teste: Remoção coluna `source`

In [36]:
# Conferir que a coluna `source` foi removida
df_arch_clean.columns

Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
       'text', 'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_timestamp', 'expanded_urls', 'rating_numerator',
       'rating_denominator', 'name', 'doggo', 'floofer', 'pupper', 'puppo'],
      dtype='object')

4. [Nomes (`name`) não preenchidos com valor literal igual a `'None'`](#twitter-arc:name)
    * Aplicar valor '' para nomes iguais a 'None'

In [37]:
df_arch_clean[df_arch_clean.name == 'None'].name.count()

745

In [38]:
# Aplicar '' aos nomes iguais a 'None'
df_arch_clean.name = df_arch_clean.name.apply(lambda name: '' if name.lower() == 'none' else name)

#### Teste: Nomes não preenchidos

In [50]:
df_arch_clean[df_arch_clean.name == ''].name.count()

745

In [40]:
df_arch_clean.name.sample(10)

683        Dexter
1321             
86          Goose
1007    Bookstore
877        Wishes
340         Logan
810          Fizz
362              
302              
729       Chipson
Name: name, dtype: object

5. [Coluna `expanded_urls` com linhas não preenchidas](#twitter-arc:expanded_urls)
* [Endereços inválidos em `expanded_urls`](#twitter-arc:expanded_urls)
    * Alguns endereços retornaram erro 404 e outros não preenchidos. A informação reflete a URL do tweet original, sem valor para o objeto de avaliação. (`df_tweets_api`)

In [41]:
df_arch_clean.drop('expanded_urls', axis='columns', inplace=True)

#### Teste: Remoção da coluna `expanded_urls`

7. [As colunas `in_reply_to_status_id`, `in_reply_to_user_id`, `retweeted_status_id`, `retweeted_status_user_id` estão em `float64`](#twitter-arc:info)
    * A coluna, `retweeted_status_id` ou `retweeted_status_user_id`, podem ser convertidas para `Boolean` como auxílio a identificação de tweets originais
    * Após seleção de tweets originais, as colunas de retweet, `retweeted_status_id`, `retweeted_status_user_id`, `retweeted_status_timestamp`, podem ser descartadas
    * As colunas `in_reply_to_status_id` e `in_reply_to_user_id` serão retratadas no dataframe `df_tweets_api`

In [82]:
# Identificação dos tweets originais
df_arch_clean.loc[:,'retweeted_status_id'].isnull().sum()

2175

In [80]:
# Remoção de retweets
df_arch_clean.drop(df_arch_clean[df_arch_clean['retweeted_status_id'].notnull()].index, inplace=True)

In [85]:
df_arch_clean.drop(columns=['retweeted_status_id', 'retweeted_status_user_id', 'retweeted_status_timestamp'], inplace=True)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,text,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,This is Phineas. He's a mystical boy. Only eve...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,This is Tilly. She's just checking pup on you....,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,This is Archie. He is a rare Norwegian Pouncin...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,This is Darla. She commenced a snooze mid meal...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,This is Franklin. He would like you to stop ca...,12,10,Franklin,,,,
5,891087950875897856,,,2017-07-29 00:08:17 +0000,Here we have a majestic great white breaching ...,13,10,,,,,
6,890971913173991426,,,2017-07-28 16:27:12 +0000,Meet Jax. He enjoys ice cream so much he gets ...,13,10,Jax,,,,
7,890729181411237888,,,2017-07-28 00:22:40 +0000,When you watch your owner call another dog a g...,13,10,,,,,
8,890609185150312448,,,2017-07-27 16:25:51 +0000,This is Zoey. She doesn't want to be one of th...,13,10,Zoey,,,,
9,890240255349198849,,,2017-07-26 15:59:51 +0000,This is Cassie. She is a college pup. Studying...,14,10,Cassie,doggo,,,


## Limpeza: Image prediction 

# Armazenamento

# Relatórios

* Data wrangling efforts
* Analyses and visualizations