# Documentation for data wrangling steps: gather, assess, and clean.

##  1. Gathering data:
- [Import libraries](#headin)
- [Upload and read the original file](#id2)
- [Import tweet image predictions (a TSV file stored on Udacity servers)](#id3)
- [Gather additional data via Twitter API](#id4)

### 1.1. Importing libraries<a name="headin"></a>

In [None]:
import pandas as pd
import numpy as np
import requests
import tweepy
import json

### 1.2. Uploading and reading the original file<a name="id2"></a>

In [None]:
# Load the original Twitter archive. The untouched upload is kept in
# `df_upl`; all further work happens on the copy `df`.
df_upl = pd.read_csv('twitter-archive-enhanced.csv')
df_upl.info()
df = df_upl.copy()

In [None]:
# Load the original Twitter archive into `df_upl` (kept as the pristine
# upload) and work on the copy `df` from here on.
df_upl = pd.read_csv('twitter-archive-enhanced.csv')
df_upl.info()
df = df_upl.copy()

In [None]:
# Collect every tweet id from the archive; this list is what the API
# download loop iterates over later.
tweet_ids = df['tweet_id'].tolist()
# Peek at the first five ids.
tweet_ids[:5]

### 1.3. Importing tweet image predictions <a name="id3"></a>

In [None]:
# Download the image-prediction TSV and save it next to the notebook.
url = "https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv"
req = requests.get(url)
# Stop here on an HTTP error instead of saving an error page as the TSV.
req.raise_for_status()
url_content = req.content
# The context manager closes the file even if the write fails.
with open('image_predictions.tsv', 'wb') as tsv_file:
    tsv_file.write(url_content)

In [None]:
# Parse the downloaded tab-separated predictions. `tsv_upload` stays as the
# pristine upload; `tsv` is the working copy used in later steps.
tsv_upload = pd.read_csv('image_predictions.tsv', delimiter='\t')
tsv_upload.head()
tsv = tsv_upload.copy()

### 1.4. Gathering additional data via Twitter API <a name="id4"></a>


In [None]:
# Build the tweepy client used to gather Twitter data.
# The four credential values below are redacted placeholders.
consumer_key = 'HIDDEN'
consumer_secret = 'HIDDEN'
access_token = 'HIDDEN'
token_secret = 'HIDDEN'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, token_secret)

# The JSON parser is requested so that responses can later be written out
# with json.dumps; the two rate-limit flags presumably make the client wait
# (and report it) when Twitter's rate limit is hit -- confirm against the
# installed tweepy version.
api = tweepy.API(
    auth,
    parser=tweepy.parsers.JSONParser(),
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
)

In [None]:
#testing credentials
# Only tweepy's own error type is treated as an authentication failure, so
# unrelated problems (typos, KeyboardInterrupt, ...) are no longer swallowed.
# NOTE(review): in tweepy 3.x verify_credentials() returns False on an
# HTTP 401 instead of raising, hence the explicit truthiness check --
# confirm for the installed version.
try:
    credentials_ok = api.verify_credentials()
except tweepy.TweepError:
    credentials_ok = False

if credentials_ok:
    print("Authentication OK")
else:
    print("Error during authentication")

#Authentication OK

In [None]:
import json

# Maps tweet id -> raised exception for every tweet that could not be fetched.
error_dict = {}

# Fetch each archived tweet through the API and store the responses as
# one JSON document per line in tweet_json.txt.
with open('tweet_json.txt', 'w') as f:
    # enumerate(..., start=1) provides the 1-based progress counter.
    for index, tw in enumerate(tweet_ids, start=1):
        print(str(index) + ':' + str(tw))
        try:
            # Only the API call is guarded; it is the step that raises TweepError.
            result = api.get_status(tw, tweet_mode = 'extended')
        except tweepy.TweepError as e:
            print('fail')
            error_dict[tw] = e
        else:
            print('Success')
            f.write(json.dumps(result) + '\n')
print(error_dict)

##  2. Assessing data:

2.1. Read the tweet_json.txt file:
   
     # Read the saved API output as a list of raw byte lines.
     with open('tweet_json.txt', 'rb') as file:
         data = list(file)
        
2.2. Load into the dataframe:

    # Parse one JSON document per line, then build the dataframe from the
    # resulting list of dicts.
    with open('tweet_json.txt', encoding="utf8") as f:
        data = [json.loads(line) for line in f]
    dftw = pd.DataFrame(data)
    
2.3. Display all columns in the dataframe:
 
    from IPython.display import display
    # None lifts the column limit so that every column is shown.
    pd.set_option('display.max_columns', None)
    display(dftw)

2.4. Get basic info about the data:

    # Column dtypes / non-null counts, then summary statistics.
    dftw.info()
    dftw.describe()
    # Look at the first entry of the user column and at the first ids.
    display(dftw['user'][0])
    dftw['id'].head()

2.5. Check for duplicates:

    # The "!" line is a notebook shell escape, not Python; it installs the
    # third-party hashable_df package.
    !pip install hashable_df
    from hashable_df import hashable_df
    # Count duplicated rows of df2 after passing it through hashable_df.
    # Presumably the wrapper makes list/dict cells hashable so duplicated()
    # can run -- TODO confirm against the package documentation.
    # NOTE(review): df2 is created by `df2=dftw.copy()` in the cleaning
    # section; that statement has to be run before this one.
    hashable_df(df2).duplicated().sum()
    
2.6. Check for null values

    # Total number of missing cells: per-column counts, then summed up.
    df2.isna().sum().sum()
    
2.7. Show a sample of NaN rows

    # Build the mask from df2 itself: a mask computed on `df` (the archive
    # dataframe) describes different rows and would select the wrong ones.
    df2.loc[df2.isnull().any(axis=1)].head()
    
2.8. Confirm that the # of tweets pulled via API matches the total from the original file:

    # Ids present both in the original archive and in the API download.
    api_tweets = df2['tweet_id'].tolist()
    matching_tweets = set(tweet_ids) & set(api_tweets)
    len(matching_tweets)

##  3. Cleaning data

### Tidiness -- Define:

1. Dog rating variable is split in two columns: combine the two to make a unified column
2. Irrelevant data is obstructing the view: drop unneeded columns and rows
3. Relevant data is stored in separate tables: combine into one

### Tidiness -- Code & Test:

#Make a copy of the dataset first:
        
    # Work on an independent (deep) copy so dftw keeps the data as loaded.
    df2 = dftw.copy(deep=True)
    
#Dog Rating numerator & denominator need to be combined into a single column

    # Join numerator and denominator as text into "numerator/denominator".
    whole['rating'] = (
        whole['rating_numerator'].astype(str)
        + "/"
        + whole['rating_denominator'].astype(str)
    )
    # The two source columns are no longer needed once `rating` exists.
    whole = whole.drop(columns=['rating_denominator', 'rating_numerator'])
    # Re-indexing on the current columns keeps the column order unchanged.
    whole = whole.reindex(columns=whole.columns)

#Drop some unneeded columns, example below:

    # Remove columns that are not needed for the analysis (in place).
    df2.drop(columns=['id_str',
                      'in_reply_to_status_id_str',
                      'in_reply_to_user_id_str',
                      'quoted_status_id_str',
                      'place'],
             inplace=True)
    
#Add prefix and merge the datasets:

    # Tag every column with its origin: api_ for the API data, img_ for the
    # image predictions.
    # NOTE(review): add_prefix renames ALL columns, including the key
    # columns. The merges below need 'tweet_id' (and 'source' for df2) to
    # exist un-prefixed in df2 / tsv, so those columns presumably have to be
    # renamed back before merging (compare the column-renaming step in the
    # Quality section) -- confirm the intended order of operations.
    df2 = df2.add_prefix('api_')
    tsv = tsv.add_prefix('img_')
    # Left joins keep every row of the archive dataframe `df`; rows without
    # a match get NaN in the added columns.
    df_new = df.merge(df2,on=['tweet_id','source'], how='left')
    whole = df_new.merge(tsv,on=['tweet_id'], how='left')

#Inspect columns in the new dataset:

    # Alphabetical list of column names, then one randomly chosen row.
    whole.columns.sort_values()
    whole.sample(n=1)
    
#Return only rows containing images, and then reduce those to rows that also have their additional API info

    # Keep only rows that have both an image URL and API data. dropna states
    # this directly, instead of relying on NaN != NaN inside two separate
    # `col == col` queries.
    whole = whole.dropna(subset=['img_jpg_url', 'api_favorited'])

### Quality -- Define:

- "Timestamp" column variables are stored as string and should be converted to date/time
- 'id' needs to be renamed as 'tweet_id'
- 'api_source' needs to be renamed as 'source'
- "api_entities" column contains a series of dictionaries (variables) that need to be split into separate columns
- Dog ratings are stored as string and need to be converted to float, so we can query them later
- Erroneous data need to be fixed, e.g. dog ratings should be over 1 (12/10, 11/10), those that are less than 1 (9/10, 8/10) are incorrect as they do not meet the standard of WeRateDogs
- "api_retweet_count" and "api_favorite_count" column variables are stored as floats and need to be converted to integers
- "api_user" has multiple variables stored in one column: split into multiple those that are needed and drop the rest

### Quality -- Code & Test:

#Timestamp is an object --> need to be date/time

    # Parse the timestamp strings into pandas datetime values.
    whole['timestamp'] = pd.to_datetime(whole.timestamp)
    
    
#Performing column renaming, example below:

    # Both renames are independent, so they are applied in a single call.
    df2.rename(
        columns={'id': 'tweet_id', 'api_source': 'source'},
        inplace=True,
    )


#Splitting the api_entities series of dictionaries into separate columns; pulling hashtag values into a column.

    # Replace the api_entities column by one column per key of its dicts
    # (assumes every cell holds a dict -- TODO confirm).
    whole = pd.concat([whole.drop(['api_entities'], axis=1),
    whole['api_entities'].apply(pd.Series)], axis=1)

    # Spread each row's hashtags list over the two new columns, reusing
    # whole's index so the rows stay aligned.
    # NOTE(review): this only works if the expanded lists yield exactly two
    # columns, i.e. no row has more than two hashtag entries -- confirm.
    whole[['hashtag','indices']] = pd.DataFrame(whole.hashtags.tolist(), 
    index= whole.index)
    # Only the first entry ('hashtag') is kept; the source list and the
    # second column are dropped.
    whole.drop(['hashtags','indices'],axis=1,inplace=True)

    # Expand the remaining hashtag value into one column per key
    # (presumably a dict per row -- verify against the API payload).
    whole = pd.concat([whole.drop(['hashtag'], axis=1), 
    whole['hashtag'].apply(pd.Series)], axis=1)

#Fixing dog ratings

    # How many ratings contain the "/10" denominator?
    whole[whole['rating'].str.contains("/10")].tweet_id.count()
    # Some of those are still inaccurate, e.g. 8/10, 2/10

    # Convert the string ratings to floats so they can be queried later.
    # Missing ratings are replaced by the sentinel 1000 first; pd.eval then
    # evaluates each "numerator/denominator" string as an expression.
    # NOTE(review): evaluating strings is only acceptable for trusted data;
    # the ratings here come from the archive file -- confirm, or compute the
    # quotient from the numeric columns instead.
    whole['float_rating'] = whole.rating.fillna(1000).apply(pd.eval)

    # Ratings below 1 (e.g. 9/10) are collected for inspection.
    incorrect_ratings = whole.query('float_rating < 1')

    # Keep only ratings strictly above 1.
    # NOTE(review): a rating of exactly 1 (10/10) is removed here although
    # it is not part of incorrect_ratings -- confirm this is intended.
    whole = whole[whole.float_rating > 1]
    
    
#Converting count variables into integers:

    # Cast both engagement counters to integers.
    whole['api_retweet_count'] = whole['api_retweet_count'].astype(int)
    whole['api_favorite_count'] = whole['api_favorite_count'].astype(int)


#Extracting followers' count from api_user into a separate column

    # NOTE(review): user_values was not defined in any documented step; it
    # is assumed to be the list of user dicts stored in api_user -- confirm.
    user_values = whole['api_user'].tolist()
    followers_count = [d.get('followers_count') for d in user_values]

    # Print every distinct follower count once.
    unique_list = list(set(followers_count))
    for x in unique_list:
        print(x)

    # Build the helper frame on whole's own index. `whole` has been filtered
    # before, so a default 0..n-1 index would attach values to the wrong
    # rows (and leave NaN) when the column is assigned by index alignment.
    whole['followers_count'] = pd.DataFrame(
        user_values, index=whole.index)['followers_count']
    

#Extracting statuses count and friends count values into separate columns as well:

    # Expand the user dicts once, on whole's own index. `whole` has been
    # filtered before, so a default 0..n-1 index would misalign the values
    # when the new columns are assigned.
    user_info = pd.DataFrame(whole['api_user'].tolist(), index=whole.index)
    whole['statuses_count'] = user_info['statuses_count']
    whole['friends_count'] = user_info['friends_count']


