# Data extraction from JSON files imported from Twitter
***

In [1]:
# import required libraries
import json
import numpy as np
import pandas as pd

## Create TwitterJSON manager class
***

In [9]:
# class definition 
class TwitterJsonMgr():
    """Load a line-delimited JSON file of tweets and expose its contents.

    The file at ``filepath`` is read once, when the object is created.
    Every line is parsed as a separate JSON document; lines that are not
    valid JSON (blank lines included) are skipped.
    """

    def __init__(self, filepath):
        """Store ``filepath`` and immediately load the tweets found in it."""
        self._filepath = filepath
        self.__StoreTweetDataToList()

    # define methods
    def __StoreTweetDataToList(self):
        """Rebuild ``self._tweetdata`` from the file at ``self._filepath``."""
        self._tweetdata = []
        # The context manager closes the handle even when reading fails.
        with open(self._filepath, encoding="utf-8") as tweet_file:
            for line in tweet_file:
                try:
                    self._tweetdata.append(json.loads(line))
                except ValueError:
                    # json.JSONDecodeError is a ValueError subclass:
                    # skip malformed or blank lines, keep everything else.
                    continue

    def __ExtractText(self):
        """Return the ``'text'`` value of every loaded tweet that has one."""
        texts = []
        for tweet in self._tweetdata:
            try:
                texts.append(tweet['text'])
            except (KeyError, TypeError):
                # KeyError: a JSON object without a 'text' key.
                # TypeError: the line held a non-object JSON value
                # (list, number, string, ...).
                continue
        return texts

    # define properties
    def SetFilePath(self, filepath):
        """Replace the stored path.

        NOTE: this does not reload the tweets; the data returned by the
        getters still comes from the file read at construction time.
        """
        self._filepath = filepath

    def GetFilePath(self):
        """Return the currently stored file path."""
        return self._filepath

    def GetTweetData(self):
        """Return the list of parsed tweets (the internal list, not a copy)."""
        return self._tweetdata

    def GetTweetCount(self):
        """Return the number of lines that were parsed successfully."""
        return len(self._tweetdata)

    def GetTexts(self):
        """Return a list with the text of every tweet that has a 'text' key."""
        return self.__ExtractText()

In [3]:
# Folder holding the cleaned tweet dumps.
# Alternative folder, kept for reference:
# path = "C://Dropbox/research-and-dissertation/final-dissertation/code/tweetdata/"
path = "C://Dropbox/research-and-dissertation/final-dissertation/code/"
jsonfile_fakeclaim_twts = "{}clean_fake_twts.json".format(path)  # alternative file: "fake_claim_tweets.json"
jsonfile_realclaim_twts = "{}clean_real_twts.json".format(path)  # alternative file: "real_claim_tweets.json"

# Load each dump into its own manager object (the file is read on construction):
# first the fake-claim tweets, then the real-claim tweets.
jsonmgr_fakeclaim_twts = TwitterJsonMgr(jsonfile_fakeclaim_twts)
jsonmgr_realclaim_twts = TwitterJsonMgr(jsonfile_realclaim_twts)

## Check the data stats
***

In [4]:
# Report how many tweets were loaded for each class.
print("Total Fake Claims:", jsonmgr_fakeclaim_twts.GetTweetCount())
print("Total Real Claims:", jsonmgr_realclaim_twts.GetTweetCount())

Total Fake Claims: 6618
Total Real Claims: 157362


## Write functions to extract metadata from the collected tweets
***

In [5]:
def GetHashTags(tweet):
    """Return the tweet's hashtags as one ", "-separated string.

    Reads ``tweet['entities']['hashtags']`` and joins the ``'text'`` value
    of every entry, in order. Returns ``None`` when the tweet has no
    hashtags.
    """
    hashtags = tweet['entities']['hashtags']
    if not hashtags:
        return None
    return ", ".join(tag['text'] for tag in hashtags)
    
def GetUserMention(tweet):
    """Return the screen names mentioned in the tweet as one ", "-separated string.

    Reads ``tweet['entities']['user_mentions']`` and joins the
    ``'screen_name'`` value of every entry, in order. Returns ``None`` when
    the tweet mentions nobody.
    """
    mentions = tweet['entities']['user_mentions']
    if not mentions:
        return None
    return ", ".join(mention['screen_name'] for mention in mentions)
    
def GetURLs(tweet):
    """Return the tweet's expanded URLs as one ", "-separated string.

    Reads ``tweet['entities']['urls']`` and joins the ``'expanded_url'``
    value of every entry, in order. Entries whose ``'expanded_url'`` is
    missing or empty are skipped, so one incomplete entry no longer makes
    the string concatenation fail. Returns ``None`` when there is no
    usable URL.
    """
    expanded = [
        url['expanded_url']
        for url in tweet['entities']['urls']
        if url.get('expanded_url')
    ]
    if not expanded:
        return None
    return ", ".join(expanded)

def GetURLsCnt(tweet):
    """Return the number of URL entries in the tweet.

    Counts the entries of ``tweet['entities']['urls']``; a tweet without
    URLs yields 0.

    NOTE(review): the previous body was a verbatim copy of ``GetURLs`` and
    returned the joined URL string (or ``None``) rather than a count.
    Confirm no external caller relied on that string.
    """
    return len(tweet['entities']['urls'])
    
from datetime import datetime, timezone
def GetUsrAccAgeInDays(tweet, refDate=None):
    """Return the age, in whole days, of the account that posted ``tweet``.

    Args:
        tweet: parsed tweet; ``tweet['user']['created_at']`` must look like
            ``'Mon Jan 01 00:00:00 +0000 2018'`` (the offset is the literal
            ``+0000``, i.e. UTC).
        refDate: moment the age is measured at. A naive datetime is taken
            to be UTC; an aware one is converted to UTC. Defaults to the
            current UTC time, evaluated on every call.

    Returns:
        The ``days`` component of ``refDate - created_at``.

    Raises:
        ValueError: if ``created_at`` does not match the expected format.
    """
    created_at = datetime.strptime(tweet['user']['created_at'],
                                   '%a %b %d %H:%M:%S +0000 %Y')

    # A default of datetime.now() in the signature would be frozen at
    # definition time, so the current time is resolved here instead.
    if refDate is None:
        refDate = datetime.now(timezone.utc)

    # created_at is parsed as a naive UTC value; bring refDate to the same
    # representation so the subtraction is valid and timezone-consistent.
    if refDate.tzinfo is not None:
        refDate = refDate.astimezone(timezone.utc).replace(tzinfo=None)

    # return days from the time delta object
    return (refDate - created_at).days
    
def RetrieveTwtrMetaData(jsonMgrObj, label):
    """Build a DataFrame with one row of metadata per tweet.

    Args:
        jsonMgrObj: object whose ``GetTweetData()`` returns the parsed tweets.
        label: value written to the ``'label'`` column of every row.

    Returns:
        A DataFrame with tweet-level columns (id, text, retweet count,
        hashtag / mention / URL counts, joined hashtags and mentions),
        user-level columns (screen name, account age in days, status count,
        verified flag, location) and the label. It is empty, with no
        columns, when there are no tweets.
    """
    # Rows are collected in a plain list and turned into a DataFrame once:
    # DataFrame.append copied the whole frame on every call (quadratic in
    # the number of tweets) and no longer exists in pandas 2.x.
    rows = []

    for tweet in jsonMgrObj.GetTweetData():
        entities = tweet['entities']
        user = tweet['user']

        rows.append({
            'twt-id': tweet['id'],
            'twt-txt': tweet['text'],
            'retwt-cnt': tweet['retweet_count'],
            'twt-hashtags-cnt': len(entities['hashtags']),
            'twt-hashtags': GetHashTags(tweet),
            'usr-mention-cnt': len(entities['user_mentions']),
            'usr-mention': GetUserMention(tweet),
            'twt-url-cnt': len(entities['urls']),
            'usr-name': user['screen_name'],
            'usr-acc-age': GetUsrAccAgeInDays(tweet),
            'usr-twtcnt': user['statuses_count'],
            'usr-vrfd': user['verified'],
            'usr-loc': user['location'],
            'label': label,
        })

    return pd.DataFrame(rows)

In [7]:
def RetrieveTwtrMetaData_01(jsonMgrObj, label):
    """Build a DataFrame with one row of metadata per tweet, printing progress.

    Same columns as ``RetrieveTwtrMetaData``; additionally prints a running
    1-based counter after each tweet has been processed.

    Args:
        jsonMgrObj: object whose ``GetTweetData()`` returns the parsed tweets.
        label: value written to the ``'label'`` column of every row.

    Returns:
        A DataFrame with one row per tweet; empty, with no columns, when
        there are no tweets.
    """
    # Rows are collected in a plain list and turned into a DataFrame once:
    # DataFrame.append copied the whole frame on every call (quadratic in
    # the number of tweets) and no longer exists in pandas 2.x.
    rows = []

    for processed, tweet in enumerate(jsonMgrObj.GetTweetData(), start=1):
        entities = tweet['entities']
        user = tweet['user']

        row = {
            'twt-id': tweet['id'],
            'twt-txt': tweet['text'],
            'retwt-cnt': tweet['retweet_count'],
            'twt-hashtags-cnt': len(entities['hashtags']),
            'twt-hashtags': GetHashTags(tweet),
            'usr-mention-cnt': len(entities['user_mentions']),
            'usr-mention': GetUserMention(tweet),
            'twt-url-cnt': len(entities['urls']),
            'usr-name': user['screen_name'],
            'usr-acc-age': GetUsrAccAgeInDays(tweet),
            'usr-twtcnt': user['statuses_count'],
            'usr-vrfd': user['verified'],
            'usr-loc': user['location'],
            'label': label,
        }

        # progress indicator: number of tweets handled so far
        print(processed)
        rows.append(row)

    return pd.DataFrame(rows)

### Retrieve metadata for both fake and real claims and label them
***

In [8]:
# Fake dataframe: metadata of the fake-claim tweets, every row labelled "fake".
# Needs jsonmgr_fakeclaim_twts to exist in the current session; the recorded
# NameError shows this cell was run without it being defined.
fake_df = RetrieveTwtrMetaData_01(jsonmgr_fakeclaim_twts,"fake")

# Real dataframe: metadata of the real-claim tweets, every row labelled "real"
real_df = RetrieveTwtrMetaData_01(jsonmgr_realclaim_twts, "real")

NameError: name 'jsonmgr_fakeclaim_twts' is not defined

In [None]:
#  UNCOMMENT CODE FOR SAVING THE DATA TO OUTPUT FILE
# output_path = "C://GoogleDrive/dissertation/data/output/"
# real_df.to_csv(output_path+"real_tweets.csv", index=False)
# fake_df.to_csv(output_path+"fake_tweets.csv", index=False)

## Read the pre-processed files of extracted metadata
***

In [10]:
# Reload the per-class metadata tables from CSV files in the output folder
fake_df = pd.read_csv("C://GoogleDrive/dissertation/data/output/fake_tweets.csv")
real_df = pd.read_csv("C://GoogleDrive/dissertation/data/output/real_tweets.csv")

In [11]:
# Sanity check: (rows, columns) of the fake and the real table
fake_df.shape, real_df.shape

((6618, 14), (157362, 14))

In [12]:
# Stack both tables into a single frame: real rows first, fake rows after
df = pd.concat([real_df, fake_df])
# (rows, columns) of the combined frame
df.shape

(163980, 14)

In [14]:
# Folder that receives the consolidated output
output_path = "C://GoogleDrive/dissertation/data/output/"

# Write real and fake tweets together to one CSV (index column omitted)
# so later processing steps can read a single file
df.to_csv("{}all_tweets.csv".format(output_path), index=False)