In [1]:
import os
import tweepy as tw
import pandas as pd
import numpy as np
from pathlib import Path
import jsonlines
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import time
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import textblob
import requests
import re
import math
import textblob

In [5]:
# Twitter API credentials.
# Each value is taken from an environment variable when that variable is set, so
# real secrets never need to be typed into (or committed with) the notebook.
# When a variable is not set, the original 'XXX' placeholder is used.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'XXX')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'XXX')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'XXX')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'XXX')
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# Module-level API client; functions in this notebook that reference the global
# name `api` (e.g. newsLinksToJsonls, hydrateDictIntoDF) use this object.
api = tw.API(auth, wait_on_rate_limit=True)

## General utility functions

In [None]:
def sortDictByKey(dict, ascending = True):
    """Return a new dictionary with the same entries, ordered by key.

    Parameters:
        dict: mapping whose keys can be compared with each other.
        ascending: when True (default) keys run smallest to largest; when
            False the order is reversed.

    Returns:
        A new dict; the input mapping is not modified.
    """
    # The parameter name shadows the built-in `dict`, so a comprehension is
    # used here instead of the dict() constructor.
    ordered_keys = list(dict.keys())
    ordered_keys.sort(reverse=not ascending)
    return {key: dict[key] for key in ordered_keys}

## Hydration

In [None]:
def getTextFileNames(reverse=False):
    """List the names of the '.txt' files in the current working directory.

    Parameters:
        reverse: when True the names are returned in descending order.

    Returns:
        A sorted list of file names (names only, no directory part).
    """
    txt_names = (entry.name for entry in Path().iterdir() if entry.name.endswith('.txt'))
    return sorted(txt_names, reverse=reverse)

In [None]:
def getTwitterAPIObject(consumer_key, consumer_secret, access_token, access_token_secret):
    """Build a tweepy API client from the four Twitter account tokens.

    The client is created with wait_on_rate_limit=True.

    Returns:
        The tweepy `API` object.
    """
    handler = tw.OAuthHandler(consumer_key, consumer_secret)
    handler.set_access_token(access_token, access_token_secret)
    return tw.API(handler, wait_on_rate_limit=True)

In [None]:
def hydrateListOfIDs(api_keys, ids, output_name, traceRootTweet=False):
    """Hydrate a list of Tweet IDs and write each status as a line of a JSONL file.

    Parameters:
        api_keys: API object exposing `statuses_lookup` (see getTwitterAPIObject).
        ids: list of Tweet IDs.
        output_name: path of the JSONL file to create (overwritten if present).
        traceRootTweet: when True, a status that carries a 'retweeted_status'
            entry is replaced by a fresh lookup of that original Tweet before
            being written; if that lookup fails, the retweet itself is written.

    If any batch lookup fails, hydration stops and the partially written
    output file is deleted. Nothing is returned.
    """
    batch_size = 100  # IDs are looked up in groups of 100
    failed = False
    with jsonlines.open(output_name, mode='w') as writer:
        for start in range(0, len(ids), batch_size):
            ids_set = ids[start:start + batch_size]
            try:
                statuses = api_keys.statuses_lookup(ids_set, tweet_mode="extended")
            except Exception:
                print("Error occured")
                failed = True
                break
            # write to jsonl file
            for status in statuses:
                if traceRootTweet:
                    try:
                        OG_id = status._json['retweeted_status']['id']
                        status = api_keys.statuses_lookup([OG_id], tweet_mode="extended")[0]
                    except Exception:
                        # Not a retweet, or the original could not be fetched:
                        # deliberately fall back to the status we already have.
                        pass
                writer.write(status._json)
    # Delete only after the writer has been closed, so the removal does not
    # happen while the file is still open.
    if failed:
        Path(output_name).unlink()
        print("Deleted:",output_name)

In [None]:
def hydrateListOfFiles(api_keys,api_name, file_names):
    """Hydrate every Tweet-ID text file in `file_names` into a JSONL file.

    For each input file `<stem>.txt` the statuses are written to `<stem>.jsonl`
    (the last four characters of the input name are replaced by '.jsonl').
    Inputs whose output file already exists are skipped.

    A per-file record `[statuses written, IDs read, duration]` is kept in a
    dictionary that is loaded from, and saved back to,
    `<api_name> Hydration Success.jsonl`.

    Parameters:
        api_keys: API object exposing `statuses_lookup`.
        api_name: label used to name the hydration-success file.
        file_names: names of text files holding one Tweet ID per line.

    If a batch lookup fails, the partial output file is deleted, the record for
    that file is still stored, and no further files are processed.
    """
    stats_name = api_name + " Hydration Success.jsonl"
    # track hydration success; start empty so the name is always bound, even
    # when the stats file exists but contains no lines
    hydration_success = {}
    try:
        with jsonlines.open(stats_name) as stats_file:
            for line in stats_file:
                hydration_success = line
    except Exception:
        # missing or unreadable stats file: start from scratch
        hydration_success = {}
    # hydrate each file
    for file_name in file_names:
        output_name = file_name[:-4] + ".jsonl"
        if Path(output_name).is_file():
            continue
        time_start = time.perf_counter()
        print(file_name)
        # read in Tweet IDs (blank lines are ignored)
        with open(file_name, "r") as file:
            ids = [int(line) for line in file if line.strip()]
        # hydrate Tweets in groups of 100
        count = 0
        failed = False # triggers to True if error occurs
        with jsonlines.open(output_name, mode='w') as writer:
            for start in range(0, len(ids), 100):
                ids_set = ids[start:start + 100]
                try:
                    statuses = api_keys.statuses_lookup(ids_set, tweet_mode="extended")
                except Exception:
                    print("Timed out")
                    failed = True
                    break
                # write to jsonl file
                for status in statuses:
                    count += 1
                    writer.write(status._json)
        # remove the partial output only once the writer has been closed
        if failed:
            Path(output_name).unlink()
            print("Deleted:",output_name)
        time_end = time.perf_counter()
        duration = str(round((time_end-time_start)/60,2)) + " mins"
        hydration_success[file_name] = [count, len(ids), duration]
        hydration_success = sortDictByKey(hydration_success)
        if failed:
            break
    # single write covers both the normal and the early-exit path
    with jsonlines.open(stats_name, mode='w') as writer:
        writer.write(hydration_success)

In [None]:
def combineJsonlDicts(jsonl_names_list):
    """Merge the dictionaries stored in several JSONL files into one dictionary.

    Every line of every file is expected to be a JSON object; the objects are
    merged with `dict.update` in file order, so on duplicate keys the value
    read last wins. The merged dictionary is returned sorted by key.

    Parameters:
        jsonl_names_list: list of JSONL file names.

    Returns:
        The merged, key-sorted dictionary ({} when there is nothing to merge).
    """
    new_dict = {}
    for jsonl_name in jsonl_names_list:
        # `with` closes each reader; the original left every file open
        with jsonlines.open(jsonl_name) as reader:
            for line in reader:
                new_dict.update(line)
    return sortDictByKey(new_dict)

## Functions on individual tweets

In [2]:
def getOriginalText(status):
    """Return the text of a Tweet, preferring the least truncated source.

    Lookup order: the retweeted status' 'full_text', then the status' own
    'full_text', then its 'text'.

    Raises:
        KeyError: if none of the three fields is present.
    """
    try:
        return status['retweeted_status']['full_text']
    except (KeyError, TypeError):
        # not a retweet (key missing or value is None)
        pass
    try:
        return status['full_text']
    except (KeyError, TypeError):
        return status['text']
def isRetweet(status):
    """Return True when the status has a 'retweeted_status' entry, else False."""
    try:
        status['retweeted_status']
    except (KeyError, TypeError):
        return False
    return True
def getReplyID(status):
    """Return the ID of the status being replied to.

    When 'in_reply_to_status_id' is None, the placeholder 9999999999999999999
    is returned instead; other code in this file compares against the same
    literal to detect "not a reply".

    Raises:
        KeyError: if the status has no 'in_reply_to_status_id' field.
    """
    reply_id = status['in_reply_to_status_id']
    if reply_id is None:
        return 9999999999999999999
    return reply_id
def getReplyUser(status):
    """Return the ID of the user being replied to.

    When 'in_reply_to_user_id' is None, the placeholder 9999999999999999999
    is returned instead.

    Raises:
        KeyError: if the status has no 'in_reply_to_user_id' field.
    """
    reply_user = status['in_reply_to_user_id']
    if reply_user is None:
        return 9999999999999999999
    return reply_user
def getOriginalTweetID(status):
    """Return the ID of the Tweet being retweeted.

    When the status has no usable 'retweeted_status' entry, the placeholder
    9999999999999999999 is returned.
    """
    try:
        return status['retweeted_status']['id']
    except (KeyError, TypeError):
        return 9999999999999999999
def formatDateTime(dtime):
    """Convert a timestamp string to 'YYYY-MM-DD HH:MM:SS'.

    Parameters:
        dtime: string in the form '%a %b %d %H:%M:%S +0000 %Y',
            e.g. 'Wed Oct 10 20:19:24 +0000 2018'.

    Raises:
        ValueError: if `dtime` does not match that format.
    """
    parsed = datetime.strptime(dtime, '%a %b %d %H:%M:%S +0000 %Y')
    return parsed.strftime('%Y-%m-%d %H:%M:%S')
def getOGDateTime(status):
    """Return the formatted creation time of the Tweet being retweeted.

    The retweeted status' 'created_at' value is passed through formatDateTime.

    Returns:
        The formatted timestamp string, or None when the status has no usable
        'retweeted_status' entry or its timestamp cannot be parsed.
    """
    try:
        return formatDateTime(status['retweeted_status']['created_at'])
    except (KeyError, TypeError, ValueError):
        return None
def getSensitivity(status):
    """Return the status' 'possibly_sensitive' value, or None when it is absent."""
    try:
        # The original read an undefined name `s` here instead of the
        # `status` parameter, so the field was never actually returned.
        return status['possibly_sensitive']
    except (KeyError, TypeError):
        return None

In [3]:
def getTweetInfo(s):
    """Flatten one status dictionary into a 23-element list.

    The element order matches the column order used by listToDF:
    tweetID, date, text, isRT, #Retweets, #Favorites, user_name, user_date,
    #Followers, #Friends, verified, location, hashtags, mentions, URLs, media,
    replyID, replyUser, replyName, sens, lang, OGtweetID, OGdate.

    'media' is None when the status' entities have no 'media' entry.
    """
    # tweet-level fields
    row = [
        s['id'],
        formatDateTime(s['created_at']),
        getOriginalText(s),
        isRetweet(s),
        s['retweet_count'],
        s['favorite_count'],
    ]
    # author fields
    author = s['user']
    row += [
        author['screen_name'],
        author['created_at'],
        author['followers_count'],
        author['friends_count'],
        author['verified'],
        author['location'],
    ]
    # entity fields
    entities = s['entities']
    row += [
        entities['hashtags'],
        entities['user_mentions'],
        entities['urls'],
        entities.get('media'),
    ]
    # reply / retweet fields
    row += [
        getReplyID(s),
        getReplyUser(s),
        s['in_reply_to_screen_name'],
        getSensitivity(s),
        s['lang'],
        getOriginalTweetID(s),
        getOGDateTime(s),
    ]
    return row

## Parsing JSONLs into DFs, Pickles

In [1]:
def getFileNames(day, month, year=2021):
    """List the JSONL files in the current directory that belong to one day.

    A file is selected when its name ends with '.jsonl' and sorts at or after
    'coronavirus-tweet-id-<year>-<month>-<day>-00' and before the same prefix
    for `day + 1` (plain string comparison).

    Parameters:
        day: day of the month.
        month: month number.
        year: four-digit year; defaults to 2021, the value that was previously
            hard-coded.

    Returns:
        The matching file names, sorted ascending.
    """
    prefix = 'coronavirus-tweet-id-%04d-%02d-' % (year, month)
    start = prefix + '%02d-00' % day
    end = prefix + '%02d-00' % (day + 1)
    file_names = [
        path.name
        for path in Path().iterdir()
        if path.name.startswith('corona')
        and start <= path.name < end
        and path.name.endswith('.jsonl')
    ]
    file_names.sort()
    return file_names

In [5]:
def jsonlToList(file_names, suppress=True):
    """Read statuses from JSONL files and flatten each one with getTweetInfo.

    Parameters:
        file_names: JSONL files, each line holding one status object.
        suppress: when False, each file name and the total read time are printed.

    Returns:
        A list with one getTweetInfo row per status, in file order.
    """
    time_start = time.perf_counter()
    data = []
    for file_name in file_names:
        if not suppress:
            print(file_name)
        # `with` closes the reader; the original left every file open
        with jsonlines.open(file_name) as f:
            for status in f.iter():
                data.append(getTweetInfo(status))
    time_end = time.perf_counter()
    duration = str(round((time_end-time_start)/60,2)) + " mins"
    if not suppress:
        print("Read time:",duration)
    return data

In [7]:
def listToDF(data, suppress=True, truth_labels=False):
    """Turn a list of tweet rows into a DataFrame with named columns.

    Parameters:
        data: list of rows; 23 values per row, or 25 when `truth_labels` is
            True (the extra leading values are 'veracity' and 'index').
        suppress: when False, the creation time is printed.
        truth_labels: whether each row starts with the two label columns.

    Returns:
        The resulting pandas DataFrame.
    """
    started = time.perf_counter()
    column_names = ['tweetID', 'date', 'text', 'isRT', '#Retweets','#Favorites',
                    'user_name', 'user_date', '#Followers', '#Friends', 'verified',
                    'location', 'hashtags', 'mentions','URLs', 'media', 'replyID',
                    'replyUser', 'replyName','sens', 'lang', 'OGtweetID', 'OGdate']
    if truth_labels:
        column_names = ['veracity', 'index'] + column_names
    df = pd.DataFrame(data, columns = column_names)
    elapsed_minutes = round((time.perf_counter() - started) / 60, 2)
    if suppress == False:
        print("DF creation time:", str(elapsed_minutes) + " mins")
    return df

In [None]:
def exportDFToPickle(day, month, df, suppress=True):
    """Pickle a DataFrame to '<MM>_<DD>.pkl' in the current directory.

    Parameters:
        day, month: numbers used (zero-padded to two digits) in the file name.
        df: the DataFrame to save.
        suppress: when False, the export time is printed.
    """
    started = time.perf_counter()
    df.to_pickle('%02d_%02d.pkl' % (month, day))
    elapsed_minutes = round((time.perf_counter() - started) / 60, 2)
    if suppress == False:
        print("Pickle export time:", str(elapsed_minutes) + " mins")

In [None]:
def jsonlToPickle(startDay, endDay, month):
    """Convert each day's JSONL files straight into a per-day pickle.

    Days run from `startDay` up to, but not including, `endDay`. For every day
    the files from getFileNames are read, turned into a DataFrame and exported
    with exportDFToPickle; progress timings are printed.
    """
    for day in range(startDay,endDay):
        day_rows = jsonlToList(getFileNames(day, month), suppress=False)
        day_df = listToDF(day_rows, suppress=False)
        exportDFToPickle(day, month, day_df, False)
        # drop the references before the next day is loaded
        del day_rows, day_df

In [2]:
def importPickle(pkl_name, suppress=True):
    """Load a pickled DataFrame from `pkl_name`.

    Parameters:
        pkl_name: path of the pickle file.
        suppress: when False, the import time is printed.

    Returns:
        The object returned by `pd.read_pickle`.
    """
    started = time.perf_counter()
    df = pd.read_pickle(pkl_name)
    elapsed_minutes = round((time.perf_counter() - started) / 60, 2)
    if suppress == False:
        print("Pickle import time:", str(elapsed_minutes) + " mins")
    return df

## Working with Fact Checking Websites

In [None]:
def newsLinksToJsonls(file_name, sheetname):
    """Search Twitter for each link in a spreadsheet and save the hits as JSONL.

    The sheet `sheetname` of the Excel file `file_name` is read and every value
    of its 'Link' column is used as a query for the module-level `api`
    client's `search_30_day` call. Results for row i are written to
    '<file_name minus its last 5 characters>_<sheetname>_<ii>.jsonl'.
    """
    links_df = pd.read_excel(file_name, sheetname)
    stem = file_name[:-5]  # strips a five-character suffix such as '.xlsx'
    for i in range(len(links_df)):
        statuses = api.search_30_day(query=links_df['Link'][i], label='30dayenv')
        output_name = stem + "_" + sheetname + "_" + "%02d" % i + '.jsonl'
        with jsonlines.open(output_name, mode='w') as writer:
            for status in statuses:
                writer.write(status._json)

In [None]:
def getRumorRoots(df):
    """Collect the IDs of the Tweets that rows of `df` reply to.

    Rows whose 'replyID' equals the placeholder 9999999999999999999 (no reply
    target) are skipped.

    Returns:
        A sorted list of reply-target IDs as ints (duplicates are kept).
    """
    reply_ids = df['replyID']
    roots = [
        int(reply_ids[i])
        for i in range(len(df))
        if reply_ids[i] != 9999999999999999999
    ]
    return sorted(roots)

In [None]:
def getRootIndex(rootsDict, rootID):
    """Return the first key of `rootsDict` whose value contains `rootID`.

    Returns:
        The matching key, or None when no value contains `rootID`.
    """
    matches = (index for index, ids in rootsDict.items() if rootID in ids)
    return next(matches, None)

In [None]:
def getRumorRootsAndReplies(df):
    """Group reply Tweets by the Tweet they reply to.

    Rows whose 'replyID' equals the placeholder 9999999999999999999 are
    ignored.

    Returns:
        A dict mapping each reply-target ID to the list of 'tweetID' values
        that reply to it, in row order.
    """
    rootsDict = {}
    for i in range(len(df)):
        rootID = df['replyID'][i]
        if rootID != 9999999999999999999:
            rootsDict.setdefault(rootID, []).append(df['tweetID'][i])
    return rootsDict

In [None]:
def dfToDict(df):
    """Group the 'tweetID' values of `df` by the string form of 'index'.

    Returns:
        A dict mapping str(index) to the ascending list of int Tweet IDs found
        under that index; keys appear in order of first occurrence.
    """
    grouped = {}
    for i in range(len(df)):
        grouped.setdefault(str(df['index'][i]), []).append(int(df['tweetID'][i]))
    # sort each group once at the end
    return {index: sorted(tweet_ids) for index, tweet_ids in grouped.items()}

In [None]:
def removeTweetsFromDict(removable, rootsDict):
    """Remove each Tweet ID in `removable` from the list that contains it.

    The containing list is located with getRootIndex. `rootsDict` is modified
    in place and also returned.
    """
    for tweet_id in removable:
        owner = getRootIndex(rootsDict, tweet_id)
        rootsDict[owner].remove(tweet_id)
    return rootsDict

In [None]:
# hydrates dictionary into a dataframe
def hydrateDictIntoDF(rootsDict, truth_label, simple = False):
    """Look up the Tweets listed in `rootsDict` and return them as a DataFrame.

    Parameters:
        rootsDict: dict whose keys are the strings '0'..'19' (other keys are
            never visited) and whose values are sequences of Tweet IDs.
        truth_label: value stored in the first ('veracity') column of every row.
        simple: when True, only the looked-up Tweet (or the Tweet it retweets)
            is recorded; when False, the top of its reply chain is recorded too.

    Returns:
        The DataFrame built by listToDF(..., truth_labels=True); each row is
        [truth_label, i] followed by the getTweetInfo fields.

    Lookups go through the module-level `api` client. Any ID whose lookup
    raises is skipped silently, and a status ID is recorded at most once.
    """
    data = []
    # IDs of statuses already recorded, used to avoid duplicate rows
    completed_ids = []
    # only the keys '0' through '19' are considered
    for i in range(20):
        try:
            arr = rootsDict[str(i)]
        except: 
            continue
        for j in range(len(arr)):
            try:
                status = api.statuses_lookup([arr[j]], tweet_mode="extended")[0]._json
                # if status is a retweet, replace it with the original Tweet;
                # on any failure the status fetched above is kept
                try:
                    OG_id = status['retweeted_status']['id']
                    status = api.statuses_lookup([OG_id], tweet_mode="extended")[0]._json
                except:
                    pass
                # if status has already been looked at
                if (status['id'] in completed_ids):
                    continue
                else:
                    completed_ids.append(status['id'])
            except:
                # the first lookup failed (or returned nothing): skip this ID
                continue
            tweetInfo = [truth_label, i]
            tweetInfo = tweetInfo + getTweetInfo(status)
            data.append(tweetInfo)
            # skip looking at reply roots if in simple mode
            if simple:
                continue
            # if status is a reply: keep replacing `status` with the Tweet it
            # replies to. The loop has no exit condition of its own; it ends
            # when a lookup raises, leaving `status` as the last Tweet that was
            # fetched successfully.
            # NOTE(review): for a non-reply, 'in_reply_to_status_id' is
            # presumably None and the lookup is expected to fail - confirm.
            try:
                while (True):
                    root_id = status['in_reply_to_status_id']
                    status = api.statuses_lookup([root_id], tweet_mode="extended")[0]._json
            except:
                pass
            # if status has already been looked at (this includes the case
            # where the chain walk never moved past the status recorded above)
            if (status['id'] in completed_ids):
                continue
            else:
                completed_ids.append(status['id'])
            tweetInfo = [truth_label, i]
            tweetInfo = tweetInfo + getTweetInfo(status)
            data.append(tweetInfo)
    df = listToDF(data, truth_labels=True)
    return df

## Get Viral Original Tweets

In [None]:
def getViralOriginalTweetsFromDF(RTthreshold, df):
    """Select the non-retweet rows with more than `RTthreshold` retweets.

    Parameters:
        RTthreshold: rows must have '#Retweets' strictly greater than this.
        df: DataFrame with '#Retweets' and 'isRT' columns.

    Returns:
        The filtered DataFrame (original index labels are kept).
    """
    above_threshold = df['#Retweets'] > RTthreshold
    is_original = df['isRT'] == False
    return df[above_threshold & is_original]

In [None]:
def getViralOriginalTweetsFromFileList(RTthreshold, file_names):
    """Collect the viral original Tweets from several pickled DataFrames.

    Each pickle in `file_names` is loaded, filtered with
    getViralOriginalTweetsFromDF, and the results are concatenated. The
    combined frame is also saved as 'viral_OG_tweets_<RTthreshold>.pkl'.

    Parameters:
        RTthreshold: retweet threshold passed to the filter.
        file_names: pickle files, each holding one DataFrame.

    Returns:
        The combined DataFrame (an empty DataFrame when `file_names` is empty).
    """
    # NOTE: pd.read_pickle can execute arbitrary code; only load trusted files.
    frames = [
        getViralOriginalTweetsFromDF(RTthreshold, pd.read_pickle(file_name))
        for file_name in file_names
    ]
    # DataFrame.append was removed in pandas 2.0; a single concat replaces the
    # repeated appends and keeps the original index labels, as append did.
    df = pd.concat(frames) if frames else pd.DataFrame()
    pklname = 'viral_OG_tweets_' + str(RTthreshold) + '.pkl'
    df.to_pickle(pklname)
    return df

## Twitter Counts Endpoint

In [None]:
def prepareCountsTextQuery(df):
    """Build one counts-endpoint query string per row of `df`.

    Each row's 'text' is stripped of the pieces listed below, then prefixed
    with 'retweets_of:<user_name> ' so the query only matches retweets of the
    original author.

    Parameters:
        df: DataFrame with 'text' and 'user_name' columns, indexed 0..n-1.

    Returns:
        A list of query strings, one per row.
    """
    # Ordered (pattern, replacement) pairs; every pattern is a raw string so
    # the regex escapes are not interpreted as string escapes.
    filters = [
        # remove links
        (re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'), ''),
        # replace new lines with a space
        (re.compile(r'\n'), ' '),
        # remove words containing an apostrophe
        (re.compile(r"[\S]*'[\S]*"), ''),
        # remove words containing a quotation mark. The original alternation
        # also listed a bare `$`, which in a regex is the end-of-string anchor
        # and therefore deleted the last word of every query; words containing
        # a literal dollar sign are handled by the next pattern.
        (re.compile(r'[\S]*[“"”’][\S]*'), ''),
        # remove words containing a dollar sign
        (re.compile(r'[\S]*[$][\S]*'), ''),
        # remove a colon and everything up to the next whitespace
        (re.compile(r'[:][\S]*'), ''),
        # remove the HTML-escaped & symbol
        (re.compile(r'&amp;'), ''),
        # remove parentheses (and '|' characters, as the original class did)
        (re.compile(r'[(|)]'), ''),
    ]
    text_query = []
    for i in range(len(df)):
        query = df['text'][i]
        for pattern, replacement in filters:
            query = pattern.sub(replacement, query)
        # constrain the query to only search for retweets of the original author of the Tweet
        text_query.append('retweets_of:' + df['user_name'][i] + ' ' + query)
    return text_query

In [None]:
def prepareCountsInputs(df, mod_text, index, today_dt):
    """Assemble the query and date window for one Tweet's counts request.

    Details of the Tweet are printed along the way.

    Parameters:
        df: DataFrame with '#Retweets', 'tweetID', 'text' and 'date' columns;
            'date' holds 'YYYY-MM-DD HH:MM:SS' strings.
        mod_text: sequence of prepared query strings, indexed like `df`.
        index: row label of the Tweet.
        today_dt: datetime used as the upper bound of the window.

    Returns:
        (query, start_date, end_date), with both dates formatted as
        'YYYYMMDDHH00'. The window starts at the Tweet's creation hour and ends
        31 days later or at `today_dt`, whichever comes first. When the Tweet
        has no retweets, (None, None, None) is returned.
    """
    retweet_count = df['#Retweets'][index]
    if retweet_count == 0:
        print("NO RETWEETS: SKIP")
        return None, None, None
    print('tweetID')
    print(df['tweetID'][index])
    print()
    print("Original text:")
    print(df['text'][index])
    print()
    print("Modified text:")
    query = mod_text[index]
    print(query)
    print()
    print("Retweets:", retweet_count)
    status_date = df['date'][index]
    window_start = datetime.strptime(status_date, "%Y-%m-%d %H:%M:%S")
    window_end = min(window_start + timedelta(days = 31), today_dt)
    print("Created at", status_date)
    hour_format = "%Y%m%d%H"
    # minutes are always reported as '00'
    return query, window_start.strftime(hour_format) + '00', window_end.strftime(hour_format) + '00'

In [None]:
def getCounts(query, start_date, end_date, bearer_token):
    """POST a counts request to the full-archive counts endpoint.

    Parameters:
        query: search query string.
        start_date, end_date: window bounds, sent as 'fromDate' and 'toDate'.
        bearer_token: value sent verbatim in the 'authorization' header.

    Returns:
        The `requests` response object; results are bucketed by hour.
    """
    import json  # local import: json is not among the notebook's top-level imports

    url = 'https://api.twitter.com/1.1/tweets/search/fullarchive/fullenv/counts.json'
    headers = {'authorization': bearer_token, 'content-type': 'application/json'}
    # json.dumps escapes quotes, backslashes and non-ASCII characters, which
    # the previous hand-concatenated string did not.
    payload = json.dumps({
        "query": query,
        "fromDate": start_date,
        "toDate": end_date,
        "bucket": "hour",
    })
    return requests.post(url, data=payload, headers=headers)

In [None]:
def exportCountsToJsonl(r, output_name, index):
    """Write a response's JSON body to '<output_name><index as 2 digits>.jsonl'.

    Parameters:
        r: response object exposing `.json()`.
        output_name: file name prefix.
        index: number appended (zero-padded to two digits) to the prefix.
    """
    target = "".join([output_name, "%02d" % index, '.jsonl'])
    with jsonlines.open(target, mode='w') as writer:
        writer.write(r.json())

In [None]:
def retrieveFileNames(starts = '', ends = '', lengthMin=0, lengthMax=math.inf):
    """List the entries of the current directory that match simple name filters.

    Parameters:
        starts: required name prefix ('' matches everything).
        ends: required name suffix ('' matches everything).
        lengthMin, lengthMax: inclusive bounds on the length of the name.

    Returns:
        The matching names, sorted ascending.
    """
    file_names = [
        path.name
        for path in Path().iterdir()
        if path.name.startswith(starts)
        and path.name.endswith(ends)
        and lengthMin <= len(path.name) <= lengthMax
    ]
    # sort once, after collection (the original re-sorted on every entry)
    file_names.sort()
    return file_names

In [None]:
def processCounts(file_name):
    """Read the per-bucket counts stored in a counts-output JSONL file.

    Every line is expected to hold a 'results' list whose items each have a
    'count'. After each line the list is padded with zeros to at least 744
    entries.

    Returns:
        The list of counts ([] when the file has no lines).
    """
    # 744 = 31 * 24; presumably one hourly bucket for each hour of the 31-day
    # window used when the counts were requested - TODO confirm.
    min_length = 744
    counts = []
    # `with` closes the reader; the original left the file open
    with jsonlines.open(file_name) as f:
        for s in f.iter():
            counts.extend(result['count'] for result in s['results'])
            if len(counts) < min_length:
                counts.extend([0] * (min_length - len(counts)))
    return counts

In [None]:
def processCountsArray(file_names):
    """Run processCounts on every file and return the list of count lists."""
    return [processCounts(file_name) for file_name in file_names]

## Aggregate Analysis

In [None]:
def getUserMentionsStats(df):
    """Count user mentions across the rows of `df`.

    Parameters:
        df: DataFrame with a 'mentions' column of sized values (e.g. lists),
            indexed 0..n-1.

    Returns:
        [number of rows with at least one mention, total number of mentions].
    """
    num_tagged_statuses = 0
    num_tags = 0
    for i in range(len(df)):
        mention_count = len(df.loc[[i]]['mentions'].iloc[0])
        if mention_count > 0:
            num_tagged_statuses += 1
            num_tags += mention_count
    return [num_tagged_statuses, num_tags]

In [None]:
def getNumStatusesWithMedia(df):
    """Count the statuses in `df` that carry media.

    Two column layouts are supported:
      * a 'media' column: a row counts when its value has a length (values
        such as None do not); picture/video counts are reported as 'N/A'.
      * 'pic_url' and 'video_url' columns: a row has a picture when its
        'pic_url' value is non-empty and a video when its 'video_url' value is
        a string.

    Returns:
        (num_statuses_w_media, num_statuses_w_pic, num_statuses_w_video), or
        the bare string 'N/A' when neither layout is present.
    """
    if 'media' in df:
        num_statuses_w_media = 0
        for media in df['media']:
            try:
                len(media)
            except TypeError:
                # value without a length (e.g. None): no media on this row
                continue
            num_statuses_w_media += 1
        return num_statuses_w_media, 'N/A', 'N/A'
    # The original condition `'pic_url' and 'video_url' in df` only tested for
    # 'video_url'; both columns are required below.
    if 'pic_url' in df and 'video_url' in df:
        num_statuses_w_media = 0
        num_statuses_w_pic = 0
        num_statuses_w_video = 0
        for pic_url, video_url in zip(df['pic_url'], df['video_url']):
            has_pic = len(pic_url) > 0
            has_video = isinstance(video_url, str)
            if has_pic or has_video:
                num_statuses_w_media += 1
            if has_pic:
                num_statuses_w_pic += 1
            if has_video:
                num_statuses_w_video += 1
        return num_statuses_w_media, num_statuses_w_pic, num_statuses_w_video
    return 'N/A'

In [None]:
def getHashtagsStats(df):
    """Count hashtags across the rows of `df`.

    Parameters:
        df: DataFrame with a 'hashtags' column of sized values (e.g. lists),
            indexed 0..n-1.

    Returns:
        [number of rows with at least one hashtag, total number of hashtags].
    """
    num_statuses_w_hashtags = 0
    num_hashtags = 0
    for i in range(len(df)):
        hashtag_count = len(df.loc[[i]]['hashtags'].iloc[0])
        if hashtag_count > 0:
            num_statuses_w_hashtags += 1
            num_hashtags += hashtag_count
    return [num_statuses_w_hashtags, num_hashtags]

In [None]:
def calcPropRetweet(df):
    """Return the fraction of rows in `df` whose 'isRT' value is True.

    Raises:
        ZeroDivisionError: if `df` has no rows.
    """
    retweet_rows = df[df['isRT'] == True]
    return len(retweet_rows) / len(df)

In [None]:
def getArrayDivisionPercentile(df_col1, df_col2, percentile):
    """Return a percentile of the element-wise ratio df_col1 / df_col2.

    NaN ratios are counted as 0 and infinite ratios are dropped before the
    percentile is taken.

    Parameters:
        df_col1, df_col2: columns supporting element-wise division and
            `.tolist()`.
        percentile: percentile in the 0-100 scale used by np.percentile.
    """
    ratios = (df_col1/df_col2).tolist()
    cleaned = [0 if math.isnan(ratio) else ratio for ratio in ratios if not math.isinf(ratio)]
    return np.percentile(cleaned,percentile)

In [None]:
def getRetweetsFavsFollowersInfo(df, dataset_name):
    """Summarise the engagement metrics of the Tweets in `df` as a one-row DataFrame.

    Parameters:
        df: DataFrame with '#Retweets' and '#Favorites' columns. The follower
            metrics additionally need '#Followers' and 'verified' (and 'isRT',
            which calcPropRetweet reads); 'avg_comments' needs '#Comments'.
            Metrics whose columns are missing are reported as 'N/A'.
        dataset_name: label stored in the 'Name' column.

    Returns:
        A one-row DataFrame with columns Name, n, prop_RT, avg_RT, avg_favs,
        avg_comments, med_fols, avg_fols, RTsToFols, RTsToFols_75, RTsToFavs,
        RTsToFavs_75, prop_verified.
    """
    n = len(df)
    avg_RT = np.mean(df['#Retweets'])
    avg_favs = np.mean(df['#Favorites'])
    RTsToFavs = avg_RT/avg_favs
    RTsToFavs_75 = getArrayDivisionPercentile(df['#Retweets'],df['#Favorites'], 75)
    # The original condition `'#Followers' and 'verified' in df` only tested
    # for 'verified'; both columns are used below.
    if '#Followers' in df and 'verified' in df:
        prop_RT = calcPropRetweet(df)
        avg_fols = np.mean(df['#Followers'])
        med_fols = np.median(df['#Followers'])
        RTsToFols = avg_RT/avg_fols
        RTsToFols_75 = getArrayDivisionPercentile(df['#Retweets'],df['#Followers'], 75)
        prop_verified = sum(df['verified'])/n
    else:
        prop_RT = 'N/A'
        avg_fols = 'N/A'
        med_fols = 'N/A'
        RTsToFols = 'N/A'
        RTsToFols_75 = 'N/A'
        prop_verified = 'N/A'
    avg_comments = np.mean(df['#Comments']) if '#Comments' in df else 'N/A'
    data = [[dataset_name,n,prop_RT,avg_RT,avg_favs,avg_comments,med_fols,avg_fols,RTsToFols,RTsToFols_75,RTsToFavs,RTsToFavs_75,prop_verified]]
    result = pd.DataFrame(data, columns = ['Name', 'n','prop_RT','avg_RT','avg_favs','avg_comments','med_fols','avg_fols','RTsToFols','RTsToFols_75','RTsToFavs','RTsToFavs_75','prop_verified'])
    return result

In [None]:
def getTagsMediaHashtagsInfo(df, dataset_name):
    """Summarise mentions, media and hashtag usage in `df` as a one-row DataFrame.

    Parameters:
        df: DataFrame of statuses. Mention/hashtag metrics need both the
            'mentions' and 'hashtags' columns; picture/video proportions are
            reported only when those columns are absent and 'pic_url' and
            'video_url' are present. Metrics that cannot be computed are
            reported as 'N/A'.
        dataset_name: label stored in the 'Name' column.

    Returns:
        A one-row DataFrame with columns Name, n, prop_tagged, avg_num_tags,
        prop_media, prop_pic, prop_video, prop_hashtagged, avg_num_hashtags.
        The two averages are per status that has at least one tag/hashtag and
        are 0 when there is no such status.
    """
    n = len(df)
    media_stats = getNumStatusesWithMedia(df)
    if isinstance(media_stats, str):
        # the helper returns the bare string 'N/A' when no media columns exist;
        # unpacking that string would silently yield 'N', '/', 'A'
        num_statuses_w_pic = num_statuses_w_video = 'N/A'
        prop_media = 'N/A'
    else:
        num_statuses_w_media, num_statuses_w_pic, num_statuses_w_video = media_stats
        prop_media = num_statuses_w_media / n
    # defaults, so every name is bound whichever branch runs
    prop_tagged = avg_num_tags = 'N/A'
    prop_hashtagged = avg_num_hashtags = 'N/A'
    prop_pic = prop_video = 'N/A'
    # The original conditions of the form `'a' and 'b' in df` only tested for
    # the second column; both are required here.
    if 'mentions' in df and 'hashtags' in df:
        # each helper scans the whole frame, so call it once
        num_tagged_statuses, num_tags = getUserMentionsStats(df)
        num_hashtagged_statuses, num_hashtags = getHashtagsStats(df)
        prop_tagged = num_tagged_statuses / n
        avg_num_tags = num_tags / num_tagged_statuses if num_tagged_statuses else 0
        prop_hashtagged = num_hashtagged_statuses / n
        avg_num_hashtags = num_hashtags / num_hashtagged_statuses if num_hashtagged_statuses else 0
    elif 'pic_url' in df and 'video_url' in df:
        # the counts are 'N/A' strings when the media helper used the 'media' column
        if not isinstance(num_statuses_w_pic, str):
            prop_pic = num_statuses_w_pic / n
        if not isinstance(num_statuses_w_video, str):
            prop_video = num_statuses_w_video / n
    data = [[dataset_name, n, prop_tagged, avg_num_tags, prop_media, prop_pic, prop_video, prop_hashtagged, avg_num_hashtags]]
    result = pd.DataFrame(data, columns = ['Name','n', 'prop_tagged', 'avg_num_tags', 'prop_media', 'prop_pic', 'prop_video', 'prop_hashtagged', 'avg_num_hashtags'])
    return result

## Natural Language Processing

In [None]:
def sortDictByValue(dict):
    """Return a new dictionary ordered by value, largest first.

    Entries with equal values keep their original relative order.

    Parameters:
        dict: mapping whose values can be compared with each other.

    Returns:
        A new dict; the input mapping is not modified.
    """
    # One stable sort replaces the original nested scan over all keys for
    # every value. The parameter name shadows the built-in `dict`, so a
    # comprehension is used instead of the dict() constructor.
    ranked_items = sorted(dict.items(), key=lambda item: item[1], reverse=True)
    return {key: value for key, value in ranked_items}

In [None]:
def countPOSProportions(text_arr, numPOS):
    """Return the most frequent part-of-speech tags and their proportions.

    Each text is tokenized and tagged with nltk; tag occurrences are counted
    over all texts.

    Parameters:
        text_arr: iterable of text strings.
        numPOS: maximum number of tags to return.

    Returns:
        A list of up to `numPOS` (tag, proportion) pairs, most frequent first,
        with proportions rounded to three decimals.
    """
    tag_counts = {}
    for text in text_arr:
        for _word, tag in nltk.pos_tag(nltk.word_tokenize(text)):
            tag_counts[tag] = tag_counts.get(tag, 0) + 1
    ranked = sortDictByValue(tag_counts)
    total = sum(ranked.values())
    proportions = [(tag, round(count / total, 3)) for tag, count in ranked.items()]
    return proportions[:numPOS]

In [None]:
def getSentimentPolarity(text_arr, sorted = False):
    """Return the TextBlob sentiment polarity of every text in `text_arr`.

    Parameters:
        text_arr: sequence of text strings.
        sorted: when True, the polarities are returned in ascending order
            instead of input order.

    Returns:
        A list of polarity values, one per text.
    """
    polarity_arr = [textblob.TextBlob(text).sentiment.polarity for text in text_arr]
    if sorted:
        # The `sorted` parameter shadows the built-in of the same name, so the
        # original call `sorted(polarity_arr)` tried to call a bool and raised
        # TypeError; list.sort() avoids the clash.
        polarity_arr.sort()
    return polarity_arr

In [None]:
def getSentimentSubjectivity(text_arr, sorted = False):
    """Return the TextBlob sentiment subjectivity of every text in `text_arr`.

    Parameters:
        text_arr: sequence of text strings.
        sorted: when True, the values are returned in ascending order instead
            of input order.

    Returns:
        A list of subjectivity values, one per text.
    """
    subjectivity_arr = [textblob.TextBlob(text).sentiment.subjectivity for text in text_arr]
    if sorted:
        # The `sorted` parameter shadows the built-in of the same name, so the
        # original call `sorted(subjectivity_arr)` tried to call a bool and
        # raised TypeError; list.sort() avoids the clash.
        subjectivity_arr.sort()
    return subjectivity_arr

In [None]:
def getArrPercentile(arr, percentile):
    """Return the element at a given percentile of `arr`.

    The array is sorted and the element at position
    int(percentile * len(arr)) is returned; no interpolation is done.

    Parameters:
        arr: non-empty sequence of comparable values.
        percentile: fraction between 0 and 1 (e.g. 0.95 for the 95th).

    Raises:
        IndexError: if `arr` is empty.
    """
    ordered = sorted(arr)
    # clamp so that percentile == 1.0 selects the largest element instead of
    # indexing one past the end
    position = min(int(percentile * len(ordered)), len(ordered) - 1)
    return ordered[position]

In [None]:
def sentimentAnalysis(text_arr, dataset_name):
    """Summarise the sentiment of a collection of texts as a one-row DataFrame.

    Parameters:
        text_arr: sequence of text strings.
        dataset_name: label stored in the 'Name' column.

    Returns:
        A one-row DataFrame with the mean, standard deviation, 5th and 95th
        percentile of the polarity, and the mean subjectivity.
    """
    column_names = ['Name', 'Avg polarity', 'Std polarity', '5th perc. polarity', '95th perc. polarity', 'Avg subjectivity']
    polarity = getSentimentPolarity(text_arr)
    summary_row = [
        dataset_name,
        np.mean(polarity),
        np.std(polarity),
        getArrPercentile(polarity, 0.05),
        getArrPercentile(polarity, 0.95),
        np.mean(getSentimentSubjectivity(text_arr)),
    ]
    return pd.DataFrame([summary_row], columns = column_names)