In [None]:
import json
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.tokenize import TweetTokenizer
from tqdm import tqdm

# Apply the "fivethirtyeight" matplotlib style sheet to every figure below.
plt.style.use("fivethirtyeight")
# IPython magic: render figures inline in the cell output.
%matplotlib inline

In [None]:
# Load the data
twitter_data_dir = "../data/authorship_corpora/influencer_tweets.json"

# Use a context manager so the file handle is closed deterministically, and
# pin the encoding so the tweets decode the same way on every platform
# instead of depending on the locale default.
with open(twitter_data_dir, encoding="utf-8") as json_file:
    twitter_data = json.load(json_file)

## Dataset Details

In [None]:
# Corpus size
print("Number of Tweets: {}".format(len(twitter_data)))
print()

# Tally the number of tweets per user (user id -> tweet count)
count_ = {}
for item in twitter_data:
    user = item['user']
    count_[user] = count_.get(user, 0) + 1

print("Number of Influencers: {}".format(len(count_)))
print()

print("Top Influencers")
print("---------------")
print()

# Ten users with the most tweets, highest count first
ranked_users = sorted(count_.items(), key=lambda pair: pair[1], reverse=True)
for influencer, count in ranked_users[:10]:
    print("Influencer Twitter ID: {}".format(influencer))
    print("# of tweets: {}".format(count))
    print()

In [None]:
# Distribution of the length of the tweets
char_len = []  # characters per (lower-cased) tweet
word_len = []  # non-empty tokens per tweet after stripping punctuation
tknzr = TweetTokenizer()

# Build the punctuation lookup once; a set gives O(1) membership tests
# instead of rebuilding a list for every character.
punctuation = set(string.punctuation)

# takes 5 mins to run
for tweet in tqdm(twitter_data):
    lowered = tweet['text'].lower()
    tokens = tknzr.tokenize(lowered)
    # Strip punctuation characters from each token ...
    stripped = (''.join(c for c in tok if c not in punctuation) for tok in tokens)
    # ... and drop tokens that were pure punctuation (now empty strings),
    # so they are not counted as words.
    words = [w for w in stripped if w]
    char_len.append(len(lowered))
    word_len.append(len(words))

In [None]:
# Convert both length collections to NumPy arrays so the cells below can use
# vectorised comparisons and boolean masks on them.
char_len, word_len = np.array(char_len), np.array(word_len)

In [None]:
# Character Length: summary statistics followed by a histogram
print("Mean: {}".format(np.mean(char_len)))
print("SD: {}".format(np.std(char_len)))
print("Min: {}".format(np.min(char_len)))
print("25%: {}".format(np.percentile(char_len, 25)))
print("50%: {}".format(np.percentile(char_len, 50)))
print("75%: {}".format(np.percentile(char_len, 75)))
print("Max: {}".format(np.max(char_len)))

# np.arange excludes its stop value, so extend the stop by one bin width;
# otherwise the last bin edge falls below the maximum and the longest tweets
# are silently left out of the histogram.
char_bin_width = 20
char_bins = np.arange(char_len.min(), char_len.max() + char_bin_width, char_bin_width)
plt.hist(char_len, bins=char_bins)
print("Prop. below 140 chars: {}".format(np.mean(char_len<=140)))
plt.title("Character Length in dataset")

About 3% of the tweets seem to exceed the 140-character limit.

In [None]:
# Histogram restricted to tweets of at most 140 characters, in 10-character
# bins. The title states "Length" and "<=" so it matches the mask applied.
limit_bins = np.arange(0, 141, 10)
plt.hist(char_len[char_len<=140], bins=limit_bins)
plt.title("Character Length in Tweets <= 140 chars")
# Assigning the return value keeps the tick objects out of the cell output.
tix = plt.xticks(limit_bins)

In [None]:
# Word Length: summary statistics followed by a histogram
print("Mean: {}".format(np.mean(word_len)))
print("SD: {}".format(np.std(word_len)))
print("Min: {}".format(np.min(word_len)))
print("25%: {}".format(np.percentile(word_len, 25)))
print("50%: {}".format(np.percentile(word_len, 50)))
print("75%: {}".format(np.percentile(word_len, 75)))
print("Max: {}".format(np.max(word_len)))

# np.arange excludes its stop value, so extend the stop by one bin width to
# make sure the tweets with the most words still land in a bin.
word_bin_width = 5
word_bins = np.arange(word_len.min(), word_len.max() + word_bin_width, word_bin_width)
# Plot the word counts (not the character counts) and label them as such.
plt.hist(word_len, bins=word_bins)
plt.title("Word Length in dataset")
# Assigning the return value keeps the tick objects out of the cell output.
tix = plt.xticks(word_bins)

In [None]:
# How many tweets contain a link or other username?
text = []
author = []

# Raw strings keep regex escapes such as \w and \. from being treated as
# (invalid) Python string escapes.
twitter_username_regex = r'@([A-Za-z0-9_]+)'
link_regex = r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'


for t in tqdm(twitter_data):
    text.append(t['text'])
    author.append(t['user'])

data = pd.DataFrame.from_dict({"author":author, "text":text})

# Each flag is 1 when the pattern IS found in the tweet and 0 otherwise
# (re.search returns None when there is no match).
is_response = lambda x: int(re.search(twitter_username_regex, x) is not None)
has_link = lambda x: int(re.search(link_regex, x) is not None)
data['char_len'] = data['text'].str.len()
data['is_response'] = data['text'].apply(is_response)
data['has_link'] = data['text'].apply(has_link)

data.head()

In [None]:
# The flag columns hold 0/1 values, so each column mean is the share of
# tweets with that flag set.
link_share = data['has_link'].mean()
response_share = data['is_response'].mean()
print("Proportion of Tweets that contain a link: {}".format(link_share))
print("Proportion of Tweets that are a response: {}".format(response_share))

In [None]:
# Count how often each individual character occurs across all tweet texts
symbols = {}

for tweet_text in tqdm(data['text']):
    for ch in tweet_text:
        symbols[ch] = symbols.get(ch, 0) + 1

In [None]:
# Distinct characters seen in the corpus, plus a small sample of them
unique_symbols = set(symbols)
print("Number of unique characters: {}".format(len(unique_symbols)))
list(unique_symbols)[:10] # emoticons ...

In [None]:
# most popular characters
plt.figure(figsize=(10, 5))
# (character, count) pairs, most frequent first, capped at 100 entries
top_100 = sorted(symbols.items(), key=lambda pair: pair[1], reverse=True)[:100]
# Size the x positions from the data rather than a hard-coded 100, so the
# chart still works when the corpus has fewer than 100 distinct characters.
positions = np.arange(len(top_100))
plt.bar(positions, width=1, height=[count for ch, count in top_100])
# Assigning the return value keeps the tick objects out of the cell output.
tix = plt.xticks(positions, [ch for ch, count in top_100], rotation=30)
plt.ylabel("Occurrences")
plt.title("Most popular characters")

## Proposed Preprocessing

* Keep only tweets with at most 140 characters.
* Choose a minimum text length below which tweets are too short to work with.
* Decide how to handle responses (tweets that mention another user) and links.
* Take care when building the character vocabulary (perhaps introduce an `<unk>` token for rare or unseen characters).

## Training a character level model to learn character embeddings