# COVID-TWITTER

Here we'll import all our needed modules

In [None]:
import json
import datetime
import numpy as np
import dateutil.parser
import operator
import pytz
import matplotlib.pyplot as plt

def deEmojify(inputString):
    """Return inputString with every non-ASCII character (e.g. emoji) dropped."""
    # Code points 0-127 are exactly the characters the ASCII codec can encode.
    return ''.join(ch for ch in inputString if ord(ch) < 128)

Then we'll load the Twitter data

In [None]:
# Load the tweet dump and index every tweet two ways: by calendar day and by author.
filePath='data/twitter.json'
localTweetList = []
globalTweetCounter = 0
# Maps a day (tweet timestamp with hour/minute/second zeroed) to
# {"count": number of tweets that day, "list": the tweets themselves}.
frequencyMap = {}
# Maps a screen name to the list of that user's tweets.
people = {}
timeFormat = "%a %b %d %H:%M:%S +0000 %Y"

with open(filePath, 'r') as f:
    # Only the first line of the file is read and parsed as JSON.
    tweets = json.loads(f.readline())

for tweet in tweets:
    # Zero the time-of-day fields so that tweets group per day.
    currentTime = dateutil.parser.parse(tweet['created_at']).replace(hour=0, minute=0, second=0)

    globalTweetCounter += 1

    # Fetch (or create) the bucket for this day, then record the tweet in it.
    timeMap = frequencyMap.setdefault(currentTime, {"count": 0, "list": []})
    timeMap["count"] += 1
    timeMap["list"].append(tweet)

    # Same idea for the per-author index.
    people.setdefault(tweet['user']['screen_name'], []).append(tweet)

We will then split each person's words into those used before and those used after the first case of COVID-19 in the United States (2020-01-21)

In [None]:
# Cut-off day: tweets dated strictly before it count as "before", the rest as "after".
utc = pytz.UTC
targetTime = utc.localize(dateutil.parser.parse('2020-01-21'))

# Both map screen name -> {word: number of occurrences}.
wordsBefore = {}
wordsAfter = {}
for person, personTweets in people.items():
    print(person, len(personTweets))
    wordsBefore[person] = {}
    wordsAfter[person] = {}
    for tweet in personTweets:
        # Whitespace-split the text, strip non-ASCII characters, and lowercase.
        tempWords = [deEmojify(i).strip().lower() for i in tweet['text'].split()]

        # Compare on the day only (time-of-day fields zeroed).
        timeCreated = dateutil.parser.parse(tweet['created_at']).replace(hour=0, minute=0, second=0)

        # Pick the tally this tweet contributes to.
        if timeCreated < targetTime:
            wordTally = wordsBefore[person]
        else:
            wordTally = wordsAfter[person]

        for word in tempWords:
            # Tokens that were nothing but emoji/non-ASCII are empty by now.
            if word:
                wordTally[word] = wordTally.get(word, 0) + 1

Then we'll remove the filler words.

These words are useful in language because they make sentences more coherent, but we don't care about that in our dataset

In [None]:
# Drop common filler words and rank each person's remaining words by frequency.
entriesToRemove = ('the', 'and', 'to', 'of', 'a', 'an', 'are', 'in', 'is', 'on')

# Build fresh result dicts instead of aliasing wordsBefore/wordsAfter: the
# per-person word-count dicts stay intact, and the cell can be re-run safely
# (aliasing replaced each count dict with a sorted list, which has no
# two-argument pop() on the next run).
# Both map screen name -> [(word, count), ...] sorted by descending count.
adjustedBefore = {}
adjustedAfter = {}
for person in people:
    for rawCounts, ranked in ((wordsBefore, adjustedBefore), (wordsAfter, adjustedAfter)):
        keptItems = [item for item in rawCounts[person].items() if item[0] not in entriesToRemove]
        # sorted() is stable, so words with equal counts keep first-seen order.
        ranked[person] = sorted(keptItems, key=lambda item: item[1], reverse=True)

    print('\t', adjustedBefore[person][:10])
    print('\t', adjustedAfter[person][:10])

Then we'll plot each one

In [None]:
# One figure per person: the 30 most frequent words before (left) and after
# (right) the first US COVID-19 case.
for person in people:
    # Create both panels up front so every call targets an axes that is
    # actually drawn. A lone plt.subplots() axes would sit underneath the two
    # panels, and a legend on it would have no bars to describe.
    fig, (axBefore, axAfter) = plt.subplots(1, 2)
    fig.set_size_inches(18,6)

    panels = (
        (axBefore, adjustedBefore[person][:30], 'blue', " (Before First Case of Covid: 2020-01-21)"),
        (axAfter, adjustedAfter[person][:30], 'red', " (After First Case of Covid: 2020-01-21)"),
    )
    for panelAx, topWords, barColor, titleSuffix in panels:
        panelAx.set_title("Word Frequency: " + person + titleSuffix)
        panelAx.bar([x[0] for x in topWords], [x[1] for x in topWords], color=barColor, label='Word Frequency')
        # Rotate after plotting so the labels being styled are the word labels.
        plt.setp(panelAx.get_xticklabels(), rotation=45, ha="right")
        panelAx.legend()

    # Lay out once everything is drawn so the rotated labels are accounted for.
    fig.tight_layout()
    plt.show()

Great, now we'll plot frequency of tweets. We'll use this to compare to the number of cases of covid-19 later

In [None]:
# Make sure every day between the first and last tweet has an entry, so days
# without tweets show up with a count of zero.
times = sorted(frequencyMap.keys())
firstTime, lastTime = times[0], times[-1]

# Walk the range one day at a time.
timeIntervalStep = datetime.timedelta(hours=24)
thisTime = firstTime
while thisTime <= lastTime:
    # Existing days are left untouched; missing ones get an empty bucket.
    frequencyMap.setdefault(thisTime, {"count":0, "list":[]})
    thisTime += timeIntervalStep

print ("Processed Tweet Count:", globalTweetCounter)

Now we plot it

In [None]:
# Line chart of the number of tweets per day over the whole collection period.
fig, ax = plt.subplots()
fig.set_size_inches(10,4)
fig.tight_layout()
fig.subplots_adjust(bottom=0.3)

plt.title("Tweet Frequency (First case of COVID-19 at 2020-01-21)")

# Sort the times into an array for future use
sortedTimes = sorted(frequencyMap.keys())

# What time span do these tweets cover?
print ("Time Frame:", sortedTimes[0], sortedTimes[-1])

# Tweets per day (frequencyMap is keyed by day, not by minute)
postFreqList = [frequencyMap[x]["count"] for x in sortedTimes]

# We'll have ticks every 10 days
smallerXTicks = range(0, len(sortedTimes), 10)
plt.xticks(smallerXTicks, [sortedTimes[x].strftime('%Y:%m:%d') for x in smallerXTicks], rotation=45, ha="right")

# Plot the post frequency; counts start at 0 or 1 and only grow, so no clamping is needed.
ax.plot(range(len(sortedTimes)), postFreqList, color="blue", label="Posts")
# Pass the on/off flag positionally: the old `b=` keyword was renamed to
# `visible`, and the positional form is accepted under either name.
ax.grid(True, which='major')
ax.legend()
plt.show()

We'll now import the covid-19 case data

In [None]:
# covid data
# Every field is loaded as a string; the first row (presumably the CSV header)
# is then dropped.
covid = np.loadtxt("data/us-counties.csv", delimiter=",", dtype='str')
covid = np.delete(covid, (0), axis=0)

# Start every day seen in the tweet data at zero cases.
cases = dict.fromkeys(sortedTimes, 0)

# Running total over the rows; each date ends up holding the total as of its
# last row in the file.
# NOTE(review): if column 4 is already a cumulative per-county count, this
# running sum double-counts -- confirm the CSV schema before trusting the
# absolute numbers.
count = 0
for reported in covid:
    # Named reportDay (not `time`) so re-running this cell cannot clobber the
    # `time` module that the training cells call as time.time().
    reportDay = utc.localize(dateutil.parser.parse(reported[0]))
    count += int(reported[4])
    cases[reportDay] = count

# Read the values back in chronological order rather than dict insertion
# order, so dates that are not among the tweet days still land in the right place.
postCaseList = np.array([cases[day] for day in sorted(cases)])

and again we plot it

In [None]:
# Line chart of the reported-case series, using the same tick positions and
# day labels as the tweet-frequency chart.
fig, ax = plt.subplots()
fig.set_size_inches(10,4)
fig.tight_layout()
fig.subplots_adjust(left=0.1, bottom=0.3)

plt.title("Reported Cases of COVID-19 (First case at 2020-01-21)")
plt.xticks(smallerXTicks, [sortedTimes[x].strftime('%Y:%m:%d') for x in smallerXTicks], rotation=45, ha="right")
# One x position per entry in `cases`; only the count matters here, so there
# is no need to build and sort a list of keys.
ax.plot(range(len(cases)), postCaseList, color="red", label='Reported Cases')
# Pass the on/off flag positionally: the old `b=` keyword was renamed to
# `visible`, and the positional form is accepted under either name.
ax.grid(True, which='major')
ax.legend()
plt.show()

Now let's look at hashtags

In [None]:
# Tally how often each hashtag appears across all collected tweets.
# Maps lowercased hashtag text -> number of occurrences.
hashtagCounter = {}

# Walk the day buckets in chronological order and count every hashtag.
for t in sortedTimes:
    for tweet in frequencyMap[t]["list"]:
        for hashtagObj in tweet["entities"]["hashtags"]:
            # Lowercase so that case variants (e.g., #MikeBrown vs. #mikebrown) share a key.
            hashtagString = hashtagObj["text"].lower()
            hashtagCounter[hashtagString] = hashtagCounter.get(hashtagString, 0) + 1

print ("Unique Hashtags:", len(hashtagCounter.keys()))

# Hashtag names ordered from most to least frequent.
sortedHashtags = sorted(hashtagCounter, key=hashtagCounter.get, reverse=True)

then plot the first 20

In [None]:
# Bar chart of the most frequent hashtags.
fig, ax = plt.subplots()
fig.set_size_inches(10,4)
fig.subplots_adjust(bottom=0.3)

# Show at most 20 hashtags, but size the axis from what is actually there so
# the tick positions, labels and bar heights always have matching lengths
# (a fixed range(20) fails when fewer than 20 unique hashtags exist).
topHashtags = sortedHashtags[:20]
barPositions = range(len(topHashtags))

plt.title("Hashtag Count")
plt.xticks(barPositions, topHashtags, rotation=45, ha="right")
ax.bar(barPositions, [hashtagCounter[ht] for ht in topHashtags], color="red", label='Hashtags')
# Pass the on/off flag positionally: the old `b=` keyword was renamed to
# `visible`, and the positional form is accepted under either name.
ax.grid(True, which='major')
ax.legend()
plt.show()

Now we'll make a bag of words as the dataset

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw tweet texts and their author labels, split at the cut-off day.
# NOTE: wordsBefore/wordsAfter are rebound here to lists of tweet texts.
labelsBefore, labelsAfter = [], []
wordsBefore, wordsAfter = [], []
for person, personTweets in people.items():
    for tweet in personTweets:
        timeCreated = dateutil.parser.parse(tweet['created_at']).replace(hour=0, minute=0, second=0)
        # Choose the pair of lists this tweet belongs to.
        if timeCreated < targetTime:
            textList, labelList = wordsBefore, labelsBefore
        else:
            textList, labelList = wordsAfter, labelsAfter
        textList.append(tweet['text'])
        # Labels are the lowercased screen name.
        labelList.append(person.lower())

# One vectorizer per period; min_df=5 is passed to each CountVectorizer.
vectorizerBefore = CountVectorizer(min_df=5)
featuresBefore = vectorizerBefore.fit_transform(wordsBefore)

vectorizerAfter = CountVectorizer(min_df=5)
featuresAfter = vectorizerAfter.fit_transform(wordsAfter)

print(featuresBefore.toarray())
print(featuresAfter.toarray())

We'll then train classifiers on the data!

First we import everything we need

In [None]:
from sklearn.model_selection import train_test_split
import time
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import neighbors
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, auc

Then we'll train on the data before the first case of covid-19

In [None]:
# Fit four classifiers on the pre-COVID tweets and report the fit time and the
# accuracy on a held-out third of the data.
X_train, X_test, y_train, y_test = train_test_split(
    featuresBefore,
    labelsBefore,
    test_size=0.33,
    random_state=42,
)
print("-----BEFORE-----")

# --- decision tree ---
dTree = DecisionTreeClassifier()
start_time = time.time()
dTree.fit(X_train, y_train)
elapsed = time.time() - start_time
print('Time for %s fitting: %.3f' % ('Decision Tree', elapsed))

accuracy = dTree.score(X_test, y_test)
print("decision tree accuracy: %.3f" % accuracy)

# --- k-nearest neighbours ---
k = 20
predictor = neighbors.KNeighborsClassifier(k)
start_time = time.time()
predictor.fit(X_train, y_train)
elapsed = time.time() - start_time
print('Time for %s fitting: %.3f' % ('KNN', elapsed))

accuracy = predictor.score(X_test, y_test)
print("KNN accuracy: %.3f" % accuracy)

# --- random forest ---
predictor = RandomForestClassifier()
start_time = time.time()
predictor.fit(X_train, y_train)
elapsed = time.time() - start_time
print('Time for %s fitting: %.3f' % ('random forest', elapsed))

# Predictions are computed here but not kept; the accuracy comes from score().
predictor.predict(X_test)
accuracy = predictor.score(X_test, y_test)
print("random forest: %.3f" % accuracy)

# --- logistic regression ---
clf = LogisticRegression(solver='lbfgs')
start_time = time.time()
clf.fit(X_train, y_train)
elapsed = time.time() - start_time
print('Time for %s fitting: %.3f' % ('LogisticRegression', elapsed))

y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Log Regression: %.3f' % (accuracy))

Now after first case

In [None]:
# Fit four classifiers on the post-first-case tweets and report the fit time
# and the accuracy on a held-out third of the data.
X_train, X_test, y_train, y_test = train_test_split(
    featuresAfter,
    labelsAfter,
    test_size=0.33,
    random_state=42,
)
print("-----After-----")

# --- decision tree ---
dTree = DecisionTreeClassifier()
start_time = time.time()
dTree.fit(X_train, y_train)
elapsed = time.time() - start_time
print('Time for %s fitting: %.3f' % ('Decision Tree', elapsed))

accuracy = dTree.score(X_test, y_test)
print("decision tree accuracy: %.3f" % accuracy)

# --- k-nearest neighbours ---
k = 20
predictor = neighbors.KNeighborsClassifier(k)
start_time = time.time()
predictor.fit(X_train, y_train)
elapsed = time.time() - start_time
print('Time for %s fitting: %.3f' % ('KNN', elapsed))

accuracy = predictor.score(X_test, y_test)
print("KNN accuracy: %.3f" % accuracy)

# --- random forest ---
predictor = RandomForestClassifier()
start_time = time.time()
predictor.fit(X_train, y_train)
elapsed = time.time() - start_time
print('Time for %s fitting: %.3f' % ('random forest', elapsed))

# Predictions are computed here but not kept; the accuracy comes from score().
predictor.predict(X_test)
accuracy = predictor.score(X_test, y_test)
print("random forest: %.3f" % accuracy)

# --- logistic regression ---
clf = LogisticRegression(solver='lbfgs')
start_time = time.time()
clf.fit(X_train, y_train)
elapsed = time.time() - start_time
print('Time for %s fitting: %.3f' % ('LogisticRegression', elapsed))

y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Log Regression: %.3f' % (accuracy))

Now we'll get data that is not in the dataset and test on that rather than splitting (this data was also retrieved using twitterData.py, but with different dates).

This one is for 2019-08-17 to 2019-08-31

In [None]:
# Load the separate "before" tweet file as parallel lists of text and author.
with open("data/twitterBefore.json", 'r') as f:
    # Only the first line of the file is read and parsed as JSON.
    tweets = json.loads(f.readline())

peopleBeforeText = [tweet['text'] for tweet in tweets]
peopleBeforeNames = [tweet['user']['screen_name'] for tweet in tweets]
print(len(peopleBeforeText))

This one is for 2020-04-01 to 2020-04-15

In [None]:
# Load the separate "after" tweet file as parallel lists of text and author.
with open("data/twitterAfter.json", 'r') as f:
    # Only the first line of the file is read and parsed as JSON.
    tweets = json.loads(f.readline())

peopleAfterText = [tweet['text'] for tweet in tweets]
peopleAfterNames = [tweet['user']['screen_name'] for tweet in tweets]
print(len(peopleAfterText))

Now we test!

In [None]:
# Train on ALL pre-COVID tweets and evaluate on the separately loaded
# "before" tweets instead of a train/test split.
beforeFeatures = vectorizerBefore.transform(peopleBeforeText)
X_train, y_train = featuresBefore, labelsBefore

# The training labels are lowercased screen names, so the true labels must be
# lowercased too; otherwise every author whose screen name contains an
# uppercase letter is counted as a miss.
beforeTrueLabels = [name.lower() for name in peopleBeforeNames]

# decision tree
dTree = DecisionTreeClassifier()
start_time = time.time()
dTree = dTree.fit(X_train, y_train)
print('Time for %s fitting: %.3f' % ('Decision Tree', time.time() - start_time))

accuracy = dTree.score(beforeFeatures, beforeTrueLabels)
print("decision tree accuracy: %.3f" %accuracy)

# knn
k = 20
predictor = neighbors.KNeighborsClassifier(k)
start_time = time.time()
predictor.fit(X_train, y_train)
print('Time for %s fitting: %.3f' % ('KNN', time.time() - start_time))

accuracy = predictor.score(beforeFeatures, beforeTrueLabels)
print("KNN accuracy: %.3f" %accuracy)

# random forest
predictor = RandomForestClassifier()
start_time = time.time()
predictor.fit(X_train, y_train)
print('Time for %s fitting: %.3f' % ('random forest', time.time() - start_time))

# score() predicts internally, so no separate predict() call is needed.
accuracy = predictor.score(beforeFeatures, beforeTrueLabels)
print("random forest: %.3f" % accuracy)

# logistic regression
clf = LogisticRegression(solver='lbfgs')
start_time = time.time()
clf.fit(X_train, y_train)
print('Time for %s fitting: %.3f' % ('LogisticRegression', time.time() - start_time))
y_pred = clf.predict(beforeFeatures)
print("log reg: %.3f" % accuracy_score(beforeTrueLabels, y_pred))

compared to the more recent tweets

In [None]:
# Train on ALL post-first-case tweets and evaluate on the separately loaded
# "after" tweets. The evaluation data is external, so nothing is held out
# here: the models see the full training set, the same way the "before"
# models are trained on all of featuresBefore/labelsBefore.
afterFeatures = vectorizerAfter.transform(peopleAfterText)
X_train, y_train = featuresAfter, labelsAfter

# The training labels are lowercased screen names, so the true labels must be
# lowercased too; otherwise every author whose screen name contains an
# uppercase letter is counted as a miss.
afterTrueLabels = [name.lower() for name in peopleAfterNames]

# decision tree
dTree = DecisionTreeClassifier()
start_time = time.time()
dTree = dTree.fit(X_train, y_train)
print('Time for %s fitting: %.3f' % ('Decision Tree', time.time() - start_time))

accuracy = dTree.score(afterFeatures, afterTrueLabels)
print("decision tree accuracy: %.3f" %accuracy)

# knn
k = 20
predictor = neighbors.KNeighborsClassifier(k)
start_time = time.time()
predictor.fit(X_train, y_train)
print('Time for %s fitting: %.3f' % ('KNN', time.time() - start_time))

accuracy = predictor.score(afterFeatures, afterTrueLabels)
print("KNN accuracy: %.3f" %accuracy)

# random forest
predictor = RandomForestClassifier()
start_time = time.time()
predictor.fit(X_train, y_train)
print('Time for %s fitting: %.3f' % ('random forest', time.time() - start_time))

# score() predicts internally, so no separate predict() call is needed.
accuracy = predictor.score(afterFeatures, afterTrueLabels)
print("random forest: %.3f" % accuracy)

# logistic regression
clf = LogisticRegression(solver='lbfgs')
start_time = time.time()
clf.fit(X_train, y_train)
print('Time for %s fitting: %.3f' % ('LogisticRegression', time.time() - start_time))
y_pred = clf.predict(afterFeatures)
print("log reg: %.3f" % accuracy_score(afterTrueLabels, y_pred))