## Pre-process training tweets (make token lists by tweet for later use in vector creation)

Filter out stopwords, lowercase every token, and combine the real (non-Russian) and Russian bot tweets into a single list.

In [1]:
import gensim
from nltk.corpus import stopwords
import numpy as np
import scipy as sp
import re
from sklearn.cluster import KMeans
import nltk

# English stopword list from NLTK. A later cell extends this list in place,
# and tokens are lowercased before being compared against it.
stoplist = stopwords.words('english')

In [2]:
#### READ IN THE TWEETS

tweets = []     # original tweets, one per input line (real first, then Russian bot)
tweettoks = []  # list of lists of tokens by tweet, parallel to `tweets`
allTokens = []  # every kept token from BOTH files, in reading order

# Real tweets are read first and Russian bot tweets second, so the ordering of
# `tweets` / `tweettoks` is the same as with two separate loops.
for path in ("real_stemmed.txt", "russian_stemmed.txt"):
    # `with` guarantees the file is closed even if processing a line fails.
    with open(path) as f:
        for line in f:
            line = line.rstrip()
            tweets.append(line)
            line = re.sub(r"(^| )[0-9]+($| )", r" ", line)  # remove digits
            # Lowercase each token and drop stopwords.
            addMe = []
            for token in line.split():
                lowercaseToken = token.lower()
                if lowercaseToken not in stoplist:
                    addMe.append(lowercaseToken)
            allTokens.extend(addMe)
            tweettoks.append(addMe)

In [3]:
def printTweetTokenInfo():
    """Print a sanity-check report about the notebook's tweet/token globals.

    Reads the module-level lists `tweets`, `tweettoks` and `allTokens` and
    prints their sizes plus a few leading examples. Returns nothing.
    """
    def show(value, *headings):
        # One report section: heading line(s), the value, then a blank line.
        for heading in headings:
            print(heading)
        print(value)
        print()

    show(len(tweettoks), "Length of tweettoks")
    show(len(tweettoks) == len(tweets),
         "Should be same as the number of tweets", "Is same?")
    show(tweets[:5], "EXAMPLE: First five tweets")
    show(tweettoks[:5], "EXAMPLE: First five tweets' token lists")
    show(len(allTokens), "Total number of tokens")
    show(allTokens[:10], "EXAMPLE: First 10 tokens (of all tweets)")

In [4]:
#printTweetTokenInfo()  # Sanity check

In [5]:
# Remove any stopwords from most common 100 that seem superfluous (in addition to stopwords compiled from PS3)

# Execute the commented out code to see the most common 100 tokens that we used to select additional stopwords
#fdist = nltk.FreqDist(allTokens)
#print(fdist.most_common(100))

PS3StopWords = ["ever", "one", "do","does","make", "go", "us", "to", "get", "about", "may", "s", ".", ",", "!", "i", "I", '\"', "?", ";", "--", "--", "would", "could", "”", "Mr.", "Miss", "Mrs.", "don’t", "said", "can't", "didn't", "aren't", "I'm", "you're", "they're", "'s"]
personalAdditions = ["it", "i'm", "|", "–", "-", "~", "you'r", "thing", '"', "…", '…"', ""]

# Append only words that are not already present, so that re-running this cell
# does not keep growing `stoplist` with duplicates (every `in stoplist` test is
# linear in the list's length). Which words are filtered is unchanged.
# NOTE(review): tokens are lowercased before the stoplist check, so capitalised
# entries such as "I", "Mr.", "Miss", "Mrs." and "I'm" can never match --
# confirm whether their lowercase forms were intended.
for extraWord in PS3StopWords + personalAdditions:
    if extraWord not in stoplist:
        stoplist.append(extraWord)

In [6]:
## RECALCULATE THE TOKEN LISTS WITH THE EXTENDED STOPLIST APPLIED

tweettoks = []  # list of lists of tokens by tweet, rebuilt from scratch

# Files are read in the same order as when `tweets` was built (real first,
# then Russian bot), so tweettoks[i] still lines up with tweets[i].
for path in ("real_stemmed.txt", "russian_stemmed.txt"):
    # `with` guarantees the file is closed even if processing a line fails.
    with open(path) as f:
        for line in f:
            line = line.rstrip()
            line = re.sub(r"(^| )[0-9]+($| )", r" ", line)  # remove digits
            addMe = [token.lower() for token in line.split() if token.lower() not in stoplist]
            tweettoks.append(addMe)



In [7]:
#printTweetTokenInfo()  # Sanity check

## Identify tweet topics for training data (using tweet token lists created above)


Identify topics in the tweets using word2vec word embeddings.

In [8]:
#### Load the pre-trained word2vec vectors used to make tweet vectors in semantic space
# The vectors are read from a binary word2vec-format file; the path is relative
# to the notebook's working directory.

bigmodel = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300-SLIM.bin", binary=True)

#### OPTIONAL: Read in an array of new sentences to additionally train the model on those
####            this is in the case of obtaining a corpus of tweet / slang language that does not overlap with training data
# NOTE(review): the commented-out lines below refer to `model` and
# `new_sentences`, neither of which is defined in the cells visible here (the
# loaded object is named `bigmodel`) -- adapt before uncommenting.
#model.build_vocab(new_sentences, update=True)
#model.train(new_sentences)

In [9]:
#### Using the word2vec model loaded above, look up the vector for each token of a tweet
#### and sum them into a single tweet vector. Store that vector for k-means clustering.

# Take the dimensionality from the model instead of hard-coding it
# (the original code assumed 300).
vectorSize = bigmodel.vector_size

tweetvectors = []   # one vector per tweet, parallel to `tweettoks`

for tweetTokens in tweettoks:
    # A tweet with no in-vocabulary tokens keeps the all-zero vector.
    totvec = np.zeros(vectorSize)
    for tok in tweetTokens:
        # Tokens were already lowercased when `tweettoks` was built;
        # tokens missing from the model's vocabulary are skipped.
        if tok in bigmodel:
            totvec = totvec + bigmodel[tok]
    tweetvectors.append(totvec)

#print(len(tweetvectors))      # Check that the number of vectors and tweets are the same
#print(len(tweets))            #     this number should be the same as the last
#print(len(tweetvectors[10]))  # Check that this equals the model's vector size

In [12]:
#### Use K-means clustering to sort our training tweets into num_topics topics
num_topics = 50  # number of clusters (topics) requested from K-means

# random_state is pinned so the clustering uses the same seed on every run.
kmtweets = KMeans(n_clusters=num_topics, random_state=0)
# Fit on the tweet vectors and keep the cluster label assigned to each one;
# tweetclusters is parallel to tweetvectors.
tweetclusters = kmtweets.fit_predict(tweetvectors)

In [2]:
import pickle

# Persist the fitted k-means model so it can be reloaded later instead of refit.
# NOTE(review): the recorded output of this cell is a NameError because it was
# executed before `kmtweets` existed; run the clustering cell first.
filename = 'kmTweetsModel.sav'
with open(filename, 'wb') as modelFile:  # closes (and flushes) the file when done
    pickle.dump(kmtweets, modelFile)
 
# some time later...


NameError: name 'kmtweets' is not defined

In [None]:
import pickle

# load the model from disk
# NOTE: unpickling can execute arbitrary code -- only load a file that this
# notebook (or another trusted source) wrote.
with open('kmTweetsModel.sav', 'rb') as modelFile:  # file is closed after loading
    loaded_kmTweetsModel = pickle.load(modelFile)

In [13]:
##### Print out all the tweets that belong to one of the clusters. (UNCOMMENT CODE TO PRINT)

#desired_topic_cluster = 30
#
#for i in range(len(tweetclusters)):
#    if tweetclusters[i] == desired_topic_cluster:
#        print(tweets[i])

# Method to sort incoming tweets into the nearest cluster

This method will be used to sort test tweets into their nearest cluster; that cluster label (one of num_topics possible values) will be included in our classifier as a feature.
This method takes in an array of tweets and returns the predicted cluster for each one (based upon the model above).

(Our training data will use the cluster assigned to it -- contained in the list tweetclusters above -- as a feature in the final classifier.)

In [27]:
def assignClusters(tweets):
    """Assign each tweet to its nearest k-means topic cluster.

    Each tweet is stripped of standalone numbers, split on whitespace,
    lowercased and filtered against `stoplist`. The word2vec vectors of the
    remaining tokens are summed into one tweet vector, which is passed to the
    reloaded k-means model (`loaded_kmTweetsModel`).

    Parameters
    ----------
    tweets : iterable of str
        Raw tweet texts.

    Returns
    -------
    numpy.ndarray
        One cluster label per input tweet, in input order. An empty array is
        returned when no tweets are given.
    """
    # Materialise once so generators work and emptiness can be tested safely.
    tweets = list(tweets)
    if not tweets:
        # The model's predict() rejects an empty batch, so short-circuit.
        return np.empty(0, dtype=int)

    # Take the dimensionality from the model instead of hard-coding 300.
    vectorSize = bigmodel.vector_size

    vectors = []   # one vector per tweet, in input order
    for tweet in tweets:
        ##### Separate the tweet into tokens
        line = tweet.rstrip()
        line = re.sub(r"(^| )[0-9]+($| )", r" ", line)  # remove digits
        toks = [token.lower() for token in line.split() if token.lower() not in stoplist]

        ##### Sum the token vectors; out-of-vocabulary tokens are skipped, so
        ##### a tweet with no known tokens keeps the all-zero vector
        totvec = np.zeros(vectorSize)
        for tok in toks:
            if tok in bigmodel:
                totvec = totvec + bigmodel[tok]
        vectors.append(totvec)

    #### Return the predicted topic for every tweet
    return loaded_kmTweetsModel.predict(vectors)

In [29]:
# Sanity check: re-assigning the first 50 training tweets with assignClusters
# is compared element-wise against the labels stored in tweetclusters.
print("Let's check, does it correctly classify the first 50 training tweets?")        # sanity-check banner
print(assignClusters(tweets[0:50])==tweetclusters[0:50])

Let's check, does it correctly classify the first 50 training tweets?
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True]
