In [None]:
import sys
sys.path

In [None]:
import re

# Data Manipulation
import numpy as np
import pandas as pd


# Gensim model creation tools
!pip install gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel


# spacy for english lemmatization
import spacy


# Prepare Stopwords
!pip install nltk
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'timestamp', 'com', 'http', 'www'])


# Plotting
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
import networkx as nx


# MySQL & JSON toolkits
import pymysql
import json


# Logging and Debug
import logging
from pprint import pprint


In [5]:
# Import our data from MySQL server
# Credentials are read from the environment (TWITTER_DB_HOST, TWITTER_DB_USER,
# TWITTER_DB_PASSWORD) so that no secret is stored in the notebook; when the
# password variable is unset it is prompted for interactively.
import os
from getpass import getpass

db_password = os.environ.get('TWITTER_DB_PASSWORD')
if db_password is None:
    db_password = getpass('MySQL password: ')

connection = pymysql.connect(
    host=os.environ.get('TWITTER_DB_HOST', '127.0.0.1'),
    user=os.environ.get('TWITTER_DB_USER', 'root'),
    password=db_password,
    db='Twitter',
)

try:
    Tweets_df = pd.read_sql('SELECT * FROM Tweets LIMIT 1000', con=connection)

    # Columns are addressed by position below:
    #   0 = random IDs, 1 = tweet text content, 2 = timestamps, 3 = user IDs
    UserIdList = Tweets_df.iloc[:, 3].tolist()
    # Comma-separated rendering of the IDs; no longer used to build SQL, kept
    # so the name stays defined.
    UserIdString = str(UserIdList)[1:-1]

    # De-duplicate (order preserved) and let the driver escape each value
    # rather than splicing the IDs into the SQL text.
    unique_user_ids = list(dict.fromkeys(UserIdList))
    if unique_user_ids:
        placeholders = ', '.join(['%s'] * len(unique_user_ids))
        Users_df = pd.read_sql(
            'SELECT * FROM Users WHERE Id IN (' + placeholders + ')',
            con=connection,
            params=unique_user_ids,
        )
    else:
        # `IN ()` is not valid SQL; select zero rows so Users_df still has
        # the table's columns.
        Users_df = pd.read_sql('SELECT * FROM Users WHERE 1 = 0', con=connection)
finally:
    # Release the server connection even if one of the queries fails.
    connection.close()


data = Tweets_df.iloc[:, 1].tolist()





You are using pip version 19.0.2, however version 19.0.3 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.
  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""


OperationalError: (2003, "Can't connect to MySQL server on '127.0.0.1' ([WinError 10061] No connection could be made because the target machine actively refused it)")

# Natural Language Preparation

Our intention is to break the natural language of our data into a format that can be easily interpreted by our model.
We will begin by calling a method that uses gensim to tokenize our data. This process lowercases, tokenizes, and de-accents the language. The resulting output is a list of tokens, stored as unicode strings, for each document.

The second step will be removing what are called stop words. These are very common words in the English language that carry little meaning on their own, so our model can't do anything useful with them. They include words such as "the", "a", "in", "so", etc. We will use the Natural Language Toolkit (nltk) to achieve this.

The third step in our preparation is the development of bigram and trigram models. In linguistics, an n-gram is a contiguous sequence of n items (characters, syllables, or even full words) taken from a text. In our case, we will use the gensim toolkit to identify groupings of words that are often found together. Remember, we have already removed stop words from our data. That means a phrase such as "the new horror movie" will break down into "new horror movie" and "horror movie" as trigrams and bigrams.

Our final step before having fully prepared our data is lemmatization. In our context, lemmatization means reducing each word to its base (dictionary) form, using its context in the sentence to pick the right one. For example, the forms "sleeping", "slept", and "sleeps" all reduce to the lemma "sleep". We will also remove words that aren't nouns, adjectives, verbs, or adverbs.



In [None]:
#Tokenize document in GenSim modifiable tokens.
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` with ``simple_preprocess``.

    Every item is cast to ``str`` and passed to gensim's ``simple_preprocess``
    with ``deacc=True`` (the de-accenting option); one token list is yielded
    per input item, in input order.
    """
    tokenized = (simple_preprocess(str(text), deacc=True) for text in sentences)
    yield from tokenized


# Materialize the token lists once; they are reused to fit the phrase models
# below and as the input to stop-word removal later on.
data_words = list(sent_to_words(data))



# Creating Bigram and Trigram Models
# The bigram model is fitted on the token lists; the trigram model is fitted
# on the bigram-transformed token lists (with its min_count left at the
# library default).
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
# (wrap each fitted model in a Phraser, which is what make_bigrams /
# make_trigrams index into).
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Remove Stopwords, Make Bigrams and Lemmatize
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document and drop every token found in ``stop_words``.

    Args:
        texts: iterable of documents; each one is cast to ``str`` and run
            through ``simple_preprocess`` before filtering.

    Returns:
        A list with one list of surviving tokens per input document.
    """
    # Build the lookup set once per call: testing membership against the
    # module-level ``stop_words`` list would scan it for every single token.
    blocked = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]

def make_bigrams(texts):
    """Run every document in ``texts`` through the module-level ``bigram_mod``.

    Returns a new list holding one transformed document per input document.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Run every document through ``bigram_mod`` and then ``trigram_mod``.

    Returns a new list holding one transformed document per input document.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    https://spacy.io/api/annotation

    Relies on the module-level spaCy pipeline ``nlp``, which must be loaded
    before this function is called.

    Args:
        texts: iterable of token lists; each list is joined with single
            spaces and parsed as one spaCy document.
        allowed_postags: collection of ``token.pos_`` values to keep. The
            default is an immutable tuple so it cannot be altered between
            calls; lists passed by callers work as before.

    Returns:
        A list with one list of lemmas (``token.lemma_``) per input document.
    """
    keep = frozenset(allowed_postags)
    # Documents may be required to be cast to Unicode.
    joined = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the documents as a stream, in order, instead of one
    # nlp() call per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in keep]
        for doc in nlp.pipe(joined)
    ]

#Methods are defined, let's put our data through.
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize the English spaCy model with the parser and NER components
# disabled (for efficiency).
# Install the model with: python3 -m spacy download en_core_web_sm
try:
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
except OSError:
    # Fall back to the legacy 'en' shortcut link for older spaCy installs
    # where the model is only reachable under that name.
    nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Building the LDA Model

We will now call on the gensim library once again to build our LDA topic model. LDA, or Latent Dirichlet Allocation, is used in natural language processing to identify topics based on the probability of words. It is a generative statistical model that is trained on a corpus. In our case, the corpus is a prepared selection of tweets.

In [None]:
# Create the Dictionary and Corpus needed for Topic Modeling
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Create corpus bag of words
corpus = [id2word.doc2bow(text) for text in texts]


# Human readable format of corpus (term-frequency)
# A bare expression is only displayed when it is the last statement of a cell,
# so the preview of the first document is printed explicitly. The loop
# variable is named token_id to avoid shadowing the builtin `id`.
pprint([[(id2word[token_id], freq) for token_id, freq in cp] for cp in corpus[:1]])

# Building Topic Model (LDA)
# Build LDA model
# random_state is fixed so repeated runs on the same corpus are reproducible.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=50,
                                               random_state=100,
                                               update_every=1,
                                               chunksize=100,
                                               passes=10,
                                               alpha='auto',
                                               per_word_topics=True)


# Save model to disk.
# `datapath` is not imported by any earlier cell, so the call below raised a
# NameError; it is provided by gensim.test.utils.
# NOTE(review): datapath() resolves to a location inside the installed gensim
# package - confirm that is where the model should be stored.
from gensim.test.utils import datapath
lda_model_savefile = datapath("TwitterLdaModel")
lda_model.save(lda_model_savefile)
doc_lda = lda_model[corpus]

# User to Topic Graph Generation

Now that our model has been trained and prepared, there is still some data that we need to preserve in order to continue in the future. Our final steps will be to create a graph of topic-to-user connections. It is here that we will use the user data we imported in the beginning. We will create a node-to-node connection between users and topics, to be saved as a graph for future use and evaluation of our model.

In [None]:
ndarray = lda_model.get_topics()
pprint(ndarray)


# Terms of topic 2 as returned by get_topic_terms.
nameList = lda_model.get_topic_terms(2)
pprint(nameList)


def _dominant_topic(topic_vec):
    """Return the id of the most probable topic, or None if there is none.

    ``topic_vec`` is a sequence of ``(topic_id, probability)`` pairs. Ties go
    to the first pair, and a pair only counts when its probability is
    strictly positive.
    """
    if not topic_vec:
        return None
    best_id, best_prob = max(topic_vec, key=lambda pair: pair[1])
    return best_id if best_prob > 0 else None


# The tweets and the user IDs come from the same rows, so they must pair up
# one-to-one.
if len(UserIdList) != len(data_lemmatized):
    raise ValueError("UserIdList and data_lemmatized must have the same length")

# One (topic_id, user_id) edge per tweet. The list starts empty: the former
# seed edges (0,0), (0,1), (1,1) were not derived from the data and added
# spurious connections to the exported graph.
# NOTE(review): topic ids and user ids end up as nodes of the same graph; if a
# user id can equal a topic id the two would merge into one node - confirm
# the id ranges are disjoint.
user_topic_edges = []

#For each lemmatized tweet:
for user_id, doc in zip(UserIdList, data_lemmatized):
    q_vec = id2word.doc2bow(doc)

    #Fetch the topic of each tweet, in order to pair it with a UserId.
    topic_vec = lda_model.get_document_topics(q_vec)
    topic_id = _dominant_topic(topic_vec)

    # Skip tweets with no reported topic instead of linking the user to a
    # sentinel "-1" topic node.
    if topic_id is None:
        continue

    user_topic_edges.append((topic_id, user_id))



# Visualizing and Saving the Network Graph

Closing out our software will produce two outputs. The first will be a visualization of the user topic networks. This should create a network graph with clear communities of users as they relate to the topics within the corpus. The second is exporting our graph in a way where we can use it later, particularly for the temporal network embedding toolset.

In [None]:


# Build an undirected graph: every user ID is added as a node first, then the
# (topic_id, user_id) pairs collected earlier are added as edges.
G = nx.Graph()
#G.add_nodes_from(data)
G.add_nodes_from(UserIdList)
G.add_edges_from(user_topic_edges)

# Draw the whole graph and display the figure.
nx.draw(G)
plt.show()

# Persist the graph in two formats: an edge list file and node-link data that
# is dumped to JSON below.
nx.write_edgelist(G, "Export.edgelist", data=True)
graphJSONData = nx.readwrite.node_link_data(G)

# The JSON file written below is used to hand the graph to the C++ temporal
# network embedding tool.

with open('data.json', 'w') as outfile:
    json.dump(graphJSONData, outfile)

# Prepare the pyLDAvis view of the trained model and save it to Export.html.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(vis, 'Export.html')

# We made it here, we can go anywhere!
# "Don't put song lyrics in the comments" -Ron, probably.
#wikirelate was here

While the actual model is incredibly large and incredibly hard to visualize meaningfully for a human, an example of the kind of network graph involved is generated below.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Example graph only: draw networkx's built-in karate club graph on a single
# 8x6 inch axes.
fig, ax = plt.subplots(figsize=(8, 6))
g = nx.karate_club_graph()
nx.draw_networkx(g, ax=ax)