In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Phrases
from gensim.models import LdaModel
from gensim.corpora import Dictionary

def preprossing_data(data):
    """Tokenize, filter, lemmatize and bigram-augment a collection of raw texts.

    The input container is left untouched; a new list of token lists is built
    and returned.

    Parameters
    ----------
    data : iterable of str
        Raw documents (e.g. tweets). Each item must support ``.lower()``.

    Returns
    -------
    list of list of str
        One token list per input document, in input order. Every token is
        lowercased and lemmatized; purely numeric tokens, one-character
        tokens, English stop words and the custom stop words ``'rt'`` and
        ``'http'`` are removed. Tokens produced by the phrase model that
        contain ``'_'`` are appended to the end of their document.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()
    # A set gives constant-time membership checks; the NLTK stop words and the
    # hand-picked meaningless words are filtered in a single pass.
    excluded = set(stopwords.words('english')) | {'rt', 'http'}

    docs = []
    for text in data:
        # Lowercase, then split into word tokens.
        tokens = tokenizer.tokenize(text.lower())
        # All filters look at the raw token, lemmatization happens afterwards.
        docs.append([
            lemmatizer.lemmatize(token)
            for token in tokens
            if not token.isnumeric() and len(token) > 1 and token not in excluded
        ])

    # Learn bigrams over the cleaned documents (min_count=20) and append the
    # detected ones to each document, keeping the unigrams in place.
    bigram = Phrases(docs, min_count=20)
    for doc in docs:
        # Materialize the additions before extending so the document is never
        # modified while it is being read.
        # NOTE(review): '\w+' also matches underscores, so a unigram that
        # already contains '_' is appended a second time here - confirm that
        # this is acceptable.
        doc.extend([token for token in bigram[doc] if '_' in token])

    return docs


def train_model(corpus, id2word, chunksize, iterations, num_topics, passes, random_state=None):
    """Train and return a gensim ``LdaModel`` on a bag-of-words corpus.

    Parameters
    ----------
    corpus :
        Bag-of-words documents, forwarded as ``corpus``.
    id2word :
        Mapping from token id to token, forwarded as ``id2word``.
    chunksize : int
        Forwarded as ``chunksize``.
    iterations : int
        Forwarded as ``iterations``.
    num_topics : int
        Number of topics to fit.
    passes : int
        Forwarded as ``passes``.
    random_state : optional
        Seed (or random state object) forwarded to ``LdaModel`` so that a
        training run can be reproduced. The default ``None`` keeps the
        previous, unseeded behaviour.

    Returns
    -------
    LdaModel
        The trained model. ``alpha`` and ``eta`` are both set to ``'auto'``
        and ``eval_every`` is ``None``.
    """
    model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=None,
        random_state=random_state,
    )

    # save model
    # model.save('LDA_model_v1.model')

    return model
    

In [2]:
# Set file parameters.
file_path = "tweets.csv"
# Set training parameters.
num_topics = 10
chunksize = 8000
passes = 20
iterations = 400


# Read the raw tweets.
data = pd.read_csv(file_path)
# Hand the preprocessing step an independent Python list instead of the
# DataFrame's own backing array, so the loaded frame can never be modified
# (and a read-only array can never make the step fail).
# NOTE(review): the column label 'tweet ' ends with a space - presumably this
# matches the CSV header; verify against the file.
tweetslist = data['tweet '].tolist()
tweetslist = preprossing_data(tweetslist)

In [None]:
# Build the vocabulary (token <-> id mapping) from the preprocessed tweets.
dictionary = Dictionary(tweetslist)
# Drop tokens that occur in fewer than 20 documents or in more than 60% of them.
dictionary.filter_extremes(no_below=20, no_above=0.6)

# Persist the filtered vocabulary as a text file.
dictionary.save_as_text("my_dictionary.txt")
# Turn every document into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, tweetslist))

# Build the id -> word lookup. The item access below is only there to "load"
# the dictionary before `id2token` is read.
temp = dictionary[0]
id2word = dictionary.id2token


# Fit the topic model with the parameters configured earlier in the notebook.
model = train_model(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
)

# Test

In [14]:
from operator import itemgetter

# Score one preprocessed tweet against the trained model and print the single
# top-weighted word of its highest-scoring topic.
test = tweetslist[51]
bow = dictionary.doc2bow(test)
# Order the returned entries by their second element, largest first.
result = sorted(model.get_document_topics(bow), key=itemgetter(1), reverse=True)
top_topic_id = result[0][0]
print(model.print_topic(top_topic_id, 1))

0.025*"work"


In [13]:
# Show the token list of the document (`test`) that was scored in the cell above.
print(test)

['mrdigitalafrica', 'really', 'need', 'design', 'thinking', 'mindset', 'entrepreneurial', 'journey', 'encourage', 'entrepreneur', 'take', 'part', 'design_thinking']
