In [28]:
from ipynb.fs.full.functions import clean_up, tokenize, stem_and_lemmatize, lemmatize,  remove_stopwords
import pandas as pd
import nltk
import pprint
# Show every row and never truncate cell text when displaying DataFrames.
pd.set_option('display.max_rows', None)
# `None` means "no limit"; the legacy `-1` value is deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)

In [29]:
import gensim
import gensim.corpora as corpora
import re
import spacy
from gensim.utils import simple_preprocess

In [2]:
# Load the Instagram posts dataset from a CSV file in the working directory.
instagram = pd.read_csv("instagram.csv")


## Topic Analysis - experiments

In [None]:
# Inspired by https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

Topic modeling is the process of identifying topics in a set of documents. Latent Dirichlet Allocation (LDA) is a probabilistic method for topic modeling.

In [10]:

# Pull the pre-processed text column out of the DataFrame as a plain Python list
# and preview the first ten entries.
data = instagram["text_processed"].tolist()
data[:10]



[['best',
  'offense',
  'good',
  'defense',
  'healthy',
  'tip',
  'boast',
  'immunity',
  'work',
  'covid',
  'scare',
  'scared',
  'take',
  'care'],
 ['nature',
  'reflects',
  'back',
  'u',
  'imbalance',
  'health',
  'earth',
  'ecosystem',
  'compromised',
  'health',
  'also',
  'compromised',
  'part',
  'nature',
  'sickness',
  'becomes',
  'sickness',
  'health',
  'health',
  'art',
  'mimbirose'],
 ['concerning',
  'current',
  'outbreak',
  'covid',
  'coronavirus',
  'ha',
  'declared',
  'pandemic',
  'must',
  'take',
  'precaution',
  'safety',
  'measure',
  'avoid',
  'spread',
  'even',
  'simple',
  'handshake',
  'cause',
  'transfer',
  'harmful',
  'disease',
  'causing',
  'bacteria',
  'safe',
  'way',
  'greet',
  'others',
  'namaste',
  'customary',
  'respectful',
  'indian',
  'greeting',
  'also',
  'part',
  'atmantan',
  'culture'],
 ['lunch',
  'today',
  'daughter',
  'jacket',
  'potato',
  'tuna',
  'cheese',
  'hea',
  'speedy',
  'salad'

In [18]:
def sent_to_words(sentences):
    """Lazily tokenize documents, yielding one token list per input item.

    Each item is coerced to ``str`` and passed to
    ``gensim.utils.simple_preprocess``. Nothing is tokenized until the
    returned generator is iterated.
    """
    for raw_doc in sentences:
        as_text = str(raw_doc)
        # deacc=True asks simple_preprocess to strip accent marks from tokens.
        tokens = gensim.utils.simple_preprocess(as_text, deacc=True)
        yield tokens

# sent_to_words is a generator; materialize it into a list so the tokenized
# corpus can be iterated more than once (phrase training, stop-word removal).
data_words = list(sent_to_words(data))

# Preview the first tokenized document.
print(data_words[:1])

[['best', 'offense', 'good', 'defense', 'healthy', 'tip', 'boast', 'immunity', 'work', 'covid', 'scare', 'scared', 'take', 'care']]


In [19]:
# Train the phrase detectors. The trigram detector is trained on the
# bigram-transformed corpus rather than on the raw tokens.
bigram = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=100,  # higher threshold -> fewer phrases
)
trigram = gensim.models.Phrases(bigram[data_words])

# Wrap the trained detectors in Phraser objects (the source tutorial describes
# this as the faster way to apply them to a document).
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [13]:
# define stopwords

from nltk.corpus import stopwords
# English stop-word list from NLTK (assumes the NLTK 'stopwords' corpus has
# already been downloaded -- TODO confirm).
stop_words = stopwords.words('english')

In [20]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Tokenize each document and drop tokens found in ``stop_words``.

    NOTE: this definition shadows the ``remove_stopwords`` imported from
    ``functions`` at the top of the notebook.

    Args:
        texts: iterable of documents. Each document is converted with
            ``str()`` before tokenization, so both raw strings and token
            lists are accepted (token lists are re-tokenized from their
            string representation).

    Returns:
        A list with one list of non-stop-word tokens per input document.
    """
    # Build a set once so each membership test is O(1) instead of scanning
    # the whole stop-word list for every token.
    stop_set = frozenset(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phraser to every tokenized document.

    Returns a list with one transformed document per input document.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every tokenized document.

    Returns a list with one transformed document per input document.
    """
    result = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        result.append(trigram_mod[with_bigrams])
    return result

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only tokens with an allowed POS tag.

    Uses the module-level spaCy pipeline ``nlp``, which must be loaded before
    this function is called. See https://spacy.io/api/annotation

    Args:
        texts: iterable of documents, each an iterable of token strings.
        allowed_postags: POS tags (compared against ``token.pos_``) whose
            tokens are kept. Any iterable of tag strings is accepted; the
            default is an immutable tuple rather than a shared mutable list.

    Returns:
        A list with one list of lemma strings per input document.
    """
    allowed = frozenset(allowed_postags)
    # Re-join the tokens so spaCy can tag them in context.
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the documents as a stream instead of one call per doc.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed]
        for doc in nlp.pipe(joined_docs)
    ]

In [21]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Load the small English spaCy pipeline without the parser and NER components
# (for efficiency).
# Install with: python3 -m spacy download en_core_web_sm
# The full package name is used because the 'en' shortcut was removed in spaCy v3.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Preview the first lemmatized document.
print(data_lemmatized[:1])

[['good', 'good', 'defense', 'healthy', 'immunity', 'work', 'scare', 'take', 'care']]


In [22]:
# Vocabulary: maps every unique lemma to an integer id.
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatized documents are the texts the corpus is built from.
texts = data_lemmatized

# Bag-of-words corpus: one list of (token id, count) pairs per document.
corpus = list(map(id2word.doc2bow, texts))

# Inspect the first document's bag-of-words.
print(corpus[:1])

[[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]]


In [23]:
# Human-readable view of the first document: (term, frequency) pairs.
[
    [(id2word[token_id], count) for token_id, count in doc_bow]
    for doc_bow in corpus[:1]
]


[[('care', 1),
  ('defense', 1),
  ('good', 2),
  ('healthy', 1),
  ('immunity', 1),
  ('scare', 1),
  ('take', 1),
  ('work', 1)]]

In [26]:
# Baseline LDA model: 20 topics, 10 passes over the corpus, fixed random_state.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [27]:
# Print the top keywords (with their weights) for the topics of the 20-topic model
lda_model.print_topics()


[(0,
  '0.135*"plan" + 0.127*"meal" + 0.086*"case" + 0.058*"snack" + 0.049*"super" + 0.037*"listen" + 0.034*"throw" + 0.029*"death" + 0.028*"inflammatory" + 0.025*"lack"'),
 (1,
  '0.133*"safe" + 0.084*"rid" + 0.077*"must" + 0.069*"avoid" + 0.044*"practice" + 0.043*"burn" + 0.040*"topic" + 0.034*"young" + 0.034*"heat" + 0.026*"human"'),
 (2,
  '0.147*"mix" + 0.141*"cook" + 0.062*"egg" + 0.041*"roll" + 0.034*"inch" + 0.010*"splash" + 0.002*"lime" + 0.000*"slice" + 0.000*"soup" + 0.000*"sauce"'),
 (3,
  '0.124*"find" + 0.079*"call" + 0.065*"challenge" + 0.043*"stop" + 0.040*"leave" + 0.033*"list" + 0.032*"able" + 0.032*"habit" + 0.032*"client" + 0.029*"daily"'),
 (4,
  '0.084*"painful" + 0.075*"fear" + 0.064*"capable" + 0.050*"battle" + 0.048*"kill" + 0.048*"power" + 0.022*"scary" + 0.000*"woman" + 0.000*"fibroid" + 0.000*"parent"'),
 (5,
  '0.122*"pain" + 0.108*"say" + 0.100*"tell" + 0.071*"sometimes" + 0.051*"heart" + 0.045*"other" + 0.039*"remember" + 0.035*"maybe" + 0.035*"away" + 0.

Topics are still too "dispersed". Will try other parameters to find more "solid" topics.

## Fine tuning

In [None]:
# Second attempt: same settings as the 20-topic baseline, but with only 5 topics.
lda_model2 = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [None]:
# Show the top keywords and weights for each topic of the 5-topic model.
lda_model2.print_topics()


#### With 5 topics:
- body
- disease (fibroid)
- diet advice / what to eat and what not
- positive / action verbs e.g. make, go, etc
- workout / program 

In [None]:
# Tuning: same 5-topic setup, but with 50 training passes instead of 10.
# NOTE: this rebinds `lda_model`, replacing the 20-topic model built earlier.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=50,
    alpha='auto',
    per_word_topics=True,
)

In [None]:
# Show the topics of the 5-topic model trained with 50 passes.
lda_model.print_topics()

#### 5 topics
- body, help with body issues? food and exercise
- disease
- recipes
- action verbs
- workout program

In [None]:
# keeping nouns only

from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Tokenize a string and return its nouns joined by single spaces.

    A token counts as a noun when the first two characters of its
    ``pos_tag`` tag are 'NN'.
    '''
    tagged = pos_tag(word_tokenize(text))
    kept = []
    for word, tag in tagged:
        if tag[:2] == 'NN':
            kept.append(word)
    return ' '.join(kept)
            

In [None]:
# Coerce every raw post to a string, then keep only its nouns (one
# space-joined string of nouns per post).
data_nouns = instagram["post"].apply(str).apply(nouns)
# Preview the first ten results.
data_nouns[:10]

In [None]:
# Remove Stop Words (input here is the noun-only strings, one per post)
data_words_nostops = remove_stopwords(data_nouns)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Load the small English spaCy pipeline without the parser and NER components
# (for efficiency).
# Install with: python3 -m spacy download en_core_web_sm
# The full package name is used because the 'en' shortcut was removed in spaCy v3.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv (the function's defaults)
data_lemmatized = lemmatization(data_words_bigrams)

# Preview the first lemmatized document.
print(data_lemmatized[:1])

In [None]:
# Rebuild the vocabulary from the noun-only lemmatized documents.
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatized documents are the texts the corpus is built from.
texts = data_lemmatized

# Bag-of-words corpus: one list of (token id, count) pairs per document.
corpus = list(map(id2word.doc2bow, texts))

# Inspect the first document's bag-of-words.
print(corpus[:1])

In [None]:
# Human-readable view of the first document: (term, frequency) pairs.
[
    [(id2word[token_id], count) for token_id, count in doc_bow]
    for doc_bow in corpus[:1]
]

In [None]:


# 5 topics, 50 passes, trained on the corpus rebuilt from the noun-only text.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=50,
    alpha='auto',
    per_word_topics=True,
)

In [None]:
# Show the topics of the 5-topic model trained on the noun-only corpus.
lda_model.print_topics()


#### Topics if only accounting for nouns (above)
- food, female topics (period, estrogen)
- woman, leadership
- recipe?
- disease, medical procedures
- symptom/pain


In [None]:
# Same 5-topic, 50-pass setup, with chunksize reduced from 100 to 50.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    random_state=100,
    update_every=1,
    chunksize=50,
    passes=50,
    alpha='auto',
    per_word_topics=True,
)

In [None]:
# Show the topics of the 5-topic model trained with chunksize=50.
lda_model.print_topics()


### Topics when chunksize is reduced (above)
- procedure, treatment
- woman, life
- fruit, plan
- disease, symptom
- day, food


In [None]:
# 4 topics, passes 50 (note: the cell below still uses chunksize=50, not 100)

In [None]:
# 4 topics, passes=50, chunksize=50.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    random_state=100,
    update_every=1,
    chunksize=50,
    passes=50,
    alpha='auto',
    per_word_topics=True,
)

In [None]:
# Show the topics of the 4-topic model.
lda_model.print_topics()



### with 4 topics
- woman, food, weight, ageing
- day, year, goal - probably resolutions
- ?
- disease, procedure

In [31]:
# 3 topics, passes=50, chunksize=50.
lda_model3 = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=3,
    random_state=100,
    update_every=1,
    chunksize=50,
    passes=50,
    alpha='auto',
    per_word_topics=True,
)

In [32]:
# Show the top keywords and weights for each topic of the 3-topic model.
lda_model3.print_topics()


[(0,
  '0.015*"make" + 0.013*"time" + 0.012*"day" + 0.012*"get" + 0.011*"want" + 0.010*"go" + 0.010*"know" + 0.010*"healthy" + 0.010*"take" + 0.008*"feel"'),
 (1,
  '0.044*"fibroid" + 0.027*"woman" + 0.017*"patient" + 0.016*"symptom" + 0.012*"procedure" + 0.012*"center" + 0.011*"ufe" + 0.010*"surgery" + 0.010*"cause" + 0.009*"adenomyosis"'),
 (2,
  '0.015*"water" + 0.012*"add" + 0.010*"breakfast" + 0.010*"use" + 0.009*"food" + 0.009*"drink" + 0.008*"eat" + 0.008*"oil" + 0.008*"high" + 0.008*"protein"')]

### 3 topics (above)

- diet/food: water, recipe, protein
- resolution: the healthy life, decisions, woman
- diseases: fibroid, woman, procedure, treatment, pain, opinion



### Conclusion
After experimenting with different parameters, 3 topics seem to give us a solid idea of the different topics found in the Instagram posts.
This might need to be changed / further "tuned" if more data becomes available.


In [33]:
# Sanity check: display the type of the 3-topic model object.
type(lda_model3)

gensim.models.ldamodel.LdaModel