## LSA & LDA 

___(With stop words)___

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [2]:
pd.set_option('display.max_colwidth', -1)

_________________________________________________

#### Import pickled stop words from 
`Effects of Stop Words Elimination for Arabic Information Retrieval: A Comparative Study`

In [3]:
stop_words_df = pd.read_pickle('/home/jovyan/capstone-52/topic_modeling_experiments/pickled_stopwords/comp_study_stopwords.p')

In [4]:
stop_words_df.columns = ["stop_words"]

In [5]:
stop_words_df.columns

Index(['stop_words'], dtype='object')

In [6]:
iabuelkhair_stopwords = stop_words_df['stop_words'].tolist()

In [7]:
iabuelkhair_stopwords[:4]

['انها', 'اثناء', 'اجل', 'احدا']

### Access corpus through pickled MongoDB file

In [8]:
cd ../../Pickled_from_mongo/

/home/jovyan/capstone-52/Pickled_from_mongo


In [41]:
df = pd.read_pickle('../Pickled_from_mongo/combined_eg_gulf_200k_sample.p')

In [42]:
df.sample(2)

Unnamed: 0,_id,cleaned_geo,cleaned_name,cleaned_text,class
71581,5a2cd263204c9e0400ced87f,,rehamamr97,انا من رأيي علشان نستفز امريكا بجد نروح ندعم كوريا الشماليه في الصاروخ ال بتهدد بيه امريكا,EG
59375,5a2cb974204c9e0400cea8d1,Egypt,Dohaaaaaa3,يا صباح الخرا,EG


In [43]:
df = df.drop(['_id', 'cleaned_geo', 'cleaned_name'], axis=1)

In [44]:
df = df.drop_duplicates(['cleaned_text'], keep=False)

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 179632 entries, 0 to 95683
Data columns (total 2 columns):
cleaned_text    179632 non-null object
class           179632 non-null object
dtypes: object(2)
memory usage: 4.1+ MB


## Benchmark LSA with stop words

### Label Encode the Categories


In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le = LabelEncoder()
df['class_numerical'] = le.fit_transform(df['class'])

### TFIDF

### Prepare Document Term Matrix


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf_vectorizer = TfidfVectorizer(min_df = 1, stop_words= iabuelkhair_stopwords)

In [None]:
document_term_matrix_sps = tfidf_vectorizer.fit_transform(df.cleaned_text)

In [None]:
document_term_matrix_sps


### Compute SVD of Document Term Matrix

In [None]:
from sklearn.decomposition import TruncatedSVD

In [None]:
n_components = 50
SVD = TruncatedSVD(n_components)
component_names = ["component_"+str(i+1) for i in range(n_components)]

In [None]:
svd_matrix = SVD.fit_transform(document_term_matrix_sps)

In [None]:
SVD.explained_variance_ratio_

In [None]:
svd_matrix.shape

### Load SVD Matrix with Documents and Labels


In [None]:
latent_semantic_analysis = pd.DataFrame(svd_matrix,
                                        index=df.index,
                                        columns=component_names)
latent_semantic_analysis['cleaned_text'] = df.cleaned_text
latent_semantic_analysis['class'] = df['class']

In [None]:
latent_semantic_analysis.head()


In [None]:
vocabulary_loadings = pd.DataFrame(SVD.components_,
                                   index=component_names,
                                   columns=tfidf_vectorizer.get_feature_names()).T

In [None]:
vocabulary_loadings['abs_component_1'] = np.abs(vocabulary_loadings.component_1)
vocabulary_loadings['abs_component_2'] = np.abs(vocabulary_loadings.component_2)

### Display Top Terms for Each Component

In [None]:
vocabulary_loadings.sort_values('abs_component_1',ascending=False).head(10)

In [None]:
vocabulary_loadings.sort_values('abs_component_2',ascending=False).head(10)

In [None]:
plt.figure(figsize=(7,7))
pc_1 = latent_semantic_analysis['component_1'].values
pc_2 = latent_semantic_analysis['component_2'].values

plt.scatter(pc_1, pc_2, c=df['class_numerical'], cmap='rainbow')

plt.xlabel('First PC')
plt.ylabel('Second PC')
plt.axvline(linewidth=0.5)
plt.axhline(linewidth=0.5)
plt.xlim(-.1,1)
plt.ylim(-.5,1)

In [None]:
plt.figure(figsize=(6,6))
pc_1 = latent_semantic_analysis['component_1'].values
pc_2 = latent_semantic_analysis['component_2'].values

strings = df['cleaned_text'].values
for i, (x, y) in enumerate(zip(pc_1, pc_2)): 
    plt.text(x,y,strings[i][:10])

plt.scatter(pc_1, pc_2, c=df['class_numerical'], cmap='rainbow')

plt.xlabel('First PC')
plt.ylabel('Second PC')
plt.axvline(linewidth=0.5)
plt.axhline(linewidth=0.5)
plt.xlim(-.1,1)
plt.ylim(-.1,1)

In [None]:
plt.figure(figsize=(6,6))
pc_1 = latent_semantic_analysis['component_1'].values
pc_2 = latent_semantic_analysis['component_2'].values

plt.scatter(pc_1, pc_2, c=df['class_numerical'], cmap='rainbow')

plt.xlabel('First PC')
plt.ylabel('Second PC')
plt.axvline(linewidth=0.5)
plt.axhline(linewidth=0.5)
plt.xlim(-.01,.5)
plt.ylim(-.3,.6)

In [None]:
eg_mask = latent_semantic_analysis['class'] == 'EG'

In [None]:
latent_semantic_analysis[eg_mask][:5]

In [None]:
gulf_mask = latent_semantic_analysis['class'] == 'GULF'

In [None]:
latent_semantic_analysis[gulf_mask][:5]

In [None]:
latent_semantic_analysis[(latent_semantic_analysis['class'] == 'EG') 
                         & (latent_semantic_analysis.component_2 > .050)][:5]

In [None]:
latent_semantic_analysis[(latent_semantic_analysis['class'] == 'GULF') 
                         & (latent_semantic_analysis.component_2 > .50)][:5]

## GENSIM

In [None]:
#!pip install -U gensim

In [66]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [70]:
import nltk
import gensim
from nltk.text import Text  
from gensim import models, corpora, similarities

In [22]:
df.sample(2)

Unnamed: 0,cleaned_text,class
34836,الحمدلله من قبل ومن بعد بكره عمليه اخرى لي ونقول ي رب عفوك وتوفيقك ، دعواتكم,GULF
36026,مشاء الله الساعه,EG


In [46]:
df["cleaned_text"] = df["cleaned_text"].map(lambda x: x.split(' '))

In [24]:
df["cleaned_text"].sample(2)

23482    [علي, كدا, روان, دي, ظروفها, ايه]                                                                                                                         
34106    [امرنا, الله, بالامتثال, لاوامر, الوالدين, مادام, ليس, بها, معصية, للخالق،, وانا, شخصيا, م, اذكر, اني, خالفت, اهلي, بشي, ومشيت, ع, رأيي, الا, واتسحف, وا…]
Name: cleaned_text, dtype: object

In [25]:
len(df["cleaned_text"])

179632

In [26]:
len(iabuelkhair_stopwords)

1590

In [27]:
iabuelkhair_stopwords[:3]

['انها', 'اثناء', 'اجل']

#### Removing stop words from tokens

In [39]:
df["cleaned_text"] = df["cleaned_text"].map([lambda x: if x not in iabuelkhair_stopwords])

SyntaxError: invalid syntax (<ipython-input-39-d4c4634cd332>, line 1)

In [33]:
len(df["cleaned_text"])

179632

In [40]:
df["cleaned_text"][:3]

0    True
1    True
2    True
Name: cleaned_text, dtype: bool

#### See how many words were removed

In [None]:
def stopword_percentage(text_eda, text_eda_no_stops):
    increase = (len(text_eda_no_stops) - len(text_eda))
    return (increase / len(text_eda)) * 100

In [None]:
stopword_percentage(text_eda, text_eda_no_stops)

#### Pass `df["cleaned_text"]` to gensim's Dictionary:

In [47]:
dictionary = corpora.Dictionary(df["cleaned_text"])

#### Use filter_extremes method to eliminate the 5000 most frequent words

In [48]:
dictionary.filter_extremes(no_below=1, no_above=1, keep_n=5000)

#### Compactify data to make up for any missing labels

In [49]:
dictionary.compactify()

#### Use doc2bow method to get bag of words representation (word_id, frequency) 

In [50]:
corpus = [dictionary.doc2bow(text) for text in df["cleaned_text"]]

In [None]:
#dictionary.doc2bow(dictionary, allow_update=True, return_missing=False)

#### Save corpus in Market Matrix format 

In [51]:
pwd

'/home/jovyan/capstone-52/Pickled_from_mongo'

In [58]:
cd market_matrix_files/

/home/jovyan/capstone-52/market_matrix_files


In [60]:
corpora.MmCorpus.serialize('../market_matrix_files/corpus_test.mm', corpus)

#### Load corpus iterator from Matrix Market file

In [None]:
corpus = corpora.MmCorpus('/tmp/corpus.mm')

#### Explore the transformed corpus

In [62]:
print(corpus[2])

[(1, 1), (25, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1)]


## LDA with stopwords

In [67]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=3, id2word = dictionary)

2017-12-16 05:03:44,476 : INFO : using symmetric alpha at 0.3333333333333333
2017-12-16 05:03:44,477 : INFO : using symmetric eta at 0.3333333333333333
2017-12-16 05:03:44,479 : INFO : using serial LDA version on this node
2017-12-16 05:03:44,713 : INFO : running online (single-pass) LDA training, 3 topics, 1 passes over the supplied corpus of 179632 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
2017-12-16 05:03:44,715 : INFO : PROGRESS: pass 0, at document #2000/179632
2017-12-16 05:03:49,174 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 05:03:49,216 : INFO : topic #0 (0.333): 0.033*"في" + 0.026*"من" + 0.025*"يا" + 0.021*"اللي" + 0.018*"مش" + 0.016*"ما" + 0.014*"كل" + 0.013*"و" + 0.012*"على" + 0.011*"أزاي"
2017-12-16 05:03:49,219 : INFO : topic #1 (0.333): 0.029*"لا" + 0.024*"من" + 0.019*"ايه" + 0.018*"في" + 0.014*"بس" + 0.012*"كده" + 0.

2017-12-16 05:04:15,093 : INFO : topic #2 (0.333): 0.052*"انا" + 0.025*"مش" + 0.017*"في" + 0.017*"من" + 0.015*"بس" + 0.014*"حد" + 0.014*"لا" + 0.013*"،" + 0.011*"دا" + 0.010*"عشان"
2017-12-16 05:04:15,095 : INFO : topic diff=0.361279, rho=0.316228
2017-12-16 05:04:15,097 : INFO : PROGRESS: pass 0, at document #22000/179632
2017-12-16 05:04:17,156 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 05:04:17,198 : INFO : topic #0 (0.333): 0.114*"كدا" + 0.051*"و" + 0.024*"يا" + 0.022*"من" + 0.019*"في" + 0.018*"مش" + 0.017*"اللي" + 0.014*"كل" + 0.013*"ما" + 0.013*"،"
2017-12-16 05:04:17,201 : INFO : topic #1 (0.333): 0.024*"من" + 0.021*"لا" + 0.021*"ف" + 0.018*"ايه" + 0.015*"ولا" + 0.012*"في" + 0.012*"بس" + 0.011*"ع" + 0.010*"غير" + 0.010*"أنا"
2017-12-16 05:04:17,203 : INFO : topic #2 (0.333): 0.053*"انا" + 0.026*"مش" + 0.017*"من" + 0.017*"في" + 0.016*"بس" + 0.015*"دا" + 0.013*"لا" + 0.012*"حد" + 0.012*"عشان" + 0.010*"،"
2017-12-16 05:04:17,205 : INFO 

2017-12-16 05:04:37,275 : INFO : topic diff=0.282918, rho=0.223607
2017-12-16 05:04:37,277 : INFO : PROGRESS: pass 0, at document #42000/179632
2017-12-16 05:04:39,097 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 05:04:39,149 : INFO : topic #0 (0.333): 0.050*"و" + 0.041*"يا" + 0.039*"كدا" + 0.019*"من" + 0.018*"اللي" + 0.016*"كل" + 0.014*"في" + 0.014*"مش" + 0.014*"انت" + 0.013*"والله"
2017-12-16 05:04:39,152 : INFO : topic #1 (0.333): 0.027*"لا" + 0.027*"ف" + 0.026*"ايه" + 0.024*"من" + 0.017*"ولا" + 0.012*"في" + 0.011*""" + 0.011*"الله" + 0.010*"أنا" + 0.010*"دي"
2017-12-16 05:04:39,155 : INFO : topic #2 (0.333): 0.066*"انا" + 0.032*"مش" + 0.020*"حد" + 0.019*"دا" + 0.017*"ف" + 0.017*"بس" + 0.015*"من" + 0.013*"في" + 0.012*"لا" + 0.012*"والله"
2017-12-16 05:04:39,157 : INFO : topic diff=0.215783, rho=0.218218
2017-12-16 05:04:39,159 : INFO : PROGRESS: pass 0, at document #44000/179632
2017-12-16 05:04:41,082 : INFO : merging changes from 2000 do

2017-12-16 05:05:00,347 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 05:05:00,388 : INFO : topic #0 (0.333): 0.055*"و" + 0.049*"يا" + 0.021*"اللي" + 0.020*"في" + 0.020*"من" + 0.013*"انت" + 0.013*"كل" + 0.013*"بتاع" + 0.012*"ده" + 0.012*"مش"
2017-12-16 05:05:00,391 : INFO : topic #1 (0.333): 0.030*"من" + 0.023*"لا" + 0.023*"فى" + 0.021*"ايه" + 0.019*"أنا" + 0.018*"في" + 0.017*"ولا" + 0.014*"ده" + 0.013*"اللى" + 0.011*"على"
2017-12-16 05:05:00,393 : INFO : topic #2 (0.333): 0.048*"انا" + 0.037*"مش" + 0.018*"بس" + 0.016*"حد" + 0.016*"من" + 0.015*"في" + 0.013*"بتاع" + 0.013*"عشان" + 0.012*"والله" + 0.011*"لا"
2017-12-16 05:05:00,396 : INFO : topic diff=0.169418, rho=0.179605
2017-12-16 05:05:00,398 : INFO : PROGRESS: pass 0, at document #64000/179632
2017-12-16 05:05:02,123 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 05:05:02,162 : INFO : topic #0 (0.333): 0.062*"و" + 0.048*"يا" + 0.022*"اللي" + 0.022*

2017-12-16 05:05:20,200 : INFO : topic #0 (0.333): 0.050*"يا" + 0.042*"و" + 0.022*"اللي" + 0.021*"في" + 0.019*"من" + 0.017*"كل" + 0.014*"انت" + 0.014*"ما" + 0.011*"ربنا" + 0.010*"مش"
2017-12-16 05:05:20,203 : INFO : topic #1 (0.333): 0.036*"من" + 0.026*"لا" + 0.021*"ايه" + 0.020*"في" + 0.019*"فى" + 0.016*"ولا" + 0.015*"الله" + 0.014*"ف" + 0.012*""" + 0.011*"اللى"
2017-12-16 05:05:20,205 : INFO : topic #2 (0.333): 0.047*"انا" + 0.039*"مش" + 0.019*"بس" + 0.017*"من" + 0.016*"حد" + 0.016*"في" + 0.012*"والله" + 0.012*"لما" + 0.011*"لو" + 0.010*"لا"
2017-12-16 05:05:20,207 : INFO : topic diff=0.167897, rho=0.156174
2017-12-16 05:05:20,210 : INFO : PROGRESS: pass 0, at document #84000/179632
2017-12-16 05:05:21,967 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 05:05:22,004 : INFO : topic #0 (0.333): 0.049*"يا" + 0.040*"و" + 0.023*"اللي" + 0.021*"في" + 0.018*"من" + 0.016*"كل" + 0.014*"انت" + 0.013*"ما" + 0.012*"ربنا" + 0.010*"والله"
2017-12-16 05:05:2

2017-12-16 05:05:37,965 : INFO : topic #1 (0.333): 0.046*"من" + 0.027*"لا" + 0.025*"في" + 0.021*"على" + 0.019*"الله" + 0.015*"،" + 0.012*"ولا" + 0.012*"ما" + 0.012*"️" + 0.010*"أن"
2017-12-16 05:05:37,968 : INFO : topic #2 (0.333): 0.036*"انا" + 0.021*"مش" + 0.020*"بس" + 0.018*"من" + 0.018*"في" + 0.011*"والله" + 0.010*"اللي" + 0.009*"لا" + 0.009*"حد" + 0.009*"لو"
2017-12-16 05:05:37,970 : INFO : topic diff=0.179880, rho=0.140028
2017-12-16 05:05:37,972 : INFO : PROGRESS: pass 0, at document #104000/179632
2017-12-16 05:05:39,479 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 05:05:39,513 : INFO : topic #0 (0.333): 0.063*"و" + 0.053*"يا" + 0.022*"في" + 0.019*"من" + 0.018*"اللي" + 0.017*"ما" + 0.016*"كل" + 0.012*"انت" + 0.012*"الله" + 0.008*"يارب"
2017-12-16 05:05:39,516 : INFO : topic #1 (0.333): 0.049*"من" + 0.028*"في" + 0.024*"لا" + 0.023*"على" + 0.018*"الله" + 0.014*"،" + 0.012*"️" + 0.010*"…" + 0.010*"ولا" + 0.010*"ما"
2017-12-16 05:05:39,51

2017-12-16 05:05:54,905 : INFO : topic #2 (0.333): 0.024*"بس" + 0.023*"انا" + 0.018*"من" + 0.014*"في" + 0.012*"لي" + 0.011*"اللي" + 0.010*"اي" + 0.010*"اني" + 0.009*"لو" + 0.009*"ما"
2017-12-16 05:05:54,907 : INFO : topic diff=0.111011, rho=0.128037
2017-12-16 05:05:54,909 : INFO : PROGRESS: pass 0, at document #124000/179632
2017-12-16 05:05:56,261 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 05:05:56,294 : INFO : topic #0 (0.333): 0.066*"و" + 0.026*"يا" + 0.020*"من" + 0.019*"كل" + 0.018*"في" + 0.017*"انت" + 0.017*"الله" + 0.017*"اللي" + 0.015*"ما" + 0.014*"الي"
2017-12-16 05:05:56,297 : INFO : topic #1 (0.333): 0.046*"من" + 0.029*"في" + 0.026*"لا" + 0.022*"على" + 0.021*"الله" + 0.015*"،" + 0.015*"أن" + 0.011*"عن" + 0.011*"ولا" + 0.010*"ما"
2017-12-16 05:05:56,299 : INFO : topic #2 (0.333): 0.023*"انا" + 0.022*"بس" + 0.019*"من" + 0.014*"في" + 0.014*"لي" + 0.011*"اللي" + 0.010*"اني" + 0.010*"لو" + 0.010*"اي" + 0.009*"ما"
2017-12-16 05:05:56,3

2017-12-16 05:06:10,684 : INFO : topic diff=0.091005, rho=0.118678
2017-12-16 05:06:10,686 : INFO : PROGRESS: pass 0, at document #144000/179632
2017-12-16 05:06:12,014 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 05:06:12,045 : INFO : topic #0 (0.333): 0.036*"و" + 0.023*"من" + 0.020*"يا" + 0.019*"اللي" + 0.018*"الله" + 0.017*"انت" + 0.016*"في" + 0.014*"كل" + 0.013*"الي" + 0.012*"ما"
2017-12-16 05:06:12,048 : INFO : topic #1 (0.333): 0.050*"من" + 0.035*"في" + 0.023*"لا" + 0.023*"على" + 0.020*"،" + 0.019*"الله" + 0.010*"هذا" + 0.010*"عن" + 0.010*"ولا" + 0.010*"…"
2017-12-16 05:06:12,050 : INFO : topic #2 (0.333): 0.029*"انا" + 0.021*"بس" + 0.019*"من" + 0.014*"ع" + 0.014*"والله" + 0.013*"لي" + 0.013*"اللي" + 0.012*"لو" + 0.012*"في" + 0.012*"اي"
2017-12-16 05:06:12,052 : INFO : topic diff=0.081464, rho=0.117851
2017-12-16 05:06:12,055 : INFO : PROGRESS: pass 0, at document #146000/179632
2017-12-16 05:06:13,320 : INFO : merging changes from 2000

2017-12-16 05:06:27,464 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 05:06:27,498 : INFO : topic #0 (0.333): 0.043*"يا" + 0.043*"و" + 0.024*"الله" + 0.021*"من" + 0.018*"ما" + 0.014*"اللي" + 0.014*"في" + 0.013*"كل" + 0.011*"انت" + 0.011*"مو"
2017-12-16 05:06:27,501 : INFO : topic #1 (0.333): 0.047*"من" + 0.044*"في" + 0.026*"على" + 0.024*"لا" + 0.019*"الله" + 0.015*"،" + 0.014*"…" + 0.012*"أن" + 0.011*"عن" + 0.010*"ما"
2017-12-16 05:06:27,503 : INFO : topic #2 (0.333): 0.025*"انا" + 0.019*"بس" + 0.018*"من" + 0.012*"صباح" + 0.012*"والله" + 0.012*"ما" + 0.012*"لا" + 0.011*"لي" + 0.011*"في" + 0.011*"اللي"
2017-12-16 05:06:27,506 : INFO : topic diff=0.110466, rho=0.110432
2017-12-16 05:06:27,508 : INFO : PROGRESS: pass 0, at document #166000/179632
2017-12-16 05:06:28,703 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 05:06:28,735 : INFO : topic #0 (0.333): 0.043*"يا" + 0.042*"و" + 0.024*"الله" + 0.021*"من"

In [69]:
print(ldamodel.print_topics(num_topics=3, num_words=2))

2017-12-16 05:09:39,137 : INFO : topic #0 (0.333): 0.055*"و" + 0.032*"يا"
2017-12-16 05:09:39,138 : INFO : topic #1 (0.333): 0.048*"من" + 0.041*"في"
2017-12-16 05:09:39,139 : INFO : topic #2 (0.333): 0.024*"انا" + 0.019*"من"


[(0, '0.055*"و" + 0.032*"يا"'), (1, '0.048*"من" + 0.041*"في"'), (2, '0.024*"انا" + 0.019*"من"')]


For later use

In [None]:
# class MyCorpus(object):
#     def __iter__(self):
#         for line in open('datasets/mycorpus.txt'):
#             # assume there's one document per line, tokens separated by whitespace
#             yield dictionary.doc2bow(line.lower().split())

In [None]:
# add a new doc (tokens) to this dictionary and update it."Same tokens to the same ids and new tokens to new ids".
# dict2 = corpora.Dictionary(moreDocs)
# dict1.merge_with(dict2)

In [None]:
# new_vec = dictionary.doc2bow(text_eda_no_stops)
# print(new_vec)  # the word "interaction" does not appear in the dictionary and is ignored
# [(0, 1), (1, 1)]

In [None]:
# turn our tokenized documents into a id <-> term dictionary
dictionary = Dictionary(text_eda_no_stops)
    
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in text_eda_no_stops]

# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)

In [None]:
#tfidf = models.TfidfModel(df.cleaned_text)

In [None]:
#corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix)