## LSA & LDA 

___(With stop words)___

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [2]:
pd.set_option('display.max_colwidth', -1)

_________________________________________________

#### Import pickled stop words from 
`Effects of Stop Words Elimination for Arabic Information Retrieval: A Comparative Study`

In [3]:
stop_words_df = pd.read_pickle('/home/jovyan/capstone-52/topic_modeling_experiments/pickled_stopwords/comp_study_stopwords.p')

In [4]:
stop_words_df.columns = ["stop_words"]

In [5]:
stop_words_df.columns

Index(['stop_words'], dtype='object')

In [6]:
iabuelkhair_stopwords = stop_words_df['stop_words'].tolist()

In [7]:
iabuelkhair_stopwords[:4]

['انها', 'اثناء', 'اجل', 'احدا']

### Access corpus through pickled MongoDB file

In [8]:
cd ../../Pickled_from_mongo/

/home/jovyan/capstone-52/Pickled_from_mongo


In [41]:
df = pd.read_pickle('../Pickled_from_mongo/combined_eg_gulf_200k_sample.p')

In [42]:
df.sample(2)

Unnamed: 0,_id,cleaned_geo,cleaned_name,cleaned_text,class
71581,5a2cd263204c9e0400ced87f,,rehamamr97,انا من رأيي علشان نستفز امريكا بجد نروح ندعم كوريا الشماليه في الصاروخ ال بتهدد بيه امريكا,EG
59375,5a2cb974204c9e0400cea8d1,Egypt,Dohaaaaaa3,يا صباح الخرا,EG


In [43]:
df = df.drop(['_id', 'cleaned_geo', 'cleaned_name'], axis=1)

In [44]:
df = df.drop_duplicates(['cleaned_text'], keep=False)

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 179632 entries, 0 to 95683
Data columns (total 2 columns):
cleaned_text    179632 non-null object
class           179632 non-null object
dtypes: object(2)
memory usage: 4.1+ MB


## Benchmark LSA with stop words

### Label Encode the Categories


In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le = LabelEncoder()
df['class_numerical'] = le.fit_transform(df['class'])

### TFIDF

### Prepare Document Term Matrix


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf_vectorizer = TfidfVectorizer(min_df = 1, stop_words= iabuelkhair_stopwords)

In [None]:
document_term_matrix_sps = tfidf_vectorizer.fit_transform(df.cleaned_text)

In [None]:
document_term_matrix_sps


### Compute SVD of Document Term Matrix

In [None]:
from sklearn.decomposition import TruncatedSVD

In [None]:
n_components = 50
SVD = TruncatedSVD(n_components)
component_names = ["component_"+str(i+1) for i in range(n_components)]

In [None]:
svd_matrix = SVD.fit_transform(document_term_matrix_sps)

In [None]:
SVD.explained_variance_ratio_

In [None]:
svd_matrix.shape

### Load SVD Matrix with Documents and Labels


In [None]:
latent_semantic_analysis = pd.DataFrame(svd_matrix,
                                        index=df.index,
                                        columns=component_names)
latent_semantic_analysis['cleaned_text'] = df.cleaned_text
latent_semantic_analysis['class'] = df['class']

In [None]:
latent_semantic_analysis.head()


In [None]:
vocabulary_loadings = pd.DataFrame(SVD.components_,
                                   index=component_names,
                                   columns=tfidf_vectorizer.get_feature_names()).T

In [None]:
vocabulary_loadings['abs_component_1'] = np.abs(vocabulary_loadings.component_1)
vocabulary_loadings['abs_component_2'] = np.abs(vocabulary_loadings.component_2)

### Display Top Terms for Each Component

In [None]:
vocabulary_loadings.sort_values('abs_component_1',ascending=False).head(10)

In [None]:
vocabulary_loadings.sort_values('abs_component_2',ascending=False).head(10)

In [None]:
plt.figure(figsize=(7,7))
pc_1 = latent_semantic_analysis['component_1'].values
pc_2 = latent_semantic_analysis['component_2'].values

plt.scatter(pc_1, pc_2, c=df['class_numerical'], cmap='rainbow')

plt.xlabel('First PC')
plt.ylabel('Second PC')
plt.axvline(linewidth=0.5)
plt.axhline(linewidth=0.5)
plt.xlim(-.1,1)
plt.ylim(-.5,1)

In [None]:
plt.figure(figsize=(6,6))
pc_1 = latent_semantic_analysis['component_1'].values
pc_2 = latent_semantic_analysis['component_2'].values

strings = df['cleaned_text'].values
for i, (x, y) in enumerate(zip(pc_1, pc_2)): 
    plt.text(x,y,strings[i][:10])

plt.scatter(pc_1, pc_2, c=df['class_numerical'], cmap='rainbow')

plt.xlabel('First PC')
plt.ylabel('Second PC')
plt.axvline(linewidth=0.5)
plt.axhline(linewidth=0.5)
plt.xlim(-.1,1)
plt.ylim(-.1,1)

In [None]:
plt.figure(figsize=(6,6))
pc_1 = latent_semantic_analysis['component_1'].values
pc_2 = latent_semantic_analysis['component_2'].values

plt.scatter(pc_1, pc_2, c=df['class_numerical'], cmap='rainbow')

plt.xlabel('First PC')
plt.ylabel('Second PC')
plt.axvline(linewidth=0.5)
plt.axhline(linewidth=0.5)
plt.xlim(-.01,.5)
plt.ylim(-.3,.6)

In [None]:
eg_mask = latent_semantic_analysis['class'] == 'EG'

In [None]:
latent_semantic_analysis[eg_mask][:5]

In [None]:
gulf_mask = latent_semantic_analysis['class'] == 'GULF'

In [None]:
latent_semantic_analysis[gulf_mask][:5]

In [None]:
latent_semantic_analysis[(latent_semantic_analysis['class'] == 'EG') 
                         & (latent_semantic_analysis.component_2 > .050)][:5]

In [None]:
latent_semantic_analysis[(latent_semantic_analysis['class'] == 'GULF') 
                         & (latent_semantic_analysis.component_2 > .50)][:5]

## GENSIM

In [None]:
#!pip install -U gensim

In [66]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [70]:
import nltk
import gensim
from nltk.text import Text  
from gensim import models, corpora, similarities

In [22]:
df.sample(2)

Unnamed: 0,cleaned_text,class
34836,الحمدلله من قبل ومن بعد بكره عمليه اخرى لي ونقول ي رب عفوك وتوفيقك ، دعواتكم,GULF
36026,مشاء الله الساعه,EG


In [46]:
df["cleaned_text"] = df["cleaned_text"].map(lambda x: x.split(' '))

In [24]:
df["cleaned_text"].sample(2)

23482    [علي, كدا, روان, دي, ظروفها, ايه]                                                                                                                         
34106    [امرنا, الله, بالامتثال, لاوامر, الوالدين, مادام, ليس, بها, معصية, للخالق،, وانا, شخصيا, م, اذكر, اني, خالفت, اهلي, بشي, ومشيت, ع, رأيي, الا, واتسحف, وا…]
Name: cleaned_text, dtype: object

In [25]:
len(df["cleaned_text"])

179632

In [135]:
len(iabuelkhair_stopwords)

1590

In [27]:
iabuelkhair_stopwords[:3]

['انها', 'اثناء', 'اجل']

#### Removing stop words from tokens

In [125]:
df["cleaned_text"][:4]

0    [أزاي, أقول, لك, كنا, زمان, والماضي, كان, فى, الغيب, بكره, واللي, أحنا, فيه, دلوقت, كمان, ح, يفوت, علينا, ولا, ندري]                           
1    [هي, آراء, آه, بس, أزاي, أجويرو, منتهي, يعني, أمال, لو, مكنش, الهداف, التاريخي, للسيتي, و, كل, موسم, اقل, عدد, من, الاهداف…]                   
2    [أنت, صيني, أزاي, تقارن, شادي, بالخطيب, ألي, هو, الوحيد, ألي, خد, الكورة, الذهبية, من, مصر, وبشهادت, الزملكاوي, قبل, الأ…]                     
4    [التوينز, اللي, معاها, كل, الحلو, والوحش, والمصايب, والاكل, والفشل, مش, عارفه, من, غيرك, كنت, هعيش, أزاي, ربنا, يخليكي, ليا, ومش, تويته, توصفك]
Name: cleaned_text, dtype: object

In [129]:
texts = [[word for word in document if word not in iabuelkhair_stopwords] for document in df['cleaned_text']]

In [133]:
print(texts[:3])

[['أزاي', 'أقول', 'زمان', 'والماضي', 'الغيب', 'بكره', 'واللي', 'أحنا', 'دلوقت', 'كمان', 'ح', 'يفوت', 'ولا', 'ندري'], ['آراء', 'آه', 'بس', 'أزاي', 'أجويرو', 'منتهي', 'يعني', 'أمال', 'مكنش', 'الهداف', 'التاريخي', 'للسيتي', 'و', 'موسم', 'اقل', 'الاهداف…'], ['صيني', 'أزاي', 'تقارن', 'شادي', 'بالخطيب', 'ألي', 'الوحيد', 'ألي', 'خد', 'الكورة', 'الذهبية', 'مصر', 'وبشهادت', 'الزملكاوي', 'الأ…']]


In [134]:
len(texts)

179632

In [39]:
df["cleaned_text"] = df["cleaned_text"].map([lambda x: if x not in iabuelkhair_stopwords])

SyntaxError: invalid syntax (<ipython-input-39-d4c4634cd332>, line 1)

In [33]:
len(df["cleaned_text"])

179632

In [113]:
df["cleaned_text"][:3]

0    [أزاي, أقول, لك, كنا, زمان, والماضي, كان, فى, الغيب, بكره, واللي, أحنا, فيه, دلوقت, كمان, ح, يفوت, علينا, ولا, ندري]        
1    [هي, آراء, آه, بس, أزاي, أجويرو, منتهي, يعني, أمال, لو, مكنش, الهداف, التاريخي, للسيتي, و, كل, موسم, اقل, عدد, من, الاهداف…]
2    [أنت, صيني, أزاي, تقارن, شادي, بالخطيب, ألي, هو, الوحيد, ألي, خد, الكورة, الذهبية, من, مصر, وبشهادت, الزملكاوي, قبل, الأ…]  
Name: cleaned_text, dtype: object

#### See how many words were removed

In [None]:
def stopword_percentage(text_eda, text_eda_no_stops):
    increase = (len(text_eda_no_stops) - len(text_eda))
    return (increase / len(text_eda)) * 100

In [None]:
stopword_percentage(text_eda, text_eda_no_stops)

#### Pass `df["cleaned_text"]` with gensim's Dictionary representation:

In [47]:
dictionary = corpora.Dictionary(df["cleaned_text"])

#### Use filter_extremes method to eliminate the 5000 most frequent words

In [48]:
dictionary.filter_extremes(no_below=1, no_above=1, keep_n=5000)

#### Alternatively, 

In [None]:
# # Remove rare and common tokens.
# # Filter out words that occur too frequently or too rarely.
# max_freq = 0.5
# min_wordcount = 20
# dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)

# _ = dictionary[0]  # This sort of "initializes" dictionary.id2token.

#### Compactify data to make up for any missing labels

In [49]:
dictionary.compactify()

#### Use doc2bow method to get bag of words representation (word_id, frequency) 

In [50]:
corpus = [dictionary.doc2bow(text) for text in df["cleaned_text"]]

In [None]:
#dictionary.doc2bow(dictionary, allow_update=True, return_missing=False)

#### Save corpus in Market Matrix format 

In [51]:
pwd

'/home/jovyan/capstone-52/Pickled_from_mongo'

In [58]:
cd market_matrix_files/

/home/jovyan/capstone-52/market_matrix_files


In [60]:
corpora.MmCorpus.serialize('../market_matrix_files/corpus_test.mm', corpus)

#### Load corpus iterator from Matrix Market file

In [None]:
corpus = corpora.MmCorpus('../market_matrix_files/corpus_test.mm')

#### Explore the transformed corpus

In [62]:
print(corpus[2])

[(1, 1), (25, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1)]


## LDA with stopwords

#### Initialize LDA (can only use bow with LDA)

In [67]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=3, id2word = dictionary)

2017-12-16 05:03:44,476 : INFO : using symmetric alpha at 0.3333333333333333
2017-12-16 05:03:44,477 : INFO : using symmetric eta at 0.3333333333333333
2017-12-16 05:03:44,479 : INFO : using serial LDA version on this node
2017-12-16 05:03:44,713 : INFO : running online (single-pass) LDA training, 3 topics, 1 passes over the supplied corpus of 179632 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
2017-12-16 05:03:44,715 : INFO : PROGRESS: pass 0, at document #2000/179632
2017-12-16 05:03:49,174 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 05:03:49,216 : INFO : topic #0 (0.333): 0.033*"في" + 0.026*"من" + 0.025*"يا" + 0.021*"اللي" + 0.018*"مش" + 0.016*"ما" + 0.014*"كل" + 0.013*"و" + 0.012*"على" + 0.011*"أزاي"
2017-12-16 05:03:49,219 : INFO : topic #1 (0.333): 0.029*"لا" + 0.024*"من" + 0.019*"ايه" + 0.018*"في" + 0.014*"بس" + 0.012*"كده" + 0.

2017-12-16 05:04:15,093 : INFO : topic #2 (0.333): 0.052*"انا" + 0.025*"مش" + 0.017*"في" + 0.017*"من" + 0.015*"بس" + 0.014*"حد" + 0.014*"لا" + 0.013*"،" + 0.011*"دا" + 0.010*"عشان"
2017-12-16 05:04:15,095 : INFO : topic diff=0.361279, rho=0.316228
2017-12-16 05:04:15,097 : INFO : PROGRESS: pass 0, at document #22000/179632
2017-12-16 05:04:17,156 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 05:04:17,198 : INFO : topic #0 (0.333): 0.114*"كدا" + 0.051*"و" + 0.024*"يا" + 0.022*"من" + 0.019*"في" + 0.018*"مش" + 0.017*"اللي" + 0.014*"كل" + 0.013*"ما" + 0.013*"،"
2017-12-16 05:04:17,201 : INFO : topic #1 (0.333): 0.024*"من" + 0.021*"لا" + 0.021*"ف" + 0.018*"ايه" + 0.015*"ولا" + 0.012*"في" + 0.012*"بس" + 0.011*"ع" + 0.010*"غير" + 0.010*"أنا"
2017-12-16 05:04:17,203 : INFO : topic #2 (0.333): 0.053*"انا" + 0.026*"مش" + 0.017*"من" + 0.017*"في" + 0.016*"بس" + 0.015*"دا" + 0.013*"لا" + 0.012*"حد" + 0.012*"عشان" + 0.010*"،"
2017-12-16 05:04:17,205 : INFO 

2017-12-16 05:04:37,275 : INFO : topic diff=0.282918, rho=0.223607
2017-12-16 05:04:37,277 : INFO : PROGRESS: pass 0, at document #42000/179632
2017-12-16 05:04:39,097 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 05:04:39,149 : INFO : topic #0 (0.333): 0.050*"و" + 0.041*"يا" + 0.039*"كدا" + 0.019*"من" + 0.018*"اللي" + 0.016*"كل" + 0.014*"في" + 0.014*"مش" + 0.014*"انت" + 0.013*"والله"
2017-12-16 05:04:39,152 : INFO : topic #1 (0.333): 0.027*"لا" + 0.027*"ف" + 0.026*"ايه" + 0.024*"من" + 0.017*"ولا" + 0.012*"في" + 0.011*""" + 0.011*"الله" + 0.010*"أنا" + 0.010*"دي"
2017-12-16 05:04:39,155 : INFO : topic #2 (0.333): 0.066*"انا" + 0.032*"مش" + 0.020*"حد" + 0.019*"دا" + 0.017*"ف" + 0.017*"بس" + 0.015*"من" + 0.013*"في" + 0.012*"لا" + 0.012*"والله"
2017-12-16 05:04:39,157 : INFO : topic diff=0.215783, rho=0.218218
2017-12-16 05:04:39,159 : INFO : PROGRESS: pass 0, at document #44000/179632
2017-12-16 05:04:41,082 : INFO : merging changes from 2000 do

2017-12-16 05:05:00,347 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 05:05:00,388 : INFO : topic #0 (0.333): 0.055*"و" + 0.049*"يا" + 0.021*"اللي" + 0.020*"في" + 0.020*"من" + 0.013*"انت" + 0.013*"كل" + 0.013*"بتاع" + 0.012*"ده" + 0.012*"مش"
2017-12-16 05:05:00,391 : INFO : topic #1 (0.333): 0.030*"من" + 0.023*"لا" + 0.023*"فى" + 0.021*"ايه" + 0.019*"أنا" + 0.018*"في" + 0.017*"ولا" + 0.014*"ده" + 0.013*"اللى" + 0.011*"على"
2017-12-16 05:05:00,393 : INFO : topic #2 (0.333): 0.048*"انا" + 0.037*"مش" + 0.018*"بس" + 0.016*"حد" + 0.016*"من" + 0.015*"في" + 0.013*"بتاع" + 0.013*"عشان" + 0.012*"والله" + 0.011*"لا"
2017-12-16 05:05:00,396 : INFO : topic diff=0.169418, rho=0.179605
2017-12-16 05:05:00,398 : INFO : PROGRESS: pass 0, at document #64000/179632
2017-12-16 05:05:02,123 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 05:05:02,162 : INFO : topic #0 (0.333): 0.062*"و" + 0.048*"يا" + 0.022*"اللي" + 0.022*

2017-12-16 05:05:20,200 : INFO : topic #0 (0.333): 0.050*"يا" + 0.042*"و" + 0.022*"اللي" + 0.021*"في" + 0.019*"من" + 0.017*"كل" + 0.014*"انت" + 0.014*"ما" + 0.011*"ربنا" + 0.010*"مش"
2017-12-16 05:05:20,203 : INFO : topic #1 (0.333): 0.036*"من" + 0.026*"لا" + 0.021*"ايه" + 0.020*"في" + 0.019*"فى" + 0.016*"ولا" + 0.015*"الله" + 0.014*"ف" + 0.012*""" + 0.011*"اللى"
2017-12-16 05:05:20,205 : INFO : topic #2 (0.333): 0.047*"انا" + 0.039*"مش" + 0.019*"بس" + 0.017*"من" + 0.016*"حد" + 0.016*"في" + 0.012*"والله" + 0.012*"لما" + 0.011*"لو" + 0.010*"لا"
2017-12-16 05:05:20,207 : INFO : topic diff=0.167897, rho=0.156174
2017-12-16 05:05:20,210 : INFO : PROGRESS: pass 0, at document #84000/179632
2017-12-16 05:05:21,967 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 05:05:22,004 : INFO : topic #0 (0.333): 0.049*"يا" + 0.040*"و" + 0.023*"اللي" + 0.021*"في" + 0.018*"من" + 0.016*"كل" + 0.014*"انت" + 0.013*"ما" + 0.012*"ربنا" + 0.010*"والله"
2017-12-16 05:05:2

2017-12-16 05:05:37,965 : INFO : topic #1 (0.333): 0.046*"من" + 0.027*"لا" + 0.025*"في" + 0.021*"على" + 0.019*"الله" + 0.015*"،" + 0.012*"ولا" + 0.012*"ما" + 0.012*"️" + 0.010*"أن"
2017-12-16 05:05:37,968 : INFO : topic #2 (0.333): 0.036*"انا" + 0.021*"مش" + 0.020*"بس" + 0.018*"من" + 0.018*"في" + 0.011*"والله" + 0.010*"اللي" + 0.009*"لا" + 0.009*"حد" + 0.009*"لو"
2017-12-16 05:05:37,970 : INFO : topic diff=0.179880, rho=0.140028
2017-12-16 05:05:37,972 : INFO : PROGRESS: pass 0, at document #104000/179632
2017-12-16 05:05:39,479 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 05:05:39,513 : INFO : topic #0 (0.333): 0.063*"و" + 0.053*"يا" + 0.022*"في" + 0.019*"من" + 0.018*"اللي" + 0.017*"ما" + 0.016*"كل" + 0.012*"انت" + 0.012*"الله" + 0.008*"يارب"
2017-12-16 05:05:39,516 : INFO : topic #1 (0.333): 0.049*"من" + 0.028*"في" + 0.024*"لا" + 0.023*"على" + 0.018*"الله" + 0.014*"،" + 0.012*"️" + 0.010*"…" + 0.010*"ولا" + 0.010*"ما"
2017-12-16 05:05:39,51

2017-12-16 05:05:54,905 : INFO : topic #2 (0.333): 0.024*"بس" + 0.023*"انا" + 0.018*"من" + 0.014*"في" + 0.012*"لي" + 0.011*"اللي" + 0.010*"اي" + 0.010*"اني" + 0.009*"لو" + 0.009*"ما"
2017-12-16 05:05:54,907 : INFO : topic diff=0.111011, rho=0.128037
2017-12-16 05:05:54,909 : INFO : PROGRESS: pass 0, at document #124000/179632
2017-12-16 05:05:56,261 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 05:05:56,294 : INFO : topic #0 (0.333): 0.066*"و" + 0.026*"يا" + 0.020*"من" + 0.019*"كل" + 0.018*"في" + 0.017*"انت" + 0.017*"الله" + 0.017*"اللي" + 0.015*"ما" + 0.014*"الي"
2017-12-16 05:05:56,297 : INFO : topic #1 (0.333): 0.046*"من" + 0.029*"في" + 0.026*"لا" + 0.022*"على" + 0.021*"الله" + 0.015*"،" + 0.015*"أن" + 0.011*"عن" + 0.011*"ولا" + 0.010*"ما"
2017-12-16 05:05:56,299 : INFO : topic #2 (0.333): 0.023*"انا" + 0.022*"بس" + 0.019*"من" + 0.014*"في" + 0.014*"لي" + 0.011*"اللي" + 0.010*"اني" + 0.010*"لو" + 0.010*"اي" + 0.009*"ما"
2017-12-16 05:05:56,3

2017-12-16 05:06:10,684 : INFO : topic diff=0.091005, rho=0.118678
2017-12-16 05:06:10,686 : INFO : PROGRESS: pass 0, at document #144000/179632
2017-12-16 05:06:12,014 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 05:06:12,045 : INFO : topic #0 (0.333): 0.036*"و" + 0.023*"من" + 0.020*"يا" + 0.019*"اللي" + 0.018*"الله" + 0.017*"انت" + 0.016*"في" + 0.014*"كل" + 0.013*"الي" + 0.012*"ما"
2017-12-16 05:06:12,048 : INFO : topic #1 (0.333): 0.050*"من" + 0.035*"في" + 0.023*"لا" + 0.023*"على" + 0.020*"،" + 0.019*"الله" + 0.010*"هذا" + 0.010*"عن" + 0.010*"ولا" + 0.010*"…"
2017-12-16 05:06:12,050 : INFO : topic #2 (0.333): 0.029*"انا" + 0.021*"بس" + 0.019*"من" + 0.014*"ع" + 0.014*"والله" + 0.013*"لي" + 0.013*"اللي" + 0.012*"لو" + 0.012*"في" + 0.012*"اي"
2017-12-16 05:06:12,052 : INFO : topic diff=0.081464, rho=0.117851
2017-12-16 05:06:12,055 : INFO : PROGRESS: pass 0, at document #146000/179632
2017-12-16 05:06:13,320 : INFO : merging changes from 2000

2017-12-16 05:06:27,464 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 05:06:27,498 : INFO : topic #0 (0.333): 0.043*"يا" + 0.043*"و" + 0.024*"الله" + 0.021*"من" + 0.018*"ما" + 0.014*"اللي" + 0.014*"في" + 0.013*"كل" + 0.011*"انت" + 0.011*"مو"
2017-12-16 05:06:27,501 : INFO : topic #1 (0.333): 0.047*"من" + 0.044*"في" + 0.026*"على" + 0.024*"لا" + 0.019*"الله" + 0.015*"،" + 0.014*"…" + 0.012*"أن" + 0.011*"عن" + 0.010*"ما"
2017-12-16 05:06:27,503 : INFO : topic #2 (0.333): 0.025*"انا" + 0.019*"بس" + 0.018*"من" + 0.012*"صباح" + 0.012*"والله" + 0.012*"ما" + 0.012*"لا" + 0.011*"لي" + 0.011*"في" + 0.011*"اللي"
2017-12-16 05:06:27,506 : INFO : topic diff=0.110466, rho=0.110432
2017-12-16 05:06:27,508 : INFO : PROGRESS: pass 0, at document #166000/179632
2017-12-16 05:06:28,703 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 05:06:28,735 : INFO : topic #0 (0.333): 0.043*"يا" + 0.042*"و" + 0.024*"الله" + 0.021*"من"

#### Print topics and the top terms associated with them 

In [69]:
print(ldamodel.print_topics(num_topics=50, num_words=3))

2017-12-16 05:09:39,137 : INFO : topic #0 (0.333): 0.055*"و" + 0.032*"يا"
2017-12-16 05:09:39,138 : INFO : topic #1 (0.333): 0.048*"من" + 0.041*"في"
2017-12-16 05:09:39,139 : INFO : topic #2 (0.333): 0.024*"انا" + 0.019*"من"


[(0, '0.055*"و" + 0.032*"يا"'), (1, '0.048*"من" + 0.041*"في"'), (2, '0.024*"انا" + 0.019*"من"')]


#### Save the LDA model

In [71]:
ldamodel.save('../topic_modeling_experiments/lsa_lda/model_lda.pkl') 
#lda = models.LdaModel.load('../topic_modeling_experiments/lsa_lda/model_lda.pkl')

2017-12-16 05:36:45,937 : INFO : saving LdaState object under ../topic_modeling_experiments/lsa_lda/model.lda.state, separately None
2017-12-16 05:36:45,943 : INFO : saved ../topic_modeling_experiments/lsa_lda/model.lda.state
2017-12-16 05:36:45,950 : INFO : saving LdaModel object under ../topic_modeling_experiments/lsa_lda/model.lda, separately ['expElogbeta', 'sstats']
2017-12-16 05:36:45,951 : INFO : storing np array 'expElogbeta' to ../topic_modeling_experiments/lsa_lda/model.lda.expElogbeta.npy
2017-12-16 05:36:45,953 : INFO : not storing attribute dispatcher
2017-12-16 05:36:45,954 : INFO : not storing attribute state
2017-12-16 05:36:45,954 : INFO : not storing attribute id2word
2017-12-16 05:36:45,956 : INFO : saved ../topic_modeling_experiments/lsa_lda/model.lda


#### Initialize tfidf to use with LSA

In [77]:
tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model

2017-12-16 06:07:18,380 : INFO : collecting document frequencies
2017-12-16 06:07:18,381 : INFO : PROGRESS: processing document #0
2017-12-16 06:07:18,414 : INFO : PROGRESS: processing document #10000
2017-12-16 06:07:18,446 : INFO : PROGRESS: processing document #20000
2017-12-16 06:07:18,468 : INFO : PROGRESS: processing document #30000
2017-12-16 06:07:18,487 : INFO : PROGRESS: processing document #40000
2017-12-16 06:07:18,508 : INFO : PROGRESS: processing document #50000
2017-12-16 06:07:18,527 : INFO : PROGRESS: processing document #60000
2017-12-16 06:07:18,547 : INFO : PROGRESS: processing document #70000
2017-12-16 06:07:18,566 : INFO : PROGRESS: processing document #80000
2017-12-16 06:07:18,585 : INFO : PROGRESS: processing document #90000
2017-12-16 06:07:18,604 : INFO : PROGRESS: processing document #100000
2017-12-16 06:07:18,624 : INFO : PROGRESS: processing document #110000
2017-12-16 06:07:18,644 : INFO : PROGRESS: processing document #120000
2017-12-16 06:07:18,668 : 

#### Use the model to transform vectors

In [79]:
doc_bow = [(0, 2), (1, 1)]
print(tfidf[doc_bow])

[(0, 0.9029959618571598), (1, 0.42964903452662695)]


#### Save the tfidf model

In [80]:
tfidf.save('../topic_modeling_experiments/lsa_lda/model_tfidf.pkl') 
#tfidf = models.LsiModel.load('../topic_modeling_experiments/lsa_lda/model_tfidf.pkl')

2017-12-16 06:07:52,869 : INFO : saving TfidfModel object under ../topic_modeling_experiments/lsa_lda/model.tfidf, separately None
2017-12-16 06:07:52,871 : INFO : saved ../topic_modeling_experiments/lsa_lda/model.tfidf


#### Apply a transformation to the whole tfidf transformed corpus

In [117]:
corpus_tfidf = tfidf[corpus]

In [140]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=500) # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf]

2017-12-16 21:14:26,080 : INFO : using serial LSI version on this node
2017-12-16 21:14:26,082 : INFO : updating model with new documents
2017-12-16 21:14:26,425 : INFO : preparing a new chunk of documents
2017-12-16 21:14:26,526 : INFO : using 100 extra samples and 2 power iterations
2017-12-16 21:14:26,532 : INFO : 1st phase: constructing (5000, 600) action matrix
2017-12-16 21:14:27,231 : INFO : orthonormalizing (5000, 600) action matrix
2017-12-16 21:14:29,215 : INFO : 2nd phase: running dense svd on (600, 20000) matrix
2017-12-16 21:14:33,146 : INFO : computing the final decomposition
2017-12-16 21:14:33,153 : INFO : keeping 500 factors (discarding 5.697% of energy spectrum)
2017-12-16 21:14:33,283 : INFO : processed documents up to #20000
2017-12-16 21:14:33,286 : INFO : topic #0(16.672): 0.422*"و" + 0.284*"مش" + 0.270*"،" + 0.268*"يا" + 0.202*"ده" + 0.197*"في" + 0.193*"انا" + 0.175*"اللي" + 0.171*"من" + 0.152*"بس"
2017-12-16 21:14:33,289 : INFO : topic #1(12.675): 0.631*"،" + -0

2017-12-16 21:15:10,025 : INFO : topic #4(23.188): -0.768*"انا" + 0.366*"مش" + -0.242*"و" + 0.205*"لا" + 0.134*"ايه" + 0.128*"في" + 0.127*"ده" + -0.122*"كدا" + 0.112*"ولا" + 0.100*"دي"
2017-12-16 21:15:10,396 : INFO : preparing a new chunk of documents
2017-12-16 21:15:10,489 : INFO : using 100 extra samples and 2 power iterations
2017-12-16 21:15:10,494 : INFO : 1st phase: constructing (5000, 600) action matrix
2017-12-16 21:15:11,194 : INFO : orthonormalizing (5000, 600) action matrix
2017-12-16 21:15:13,177 : INFO : 2nd phase: running dense svd on (600, 20000) matrix
2017-12-16 21:15:17,408 : INFO : computing the final decomposition
2017-12-16 21:15:17,415 : INFO : keeping 500 factors (discarding 6.306% of energy spectrum)
2017-12-16 21:15:17,535 : INFO : merging projections: (5000, 500) + (5000, 500)
2017-12-16 21:15:19,151 : INFO : keeping 500 factors (discarding 7.657% of energy spectrum)
2017-12-16 21:15:19,375 : INFO : processed documents up to #120000
2017-12-16 21:15:19,383 :

#### Print results

In [137]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

In [141]:
pp.pprint(lsi.print_topics(500))

2017-12-16 21:15:58,024 : INFO : topic #0(41.518): 0.343*"و" + 0.319*"من" + 0.314*"يا" + 0.271*"لا" + 0.251*"في" + 0.250*"انا" + 0.217*"الله" + 0.170*"ما" + 0.163*"مش" + 0.154*"بس"
2017-12-16 21:15:58,025 : INFO : topic #1(33.439): -0.922*"يا" + 0.187*"و" + 0.176*"لا" + 0.159*"من" + 0.101*"في" + 0.093*"انا" + 0.054*"،" + 0.048*"اللي" + -0.046*"الله" + 0.042*"ما"
2017-12-16 21:15:58,026 : INFO : topic #2(30.922): -0.606*"الله" + 0.556*"انا" + -0.241*"من" + 0.199*"مش" + 0.175*"كدا" + 0.173*"بس" + 0.170*"والله" + -0.125*"في" + -0.106*"على" + -0.105*"لا"
2017-12-16 21:15:58,032 : INFO : topic #3(30.542): 0.796*"و" + -0.548*"لا" + -0.148*"من" + -0.114*"ولا" + -0.094*"انا" + -0.064*"والله" + -0.058*"بس" + 0.034*"الله" + -0.034*"على" + -0.033*"مش"
2017-12-16 21:15:58,033 : INFO : topic #4(29.819): 0.732*"لا" + -0.406*"من" + 0.401*"و" + -0.205*"في" + -0.113*"والله" + -0.104*"انا" + -0.095*"الله" + 0.094*"يا" + -0.083*"اللي" + -0.072*"على"
2017-12-16 21:15:58,034 : INFO : topic #5(29.029): -0.7

2017-12-16 21:15:58,108 : INFO : topic #44(17.113): -0.654*"اللهم" + -0.300*"الناس" + -0.248*"فى" + 0.240*"أن" + 0.215*"هذا" + -0.208*"ربنا" + 0.175*"دا" + -0.168*"الي" + 0.128*"يارب" + 0.128*"ان"
2017-12-16 21:15:58,109 : INFO : topic #45(17.020): 0.588*"ربنا" + -0.486*"حد" + 0.277*"أن" + -0.204*"الي" + 0.188*"فى" + 0.169*"الناس" + 0.167*"هذا" + -0.146*"عن" + 0.138*"…" + -0.137*"أنا"
2017-12-16 21:15:58,110 : INFO : topic #46(16.973): 0.534*"حد" + 0.435*"هذا" + 0.277*"أنا" + 0.274*"فى" + 0.263*"أن" + -0.222*"لما" + -0.179*"ان" + -0.137*"هو" + 0.131*"ربنا" + -0.130*"عن"
2017-12-16 21:15:58,120 : INFO : topic #47(16.850): 0.489*"الناس" + 0.413*"أن" + -0.399*"ربنا" + -0.367*"هذا" + 0.186*"أنا" + 0.167*"قلبي" + 0.162*"فى" + -0.145*"طيب" + -0.134*"حد" + -0.121*"عن"
2017-12-16 21:15:58,121 : INFO : topic #48(16.775): -0.514*"فى" + 0.409*"ربنا" + -0.284*"وانا" + 0.275*"الي" + 0.273*"أن" + -0.242*"هذا" + 0.175*"حد" + -0.161*"…" + -0.149*"طيب" + 0.136*"عليك"
2017-12-16 21:15:58,122 : INFO : to

2017-12-16 21:15:58,200 : INFO : topic #86(13.850): 0.711*"ناس" + 0.319*"كتير" + -0.292*"اني" + -0.188*"قبل" + -0.171*"بالله" + 0.158*"عادي" + 0.132*"جدا" + 0.126*"الا" + -0.123*"فيه" + 0.114*"او"
2017-12-16 21:15:58,201 : INFO : topic #87(13.763): -0.543*"نفسي" + 0.387*"اني" + 0.339*"ناس" + -0.336*"اذا" + -0.230*"عادي" + -0.228*"جدا" + 0.184*"قبل" + -0.170*"او" + -0.152*"ممكن" + 0.126*"كان"
2017-12-16 21:15:58,202 : INFO : topic #88(13.747): -0.814*"قبل" + -0.364*"م" + 0.203*"اني" + -0.201*"نفسي" + 0.108*"(" + -0.082*"عادي" + 0.079*"شي" + -0.077*"طب" + 0.069*"حتى" + -0.069*"ناس"
2017-12-16 21:15:58,203 : INFO : topic #89(13.708): 0.833*"عليه" + 0.243*"جدا" + 0.210*"كمان" + -0.207*"او" + -0.126*"اذا" + 0.116*"عادي" + -0.115*"بالله" + -0.108*"حلو" + -0.101*"(" + -0.087*"حتى"
2017-12-16 21:15:58,204 : INFO : topic #90(13.685): 0.583*"جدا" + -0.348*"عليه" + -0.348*"اذا" + 0.301*"عادي" + -0.268*"نفسي" + 0.262*"كمان" + -0.178*"ناس" + -0.131*"حاجة" + -0.118*"حلو" + 0.115*"ممكن"
2017-12-16 21

2017-12-16 21:15:58,267 : INFO : topic #128(12.195): -0.392*"علشان" + 0.367*"الوكر" + -0.360*"وش" + 0.304*"فولورز" + 0.265*"الشتا" + 0.253*"بقى" + 0.226*"محمد" + 0.201*"الدنيا" + -0.147*"فين" + 0.138*"الخير"
2017-12-16 21:15:58,268 : INFO : topic #129(12.161): -0.489*"الدنيا" + 0.355*"مفيش" + 0.303*"الخير" + -0.236*"منك" + 0.233*"زي" + 0.200*"الوكر" + -0.179*"صباح" + -0.169*"عايز" + 0.165*"فولورز" + -0.150*"محمد"
2017-12-16 21:15:58,269 : INFO : topic #130(12.148): 0.481*"منك" + -0.410*"وش" + 0.388*"مفيش" + -0.270*"علشان" + 0.244*"ليش" + -0.219*"الدنيا" + -0.190*"الوكر" + 0.170*"*" + -0.157*"فولورز" + -0.138*"الشتا"
2017-12-16 21:15:58,270 : INFO : topic #131(12.137): -0.516*"مصر" + 0.338*"*" + 0.326*"الدنيا" + 0.288*"العالم" + 0.260*"مفيش" + 0.255*"علشان" + -0.167*"له" + -0.140*"القلب" + 0.136*"زي" + 0.135*"بجد"
2017-12-16 21:15:58,271 : INFO : topic #132(12.085): 0.638*"*" + -0.286*"له" + -0.278*"منك" + 0.278*"بين" + 0.242*"مصر" + 0.210*"وش" + -0.182*"عايز" + -0.155*"العالم" + -0.132

2017-12-16 21:15:58,309 : INFO : topic #169(10.967): 0.540*"علينا" + 0.290*"يكون" + 0.247*"قد" + -0.232*"احلي" + -0.210*"هذي" + -0.183*"انه" + -0.180*"كلام" + -0.169*"الى" + 0.157*"️️" + -0.143*"إلا"
2017-12-16 21:15:58,310 : INFO : topic #170(10.939): 0.505*"خالص" + 0.417*"دول" + 0.330*"لكن" + -0.284*"الى" + 0.249*"نفس" + 0.245*"قطر" + 0.242*"احلى" + -0.132*"حبيبي" + -0.118*"شنو" + 0.109*"أو"
2017-12-16 21:15:58,311 : INFO : topic #171(10.916): 0.524*"حبيبي" + 0.347*"احلي" + -0.331*"كلها" + 0.290*"يكون" + 0.257*"قد" + -0.203*"️️" + -0.183*"حياتي" + -0.132*"الف" + 0.110*"هذي" + -0.109*"جميل"
2017-12-16 21:15:58,312 : INFO : topic #172(10.887): -0.736*"خالص" + 0.279*"لكن" + 0.220*"احلى" + 0.212*"قطر" + 0.168*"حلوه" + -0.163*"اهو" + -0.137*"الى" + -0.134*"شنو" + -0.131*"️️" + -0.128*"حبيبي"
2017-12-16 21:15:58,313 : INFO : topic #173(10.872): 0.540*"حبيبي" + 0.388*"️️" + -0.383*"علينا" + 0.234*"نفس" + -0.202*"احلي" + -0.166*"شيء" + 0.163*"كلها" + -0.133*"أو" + 0.129*"يكون" + 0.128*"ابو"


2017-12-16 21:15:58,397 : INFO : topic #208(10.086): -0.405*"معايا" + -0.345*"مره" + 0.332*"تويتر" + 0.314*"محدش" + 0.242*"عندك" + 0.234*"منه" + 0.224*"حق" + 0.195*"يمكن" + 0.163*"عم" + 0.145*"اول"
2017-12-16 21:15:58,398 : INFO : topic #209(10.071): 0.587*"ب" + 0.464*"بنت" + 0.259*"منه" + 0.178*"معايا" + -0.173*"فعلا" + 0.158*"لنا" + 0.149*"حلوة" + -0.147*"عندك" + 0.133*"اكيد" + -0.123*"الف"
2017-12-16 21:15:58,399 : INFO : topic #210(10.041): 0.371*"الحياة" + -0.364*"منه" + -0.364*"فعلا" + 0.336*"عندك" + 0.269*"حق" + -0.225*"أنت" + 0.218*"حب" + 0.201*"حلوة" + 0.187*"معايا" + 0.141*"اكيد"
2017-12-16 21:15:58,408 : INFO : topic #211(10.015): -0.339*"بنت" + 0.316*"حب" + -0.267*"اكيد" + -0.261*"الذي" + 0.257*"منه" + 0.225*"عمري" + 0.177*"طول" + 0.176*"حلوة" + 0.167*"الحمدلله" + 0.167*"راح"
2017-12-16 21:15:58,409 : INFO : topic #212(10.008): -0.500*"بنت" + 0.388*"ب" + 0.260*"الذي" + 0.231*"عم" + -0.227*"عمري" + 0.201*"معايا" + 0.192*"رب" + -0.181*"طول" + 0.147*"حب" + -0.137*"الحمدلله"
20

2017-12-16 21:15:58,652 : INFO : topic #250(9.011): 0.450*"مرة" + 0.391*"قلب" + 0.293*"الواحد" + -0.293*"معلش" + -0.267*"⁦️⁩" + 0.199*"شوية" + -0.154*"كلنا" + 0.145*"اكثر" + -0.139*"شخص" + 0.123*"عايزة"
2017-12-16 21:15:58,656 : INFO : topic #251(8.989): -0.470*"ربي" + -0.364*"شوية" + 0.333*"قلب" + 0.248*"معلش" + 0.182*"يسعد" + -0.159*"اكثر" + -0.157*"ريتويت" + -0.151*"كلنا" + -0.150*"بحب" + -0.130*"عليهم"
2017-12-16 21:15:58,657 : INFO : topic #252(8.975): 0.748*"⁦️⁩" + -0.288*"شوية" + 0.243*"مرة" + 0.236*"اكثر" + 0.234*"الواحد" + 0.134*"معلش" + 0.108*"ربي" + 0.102*"طول" + -0.089*"شخص" + 0.086*"اخر"
2017-12-16 21:15:58,660 : INFO : topic #253(8.969): 0.516*"⁦️⁩" + 0.453*"شوية" + -0.385*"معلش" + -0.263*"اكثر" + -0.248*"عاد" + 0.176*"قلب" + -0.154*"ربي" + 0.126*"يسعد" + -0.120*"الواحد" + -0.095*"صباحك"
2017-12-16 21:15:58,661 : INFO : topic #254(8.915): 0.457*"اكثر" + 0.443*"عاد" + -0.247*"شويه" + 0.243*"شوية" + -0.216*"الواحد" + 0.201*"زين" + -0.186*"التي" + -0.182*"ريتويت" + -0.175*"م

2017-12-16 21:15:58,737 : INFO : topic #291(8.279): -0.309*"هههههه" + 0.284*"انتو" + 0.254*"زمان" + 0.236*"اجل" + -0.229*"اخر" + -0.211*"ابي" + -0.198*"عيني" + 0.196*"•" + -0.181*"ههههه" + 0.170*"جماعه"
2017-12-16 21:15:58,738 : INFO : topic #292(8.259): -0.310*"تم" + 0.303*"زمان" + -0.270*"عيني" + -0.216*"جماعه" + -0.207*"مكان" + 0.202*"وانتي" + -0.183*"بخير" + -0.177*"ههههه" + -0.172*"إذا" + 0.157*"بـ"
2017-12-16 21:15:58,739 : INFO : topic #293(8.240): -0.365*"شلون" + 0.363*"هههههه" + -0.226*"مكان" + -0.214*"حرام" + -0.181*"وانتي" + 0.179*"إذا" + 0.173*"بخير" + 0.173*"ايش" + -0.172*"بلا" + -0.170*"بكل"
2017-12-16 21:15:58,741 : INFO : topic #294(8.218): 0.288*"عني" + -0.274*"حرام" + -0.192*"ابن" + -0.188*"ههههه" + 0.182*"لن" + 0.172*"يبقي" + 0.163*"الاتحاد" + 0.161*"ومن" + -0.154*"ياريت" + 0.153*"الجو"
2017-12-16 21:15:58,742 : INFO : topic #295(8.202): -0.323*"والفولورز" + -0.316*"بـ" + 0.255*"عيني" + 0.246*"فولورز" + -0.202*"الوكر" + 0.201*"ههههه" + 0.186*"الشتا" + -0.185*"تم" + -

2017-12-16 21:15:58,805 : INFO : topic #331(7.655): -0.495*"شر" + 0.351*"كفايه" + -0.185*"منهم" + 0.178*"وجهك" + 0.158*"مسا" + 0.155*"بلاش" + 0.148*"انها" + 0.146*"فولو" + 0.142*"؛" + -0.138*"معاه"
2017-12-16 21:15:58,806 : INFO : topic #332(7.623): 0.326*"انها" + 0.239*"أي" + -0.212*"بقيت" + -0.208*"اوى" + -0.198*"كفايه" + 0.198*"بدون" + 0.194*"ياخي" + -0.184*"الوقت" + -0.174*"ولله" + -0.173*"ثم"
2017-12-16 21:15:58,807 : INFO : topic #333(7.607): -0.434*"كفايه" + 0.275*"سنة" + 0.241*"؛" + 0.201*"كام" + -0.178*"شر" + 0.153*"تقريبا" + -0.152*"ماما" + -0.151*"يابني" + -0.141*"ولله" + -0.130*"بـ"
2017-12-16 21:15:58,808 : INFO : topic #334(7.587): -0.246*"ماما" + 0.227*"روحي" + -0.222*"ولو" + 0.218*"الصبح" + 0.213*"منهم" + -0.201*"تقول" + 0.195*"عندنا" + 0.163*"وربنا" + -0.158*"بلا" + -0.155*"اقول"
2017-12-16 21:15:58,812 : INFO : topic #335(7.568): -0.406*"بقيت" + 0.277*"ولاد" + 0.265*"عندنا" + -0.224*"؛" + -0.222*"معاه" + 0.183*"عنك" + -0.146*"ياخي" + 0.136*"كفاية" + 0.134*"ولو" + -0.1

2017-12-16 21:15:58,874 : INFO : topic #372(7.102): -0.293*"حياتك" + -0.226*"بيه" + 0.216*"البنات" + 0.201*"كوباية" + -0.189*"كويس" + 0.189*"عليكي" + -0.184*"صلاح" + 0.178*"يابني" + 0.157*"لـ" + 0.156*"الاول"
2017-12-16 21:15:58,875 : INFO : topic #373(7.084): 0.259*"منا" + -0.232*"الاول" + 0.213*"برضو" + 0.205*"انام" + 0.197*"كوباية" + 0.197*"صلاح" + -0.164*"انتوا" + -0.149*"هم" + -0.138*"كُل" + -0.126*"كويس"
2017-12-16 21:15:58,876 : INFO : topic #374(7.077): 0.386*"اها" + -0.291*"كوباية" + 0.215*"البنات" + 0.208*"برضو" + -0.182*"منو" + -0.168*"غلط" + 0.155*"تقول" + 0.155*"حياتك" + 0.152*"كسم" + 0.143*"صلاح"
2017-12-16 21:15:58,877 : INFO : topic #375(7.063): -0.603*"برضو" + 0.232*"حياتك" + 0.202*"الموضوع" + 0.184*"صلاح" + -0.151*"كُل" + 0.148*"|" + 0.130*"🤦‍️" + 0.107*"منا" + 0.099*"معاه" + 0.095*"لـ"
2017-12-16 21:15:58,878 : INFO : topic #376(7.047): -0.338*"عليكي" + -0.296*"كويس" + 0.204*"انتى" + 0.196*"الاول" + 0.182*"الموضوع" + 0.177*"انام" + 0.174*"طلعت" + -0.160*"ايوا" + 0.14

2017-12-16 21:15:58,946 : INFO : topic #412(6.607): -0.185*"تمام" + 0.177*"عارفة" + -0.161*"النهاردة" + -0.155*"باين" + 0.154*"مِن" + 0.151*"الـ" + -0.146*"امي" + 0.140*"صالح" + -0.134*"دايما" + 0.131*"نص"
2017-12-16 21:15:58,947 : INFO : topic #413(6.598): -0.386*"كسم" + 0.301*"عايزه" + -0.204*"واحدة" + 0.194*"زى" + -0.176*"بيها" + 0.156*"حياتك" + 0.150*"تعالي" + 0.141*"عمرو" + 0.140*"ينفع" + -0.134*"بقول"
2017-12-16 21:15:58,948 : INFO : topic #414(6.563): -0.283*"إني" + 0.253*"أيه" + 0.245*"خرا" + -0.199*"احمد" + 0.185*"عايزه" + 0.172*"مافي" + -0.166*"تعالي" + 0.141*"امي" + 0.138*"بلوك" + 0.132*"عنده"
2017-12-16 21:15:58,949 : INFO : topic #415(6.557): 0.327*"تمام" + -0.212*"زى" + 0.204*"أحبك" + 0.159*"تانى" + 0.157*"بقول" + 0.156*"حاضر" + -0.154*"ترا" + -0.154*"مثلا" + -0.152*"حصل" + 0.150*"تعالي"
2017-12-16 21:15:58,950 : INFO : topic #416(6.544): 0.213*"بلوك" + -0.202*"بالك" + 0.190*"عنده" + -0.187*"قلت" + -0.176*"أيه" + -0.143*"خد" + 0.137*"الموضوع" + 0.134*"طلع" + -0.132*"خرا" 

2017-12-16 21:15:59,012 : INFO : topic #451(6.186): 0.211*"التقاعد" + 0.209*"انتا" + 0.200*"سن" + -0.198*"تانى" + 0.192*"بقت" + 0.191*"ينفع" + 0.185*"النهاردة" + -0.161*"معرفش" + 0.159*"مطلب" + 0.156*"شعبي"
2017-12-16 21:15:59,013 : INFO : topic #452(6.169): -0.331*"نيك" + 0.242*"هوا" + -0.229*"اية" + -0.165*"فاهم" + 0.160*"طلع" + 0.148*"عايزين" + -0.139*"حصل" + 0.137*"عارفة" + 0.130*"مغسلة" + 0.117*"اسم"
2017-12-16 21:15:59,015 : INFO : topic #453(6.144): 0.216*"نيك" + 0.193*"ليا" + 0.188*"عايزه" + -0.177*"هوا" + -0.161*"طلع" + -0.146*"اية" + -0.145*"حبيبى" + 0.143*"اما" + -0.135*"قول" + -0.127*"زى"
2017-12-16 21:15:59,016 : INFO : topic #454(6.142): 0.208*"عارفة" + -0.207*"تانى" + 0.167*"نيك" + 0.164*"حصل" + 0.159*"مغسلة" + 0.150*"طلع" + 0.145*"مالك" + -0.139*"الشتا" + 0.138*"اللَّهَ" + -0.131*"امبارح"
2017-12-16 21:15:59,017 : INFO : topic #455(6.118): -0.285*"مغسلة" + 0.271*"امتي" + -0.210*"اكل" + -0.173*"يَ" + -0.166*"ينفع" + -0.158*"مالك" + -0.151*"البرد" + -0.150*"ها" + -0.145*"

2017-12-16 21:15:59,079 : INFO : topic #491(5.664): -0.253*"اللهُمَ" + -0.216*"الكراش" + 0.190*"للدعم" + -0.181*"أزاي" + -0.176*"الوطني" + -0.154*"رسالة" + 0.141*"اليمني" + -0.134*"فيلم" + 0.134*"احا" + -0.133*"فاكر"
2017-12-16 21:15:59,080 : INFO : topic #492(5.653): -0.222*"عيد" + 0.187*"اللهُمَ" + -0.156*"انى" + -0.137*"احا" + 0.131*"الخطيب" + 0.130*"وحش" + 0.130*"محمود" + 0.127*"عبد" + 0.125*"امبارح" + -0.122*"بفولورز"
2017-12-16 21:15:59,081 : INFO : topic #493(5.638): -0.333*"أزاي" + -0.241*"هى" + 0.224*"ناديك" + -0.206*"لوحدي" + -0.181*"اللَّهِ" + -0.176*"أكتر" + 0.165*"وحش" + 0.151*"اللَّهُ" + 0.140*"النهارده" + -0.135*"هههه"
2017-12-16 21:15:59,087 : INFO : topic #494(5.633): -0.425*"أزاي" + 0.294*"هى" + 0.217*"اللهُمَ" + 0.175*"هههه" + -0.147*"لوحدي" + 0.143*"انى" + 0.133*"للدعم" + -0.129*"الوطني" + -0.114*"احا" + 0.111*"فاكر"
2017-12-16 21:15:59,088 : INFO : topic #495(5.613): -0.260*"اللَّهُ" + -0.254*"اللهُمَ" + 0.235*"بيبي" + 0.234*"النهارده" + 0.224*"ناديك" + -0.166*"رسا

[   (   0,
        '0.343*"و" + 0.319*"من" + 0.314*"يا" + 0.271*"لا" + 0.251*"في" + '
        '0.250*"انا" + 0.217*"الله" + 0.170*"ما" + 0.163*"مش" + 0.154*"بس"'),
    (   1,
        '-0.922*"يا" + 0.187*"و" + 0.176*"لا" + 0.159*"من" + 0.101*"في" + '
        '0.093*"انا" + 0.054*"،" + 0.048*"اللي" + -0.046*"الله" + 0.042*"ما"'),
    (   2,
        '-0.606*"الله" + 0.556*"انا" + -0.241*"من" + 0.199*"مش" + 0.175*"كدا" '
        '+ 0.173*"بس" + 0.170*"والله" + -0.125*"في" + -0.106*"على" + '
        '-0.105*"لا"'),
    (   3,
        '0.796*"و" + -0.548*"لا" + -0.148*"من" + -0.114*"ولا" + -0.094*"انا" + '
        '-0.064*"والله" + -0.058*"بس" + 0.034*"الله" + -0.034*"على" + '
        '-0.033*"مش"'),
    (   4,
        '0.732*"لا" + -0.406*"من" + 0.401*"و" + -0.205*"في" + -0.113*"والله" + '
        '-0.104*"انا" + -0.095*"الله" + 0.094*"يا" + -0.083*"اللي" + '
        '-0.072*"على"'),
    (   5,
        '-0.723*"الله" + 0.419*"من" + -0.407*"انا" + 0.191*"في" + 0.133*"يا" + '
        '-0.125

    (   180,
        '-0.435*"اهو" + -0.338*"دلوقتي" + 0.323*"️️" + -0.306*"الف" + '
        '-0.287*"حلوه" + 0.237*"لكن" + -0.227*"لحد" + -0.209*"مبروك" + '
        '-0.199*"شيء" + 0.179*"كلام"'),
    (   181,
        '0.402*"اهو" + 0.332*"كلها" + 0.332*"️️" + -0.322*"ابو" + -0.273*"الف" '
        '+ -0.245*"ام" + -0.194*"مبروك" + 0.192*"حلوه" + 0.188*"دلوقتي" + '
        '-0.174*"حياتي"'),
    (   182,
        '0.300*"دلوقتي" + -0.281*"اهو" + 0.270*"لكن" + -0.251*"ام" + '
        '-0.247*"نفس" + 0.229*"الف" + 0.219*"يكون" + -0.201*"️️" + 0.195*"لحد" '
        '+ -0.193*"أو"'),
    (   183,
        '-0.526*"كلها" + 0.378*"دلوقتي" + 0.317*"ام" + 0.260*"لحد" + '
        '0.216*"️️" + 0.200*"لكن" + -0.199*"نفس" + 0.196*"حياتي" + 0.188*"مثل" '
        '+ 0.118*"ازاي"'),
    (   184,
        '0.546*"اهو" + -0.372*"دلوقتي" + -0.311*"لحد" + -0.244*"أو" + '
        '0.215*"مثل" + 0.207*"لكن" + 0.146*"يكون" + -0.140*"نفس" + '
        '-0.135*"قطر" + 0.133*"دول"'),
    (   185,
        '0.621*"

In [94]:
for doc in corpus_lsi[:4]: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
     print(doc)

[(0, 0.05896839250044697), (1, -0.0017811316012957535)]
[(0, 0.13283087204236785), (1, -0.006330608069675509)]
[(0, 0.07651815794306846), (1, -0.0094143998768118093)]
[(0, 0.11926322708096507), (1, -0.0038942127912014041)]


Notice how 0,1 now represent `num_topics=2` (dimensions) and not `word_id` as was the case with doctobow. 

#### Save the model for later use. 

In [96]:
lsi.save('../topic_modeling_experiments/lsa_lda/model_lsi.pkl') # same for tfidf, lda, ...
#lsi = models.LsiModel.load('../topic_modeling_experiments/lsa_lda/model_lsi.pkl')

2017-12-16 06:57:53,392 : INFO : saving Projection object under ../topic_modeling_experiments/lsa_lda/model.lsi.projection, separately None
2017-12-16 06:57:53,395 : INFO : saved ../topic_modeling_experiments/lsa_lda/model.lsi.projection
2017-12-16 06:57:53,396 : INFO : saving LsiModel object under ../topic_modeling_experiments/lsa_lda/model.lsi, separately None
2017-12-16 06:57:53,397 : INFO : not storing attribute projection
2017-12-16 06:57:53,398 : INFO : not storing attribute dispatcher
2017-12-16 06:57:53,401 : INFO : saved ../topic_modeling_experiments/lsa_lda/model.lsi


#### Find cosine similarities between lsi topics and a query

In [97]:
index = similarities.MatrixSimilarity(corpus_lsi) # index lsi corpus

2017-12-16 07:32:16,284 : INFO : creating matrix with 179632 documents and 2 features


In [98]:
index.save('../topic_modeling_experiments/lsa_lda/lsi_eg_gulf.index')
#index = similarities.MatrixSimilarity.load('../topic_modeling_experiments/lsa_lda/lsi_eg_gulf.index')

2017-12-16 07:36:03,789 : INFO : saving MatrixSimilarity object under ../topic_modeling_experiments/lsa_lda/lsi_eg_gulf.index, separately None
2017-12-16 07:36:03,809 : INFO : saved ../topic_modeling_experiments/lsa_lda/lsi_eg_gulf.index


#### Turn query into bow and use lsi to project it onto a 2-d vector space (topics)

In [99]:
doc = "مليش دعوه بحد بصراحه ولا نادى ليه دعوه بصفقات حد السوق"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow] # convert the query to LSI space
print(vec_lsi)

[(0, 0.25610017004210817), (1, -0.065892777552506498)]


In [101]:
sims = index[vec_lsi]

In [115]:
sims.shape

(179632,)

In [107]:
sims = index[vec_lsi] # perform a similarity query against the corpus
pp.pprint(list(enumerate(sims[:10]))) # print (document_number, document_similarity) 2-tuples

[   (0, 0.97553933),
    (1, 0.979222),
    (2, 0.99163806),
    (3, 0.97607386),
    (4, 0.99301887),
    (5, 0.96669626),
    (6, 0.98321593),
    (7, 0.56111413),
    (8, 0.56111413),
    (9, 0.99779427)]


The first 10 documents (tweets) are labelled with class `EG` and the query itself is written in `EG` dialect. The results show high similarity with most documents but two possible misclassifications. 

In [109]:
sims[:10]

array([ 0.97553933,  0.979222  ,  0.99163806,  0.97607386,  0.99301887,
        0.96669626,  0.98321593,  0.56111413,  0.56111413,  0.99779427], dtype=float32)

#### Find the transformation from tf-idf to latent space with Random Projections, RP

In [119]:
rp = models.RpModel(corpus_tfidf, num_topics=2)

2017-12-16 17:54:56,429 : INFO : no word id mapping provided; initializing from corpus, assuming identity
2017-12-16 17:54:59,474 : INFO : constructing (2, 5000) random matrix


In [None]:
rp.save('../topic_modeling_experiments/lsa_lda/model_rp.pkl') 
#rp = models.RpModel.load('../topic_modeling_experiments/lsa_lda/model_rp.pkl')

In [120]:
corpus_rp = rp[corpus_tfidf]

In [121]:
for doc in corpus_rp[:4]: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
     print(doc)

[(0, -1.9482136964797974), (1, -0.67955482006073)]
[(0, 0.05010877549648285), (1, -0.4178534150123596)]
[(0, -1.1827576160430908), (1, 0.3307190239429474)]
[(0, -1.1580445766448975), (1, 1.347662091255188)]


In [122]:
hdp = models.HdpModel(corpus, id2word=dictionary)

2017-12-16 18:19:09,617 : INFO : (0, '0.009*، + 0.004*مش + 0.004*من + 0.002*فـ + 0.002*أنا + 0.002*بس + 0.002*في + 0.002*ده + 0.002*هو + 0.002*رسمي')
2017-12-16 18:19:09,625 : INFO : (1, '0.003*في + 0.002*مِنَ + 0.002*ضمن + 0.002*الخبر + 0.002*تعبت + 0.002*يهتم + 0.002*أنه + 0.001*كده + 0.001*حادث + 0.001*منذ')
2017-12-16 18:19:09,632 : INFO : (2, '0.002*في + 0.002*اسوأ + 0.002*أصل + 0.001*داهيه + 0.001*وصلت + 0.001*يلعن + 0.001*المشهد + 0.001*اقسم + 0.001*نيك + 0.001*فقدت')
2017-12-16 18:19:09,639 : INFO : (3, '0.002*لسا + 0.002*بني + 0.002*، + 0.002*وقعت + 0.001*أنتا + 0.001*بلدي + 0.001*“ + 0.001*عظيم + 0.001*اغير + 0.001*الساعة')
2017-12-16 18:19:09,646 : INFO : (4, '0.002*جدع + 0.002*لكم + 0.001*عندنا + 0.001*نفرح + 0.001*الحاجة + 0.001*اختار + 0.001*يناير + 0.001*دولة + 0.001*التويتر + 0.001*و')
2017-12-16 18:19:09,652 : INFO : (5, '0.002*، + 0.001*وشكرا + 0.001*في + 0.001*الخليجيه + 0.001*يا + 0.001*يبقي + 0.001*🤗 + 0.001*دي + 0.001*عشانك + 0.001*واجعل')
2017-12-16 18:19:09,659 

2017-12-16 18:20:05,400 : INFO : (0, '0.011*، + 0.010*من + 0.010*مش + 0.009*و + 0.007*انا + 0.006*كدا + 0.006*في + 0.005*بس + 0.005*اللي + 0.004*حد')
2017-12-16 18:20:05,407 : INFO : (1, '0.003*في + 0.002*مِنَ + 0.002*و + 0.002*من + 0.002*ضمن + 0.002*مش + 0.002*الخبر + 0.002*تعبت + 0.002*كده + 0.002*يهتم')
2017-12-16 18:20:05,414 : INFO : (2, '0.002*في + 0.002*من + 0.002*و + 0.002*، + 0.002*اسوأ + 0.002*أصل + 0.002*مش + 0.001*داهيه + 0.001*وصلت + 0.001*يلعن')
2017-12-16 18:20:05,421 : INFO : (3, '0.002*، + 0.002*لسا + 0.002*بني + 0.002*و + 0.002*في + 0.002*وقعت + 0.002*من + 0.001*ما + 0.001*يا + 0.001*أنتا')
2017-12-16 18:20:05,428 : INFO : (4, '0.003*، + 0.003*من + 0.002*و + 0.002*كل + 0.002*في + 0.001*ده + 0.001*يملك + 0.001*مش + 0.001*كيفك + 0.001*أن')
2017-12-16 18:20:05,435 : INFO : (5, '0.002*و + 0.002*من + 0.002*في + 0.002*جدع + 0.001*لكم + 0.001*عندنا + 0.001*نفرح + 0.001*الحاجة + 0.001*اختار + 0.001*،')
2017-12-16 18:20:05,442 : INFO : (6, '0.002*في + 0.002*، + 0.002*و + 0.002

2017-12-16 18:21:01,114 : INFO : (2, '0.003*كدا + 0.003*في + 0.002*و + 0.002*من + 0.002*مش + 0.002*، + 0.002*اسوأ + 0.002*يا + 0.002*أصل + 0.001*الله')
2017-12-16 18:21:01,121 : INFO : (3, '0.003*كدا + 0.003*من + 0.003*، + 0.003*و + 0.002*في + 0.002*مش + 0.002*كل + 0.002*يا + 0.002*انا + 0.002*لا')
2017-12-16 18:21:01,128 : INFO : (4, '0.003*كدا + 0.002*و + 0.002*، + 0.002*من + 0.002*في + 0.002*يا + 0.002*لسا + 0.002*بني + 0.002*ما + 0.002*وقعت')
2017-12-16 18:21:01,135 : INFO : (5, '0.004*كدا + 0.003*و + 0.003*من + 0.003*في + 0.002*يا + 0.002*مش + 0.002*بس + 0.002*اللي + 0.002*، + 0.002*لو')
2017-12-16 18:21:01,141 : INFO : (6, '0.003*كدا + 0.003*و + 0.003*في + 0.002*من + 0.002*لا + 0.002*اللي + 0.002*بس + 0.002*مش + 0.002*، + 0.002*"')
2017-12-16 18:21:01,148 : INFO : (7, '0.003*كدا + 0.003*من + 0.002*و + 0.002*كريم + 0.002*في + 0.002*ف + 0.002*مش + 0.002*، + 0.002*يا + 0.002*مع')
2017-12-16 18:21:01,155 : INFO : (8, '0.003*كدا + 0.002*و + 0.002*في + 0.002*من + 0.002*انا + 0.002*يا +

2017-12-16 18:22:54,333 : INFO : (0, '0.020*كدا + 0.019*و + 0.017*مش + 0.016*انا + 0.016*من + 0.013*في + 0.011*بس + 0.010*، + 0.009*اللي + 0.009*ف')
2017-12-16 18:22:54,340 : INFO : (1, '0.004*في + 0.004*و + 0.003*من + 0.003*كدا + 0.003*مش + 0.003*انا + 0.003*يا + 0.002*اللي + 0.002*مِنَ + 0.002*ولا')
2017-12-16 18:22:54,346 : INFO : (2, '0.004*من + 0.004*و + 0.003*كدا + 0.003*، + 0.003*في + 0.003*مش + 0.003*انا + 0.003*يا + 0.002*كل + 0.002*لا')
2017-12-16 18:22:54,353 : INFO : (3, '0.004*في + 0.004*و + 0.003*من + 0.003*كدا + 0.003*مش + 0.003*انا + 0.003*يا + 0.002*، + 0.002*لا + 0.002*كل')
2017-12-16 18:22:54,360 : INFO : (4, '0.004*و + 0.004*من + 0.004*كدا + 0.004*يا + 0.003*في + 0.003*مش + 0.003*انا + 0.002*اللي + 0.002*بس + 0.002*لا')
2017-12-16 18:22:54,366 : INFO : (5, '0.004*و + 0.004*في + 0.003*كدا + 0.003*من + 0.003*مش + 0.003*لا + 0.003*اللي + 0.003*انا + 0.003*يا + 0.003*بس')
2017-12-16 18:22:54,373 : INFO : (6, '0.004*من + 0.003*و + 0.003*كدا + 0.003*في + 0.003*مش + 0.003*

2017-12-16 18:23:52,924 : INFO : (0, '0.021*و + 0.017*مش + 0.017*كدا + 0.017*من + 0.016*انا + 0.014*في + 0.011*بس + 0.010*اللي + 0.009*، + 0.008*يا')
2017-12-16 18:23:52,932 : INFO : (1, '0.005*في + 0.004*و + 0.004*من + 0.003*مش + 0.003*كدا + 0.003*يا + 0.003*انا + 0.003*اللي + 0.002*بس + 0.002*ولا')
2017-12-16 18:23:52,938 : INFO : (2, '0.005*من + 0.005*و + 0.004*في + 0.003*، + 0.003*كدا + 0.003*مش + 0.003*يا + 0.003*انا + 0.003*كل + 0.002*لا')
2017-12-16 18:23:52,945 : INFO : (3, '0.005*و + 0.004*من + 0.004*يا + 0.004*في + 0.004*مش + 0.004*كدا + 0.003*انا + 0.003*اللي + 0.002*بس + 0.002*لا')
2017-12-16 18:23:52,952 : INFO : (4, '0.004*في + 0.004*و + 0.004*من + 0.003*مش + 0.003*كدا + 0.003*يا + 0.003*انا + 0.002*، + 0.002*لا + 0.002*اللي')
2017-12-16 18:23:52,958 : INFO : (5, '0.005*و + 0.004*في + 0.004*من + 0.003*كدا + 0.003*مش + 0.003*يا + 0.003*اللي + 0.003*انا + 0.003*لا + 0.003*بس')
2017-12-16 18:23:52,965 : INFO : (6, '0.004*من + 0.004*و + 0.004*في + 0.003*كدا + 0.003*مش + 0.003

2017-12-16 18:24:53,155 : INFO : (18, '0.005*من + 0.004*و + 0.004*في + 0.003*يا + 0.003*مش + 0.003*انا + 0.003*كدا + 0.002*ولا + 0.002*بس + 0.002*ما')
2017-12-16 18:24:53,162 : INFO : (19, '0.004*و + 0.004*في + 0.004*من + 0.004*كدا + 0.004*انا + 0.003*مش + 0.003*يا + 0.003*اللي + 0.002*لا + 0.002*بس')
2017-12-16 18:24:53,164 : INFO : PROGRESS: finished document 69888 of 179632
2017-12-16 18:25:53,640 : INFO : (0, '0.021*و + 0.019*من + 0.018*مش + 0.018*انا + 0.016*في + 0.015*كدا + 0.011*بس + 0.010*اللي + 0.009*، + 0.009*يا')
2017-12-16 18:25:53,648 : INFO : (1, '0.006*في + 0.005*من + 0.005*و + 0.004*مش + 0.004*يا + 0.004*انا + 0.003*كدا + 0.003*اللي + 0.003*لا + 0.003*بس')
2017-12-16 18:25:53,655 : INFO : (2, '0.006*من + 0.005*و + 0.004*في + 0.004*يا + 0.004*مش + 0.004*انا + 0.004*، + 0.003*كدا + 0.003*كل + 0.003*لا')
2017-12-16 18:25:53,661 : INFO : (3, '0.006*من + 0.005*و + 0.005*يا + 0.005*في + 0.004*مش + 0.004*انا + 0.003*كدا + 0.003*اللي + 0.003*بس + 0.003*لا')
2017-12-16 18:25:53,

2017-12-16 18:28:00,334 : INFO : (0, '0.022*و + 0.019*من + 0.019*مش + 0.018*انا + 0.017*في + 0.013*كدا + 0.011*بس + 0.011*اللي + 0.009*يا + 0.009*،')
2017-12-16 18:28:00,341 : INFO : (1, '0.007*في + 0.006*من + 0.006*و + 0.005*يا + 0.004*مش + 0.004*انا + 0.003*اللي + 0.003*لا + 0.003*كدا + 0.003*بس')
2017-12-16 18:28:00,352 : INFO : (2, '0.007*من + 0.006*و + 0.005*في + 0.005*يا + 0.004*مش + 0.004*انا + 0.004*، + 0.003*لا + 0.003*كل + 0.003*كدا')
2017-12-16 18:28:00,359 : INFO : (3, '0.007*من + 0.006*و + 0.006*في + 0.006*يا + 0.005*مش + 0.004*انا + 0.003*كدا + 0.003*اللي + 0.003*بس + 0.003*لا')
2017-12-16 18:28:00,366 : INFO : (4, '0.006*من + 0.006*في + 0.006*و + 0.004*يا + 0.004*مش + 0.004*انا + 0.003*لا + 0.003*كدا + 0.003*اللي + 0.003*الله')
2017-12-16 18:28:00,372 : INFO : (5, '0.007*من + 0.006*و + 0.005*في + 0.005*يا + 0.004*مش + 0.004*انا + 0.004*، + 0.003*بس + 0.003*كدا + 0.003*اللي')
2017-12-16 18:28:00,379 : INFO : (6, '0.007*و + 0.007*من + 0.006*في + 0.005*يا + 0.004*انا + 0.00

2017-12-16 18:30:12,586 : INFO : (2, '0.008*في + 0.008*من + 0.006*و + 0.005*يا + 0.004*انا + 0.004*مش + 0.004*لا + 0.003*اللي + 0.003*بس + 0.003*ما')
2017-12-16 18:30:12,593 : INFO : (3, '0.008*من + 0.007*و + 0.007*في + 0.005*يا + 0.004*انا + 0.004*لا + 0.004*مش + 0.004*، + 0.003*اللي + 0.003*على')
2017-12-16 18:30:12,600 : INFO : (4, '0.008*من + 0.007*و + 0.007*في + 0.006*يا + 0.004*مش + 0.004*انا + 0.004*لا + 0.003*اللي + 0.003*بس + 0.003*على')
2017-12-16 18:30:12,607 : INFO : (5, '0.008*من + 0.007*و + 0.006*في + 0.005*يا + 0.004*انا + 0.004*، + 0.004*مش + 0.004*لا + 0.003*على + 0.003*بس')
2017-12-16 18:30:12,613 : INFO : (6, '0.008*من + 0.007*في + 0.006*و + 0.005*يا + 0.004*ما + 0.004*انا + 0.004*مش + 0.004*لا + 0.003*اللي + 0.003*كل')
2017-12-16 18:30:12,620 : INFO : (7, '0.008*من + 0.007*في + 0.006*و + 0.005*يا + 0.004*مش + 0.004*انا + 0.004*لا + 0.003*على + 0.003*ما + 0.003*بس')
2017-12-16 18:30:12,626 : INFO : (8, '0.007*من + 0.007*في + 0.006*و + 0.004*يا + 0.004*مش + 0.004*انا 

2017-12-16 18:32:23,051 : INFO : (5, '0.009*من + 0.008*في + 0.006*و + 0.005*يا + 0.005*ما + 0.004*انا + 0.004*لا + 0.004*الله + 0.004*مش + 0.004*على')
2017-12-16 18:32:23,058 : INFO : (6, '0.010*من + 0.007*في + 0.007*و + 0.005*يا + 0.005*، + 0.004*انا + 0.004*لا + 0.004*على + 0.004*مش + 0.004*بس')
2017-12-16 18:32:23,064 : INFO : (7, '0.009*من + 0.007*في + 0.006*و + 0.005*يا + 0.004*لا + 0.004*انا + 0.004*مش + 0.004*على + 0.004*ما + 0.003*بس')
2017-12-16 18:32:23,071 : INFO : (8, '0.009*من + 0.008*في + 0.006*و + 0.004*يا + 0.004*لا + 0.004*انا + 0.004*مش + 0.004*الله + 0.003*، + 0.003*كل')
2017-12-16 18:32:23,078 : INFO : (9, '0.009*من + 0.008*في + 0.007*و + 0.005*لا + 0.005*يا + 0.004*انا + 0.004*اللي + 0.004*ما + 0.004*مش + 0.004*بس')
2017-12-16 18:32:23,084 : INFO : (10, '0.010*من + 0.008*في + 0.007*و + 0.005*يا + 0.004*انا + 0.004*لا + 0.004*مش + 0.004*الله + 0.004*على + 0.004*كل')
2017-12-16 18:32:23,091 : INFO : (11, '0.008*من + 0.007*في + 0.007*و + 0.005*انا + 0.005*يا + 0.004*ل

2017-12-16 18:34:37,449 : INFO : (15, '0.011*من + 0.008*في + 0.007*و + 0.005*يا + 0.005*لا + 0.004*انا + 0.004*ما + 0.004*على + 0.004*مش + 0.004*الله')
2017-12-16 18:34:37,456 : INFO : (16, '0.010*من + 0.008*في + 0.006*و + 0.005*يا + 0.005*لا + 0.004*انا + 0.004*ما + 0.004*على + 0.004*الله + 0.003*مش')
2017-12-16 18:34:37,462 : INFO : (17, '0.010*من + 0.009*في + 0.007*و + 0.005*يا + 0.005*لا + 0.005*انا + 0.004*ما + 0.004*مش + 0.004*على + 0.004*الله')
2017-12-16 18:34:37,469 : INFO : (18, '0.010*من + 0.009*في + 0.007*و + 0.006*يا + 0.005*لا + 0.004*ما + 0.004*انا + 0.004*الله + 0.004*مش + 0.004*،')
2017-12-16 18:34:37,476 : INFO : (19, '0.010*من + 0.008*في + 0.006*و + 0.005*يا + 0.005*انا + 0.005*لا + 0.004*على + 0.004*ما + 0.004*الله + 0.004*اللي')
2017-12-16 18:34:37,478 : INFO : PROGRESS: finished document 159744 of 179632
2017-12-16 18:35:45,425 : INFO : (0, '0.022*و + 0.021*من + 0.018*في + 0.017*انا + 0.017*مش + 0.012*كدا + 0.012*اللي + 0.011*بس + 0.010*، + 0.009*ما')
2017-12-16 1

2017-12-16 18:36:56,563 : INFO : (3, '0.011*من + 0.010*في + 0.007*و + 0.005*لا + 0.005*يا + 0.005*الله + 0.004*انا + 0.004*على + 0.004*ما + 0.004*اللي')
2017-12-16 18:36:56,570 : INFO : (4, '0.012*من + 0.009*في + 0.007*و + 0.006*يا + 0.005*لا + 0.005*على + 0.004*ما + 0.004*انا + 0.004*الله + 0.004*اللي')
2017-12-16 18:36:56,577 : INFO : (5, '0.012*من + 0.009*في + 0.007*و + 0.005*، + 0.005*لا + 0.005*على + 0.005*يا + 0.005*الله + 0.005*ما + 0.004*انا')
2017-12-16 18:36:56,584 : INFO : (6, '0.012*من + 0.010*في + 0.007*و + 0.005*ما + 0.005*لا + 0.005*يا + 0.005*الله + 0.005*على + 0.004*انا + 0.004*،')
2017-12-16 18:36:56,591 : INFO : (7, '0.011*من + 0.009*في + 0.007*و + 0.005*لا + 0.005*يا + 0.005*انا + 0.005*الله + 0.004*على + 0.004*ما + 0.004*بس')
2017-12-16 18:36:56,598 : INFO : (8, '0.012*من + 0.010*في + 0.007*و + 0.006*يا + 0.005*لا + 0.005*الله + 0.005*على + 0.005*انا + 0.004*ما + 0.004*كل')
2017-12-16 18:36:56,604 : INFO : (9, '0.011*من + 0.009*في + 0.007*و + 0.005*لا + 0.005*على +

In [123]:
#print hdp results

In [124]:
from gensim import sklearn_api

In [None]:
gensim.sklearn_api.lsimodel

In [None]:
index = similarities.MatrixSimilarity(corpus_lsi) # index lsi corpus

In [111]:
from gensim.summarization.bm25 import get_bm25_weights

In [112]:
bm25_result = get_bm25_weights(corpus_tfidf)

KeyboardInterrupt: 

For later use

In [None]:
# class MyCorpus(object):
#     def __iter__(self):
#         for line in open('datasets/mycorpus.txt'):
#             # assume there's one document per line, tokens separated by whitespace
#             yield dictionary.doc2bow(line.lower().split())

In [None]:
#corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix)