## LSA & LDA 

___(With stop words)___

In [None]:
#!pip install pyldavis

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [2]:
pd.set_option('display.max_colwidth', -1)

_________________________________________________

#### Import pickled stop words from 
`Effects of Stop Words Elimination for Arabic Information Retrieval: A Comparative Study`

In [3]:
stop_words_df = pd.read_pickle('/home/jovyan/capstone-52/topic_modeling_experiments/pickled_stopwords/comp_study_stopwords.p')

In [4]:
stop_words_df.columns = ["stop_words"]

In [5]:
stop_words_df.columns

Index(['stop_words'], dtype='object')

In [6]:
iabuelkhair_stopwords = stop_words_df['stop_words'].tolist()

In [7]:
iabuelkhair_stopwords[:4]

['انها', 'اثناء', 'اجل', 'احدا']

### Access corpus through pickled MongoDB file

In [8]:
cd ../../Pickled_from_mongo/

/home/jovyan/capstone-52/Pickled_from_mongo


In [9]:
df = pd.read_pickle('../Pickled_from_mongo/combined_eg_gulf_200k_sample.p')

In [10]:
df.sample(2)

Unnamed: 0,_id,cleaned_geo,cleaned_name,cleaned_text,class
66870,5a2cbe6b204c9e0400cec618,اللي يسأل مايتوهش,amrdrar,مش لما يبقي عندنا ذكري اصلا,EG
66232,5a2cbe6b204c9e0400cec39a,اللي يسأل مايتوهش,amrdrar,حلو,EG


In [11]:
df = df.drop(['_id', 'cleaned_geo', 'cleaned_name'], axis=1)

In [12]:
df = df.drop_duplicates(['cleaned_text'], keep=False)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 179632 entries, 0 to 95683
Data columns (total 2 columns):
cleaned_text    179632 non-null object
class           179632 non-null object
dtypes: object(2)
memory usage: 4.1+ MB


## Benchmark LSA with stop words

### Label Encode the Categories


In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le = LabelEncoder()
df['class_numerical'] = le.fit_transform(df['class'])

### TFIDF

### Prepare Document Term Matrix


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf_vectorizer = TfidfVectorizer(min_df = 1, stop_words= iabuelkhair_stopwords)

In [None]:
document_term_matrix_sps = tfidf_vectorizer.fit_transform(df.cleaned_text)

In [None]:
document_term_matrix_sps


### Compute SVD of Document Term Matrix

In [None]:
from sklearn.decomposition import TruncatedSVD

In [None]:
n_components = 50
SVD = TruncatedSVD(n_components)
component_names = ["component_"+str(i+1) for i in range(n_components)]

In [None]:
svd_matrix = SVD.fit_transform(document_term_matrix_sps)

In [None]:
SVD.explained_variance_ratio_

In [None]:
svd_matrix.shape

### Load SVD Matrix with Documents and Labels


In [None]:
latent_semantic_analysis = pd.DataFrame(svd_matrix,
                                        index=df.index,
                                        columns=component_names)
latent_semantic_analysis['cleaned_text'] = df.cleaned_text
latent_semantic_analysis['class'] = df['class']

In [None]:
latent_semantic_analysis.head()


In [None]:
vocabulary_loadings = pd.DataFrame(SVD.components_,
                                   index=component_names,
                                   columns=tfidf_vectorizer.get_feature_names()).T

In [None]:
vocabulary_loadings['abs_component_1'] = np.abs(vocabulary_loadings.component_1)
vocabulary_loadings['abs_component_2'] = np.abs(vocabulary_loadings.component_2)

### Display Top Terms for Each Component

In [None]:
vocabulary_loadings.sort_values('abs_component_1',ascending=False).head(10)

In [None]:
vocabulary_loadings.sort_values('abs_component_2',ascending=False).head(10)

In [None]:
plt.figure(figsize=(7,7))
pc_1 = latent_semantic_analysis['component_1'].values
pc_2 = latent_semantic_analysis['component_2'].values

plt.scatter(pc_1, pc_2, c=df['class_numerical'], cmap='rainbow')

plt.xlabel('First PC')
plt.ylabel('Second PC')
plt.axvline(linewidth=0.5)
plt.axhline(linewidth=0.5)
plt.xlim(-.1,1)
plt.ylim(-.5,1)

In [None]:
plt.figure(figsize=(6,6))
pc_1 = latent_semantic_analysis['component_1'].values
pc_2 = latent_semantic_analysis['component_2'].values

strings = df['cleaned_text'].values
for i, (x, y) in enumerate(zip(pc_1, pc_2)): 
    plt.text(x,y,strings[i][:10])

plt.scatter(pc_1, pc_2, c=df['class_numerical'], cmap='rainbow')

plt.xlabel('First PC')
plt.ylabel('Second PC')
plt.axvline(linewidth=0.5)
plt.axhline(linewidth=0.5)
plt.xlim(-.1,1)
plt.ylim(-.1,1)

In [None]:
plt.figure(figsize=(6,6))
pc_1 = latent_semantic_analysis['component_1'].values
pc_2 = latent_semantic_analysis['component_2'].values

plt.scatter(pc_1, pc_2, c=df['class_numerical'], cmap='rainbow')

plt.xlabel('First PC')
plt.ylabel('Second PC')
plt.axvline(linewidth=0.5)
plt.axhline(linewidth=0.5)
plt.xlim(-.01,.5)
plt.ylim(-.3,.6)

In [None]:
eg_mask = latent_semantic_analysis['class'] == 'EG'

In [None]:
latent_semantic_analysis[eg_mask][:5]

In [None]:
gulf_mask = latent_semantic_analysis['class'] == 'GULF'

In [None]:
latent_semantic_analysis[gulf_mask][:5]

In [None]:
latent_semantic_analysis[(latent_semantic_analysis['class'] == 'EG') 
                         & (latent_semantic_analysis.component_2 > .050)][:5]

In [None]:
latent_semantic_analysis[(latent_semantic_analysis['class'] == 'GULF') 
                         & (latent_semantic_analysis.component_2 > .50)][:5]

## GENSIM

In [None]:
#!pip install -U gensim

In [17]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [18]:
import nltk
import gensim
from nltk.text import Text  
from gensim import models, corpora, similarities

2017-12-16 22:50:12,995 : INFO : 'pattern' package not found; tag filters are not available for English


In [22]:
df.sample(2)

Unnamed: 0,cleaned_text,class
34836,الحمدلله من قبل ومن بعد بكره عمليه اخرى لي ونقول ي رب عفوك وتوفيقك ، دعواتكم,GULF
36026,مشاء الله الساعه,EG


In [27]:
df["cleaned_text"] = df["cleaned_text"].map(lambda x: x.split(' '))

In [24]:
df["cleaned_text"].sample(2)

23482    [علي, كدا, روان, دي, ظروفها, ايه]                                                                                                                         
34106    [امرنا, الله, بالامتثال, لاوامر, الوالدين, مادام, ليس, بها, معصية, للخالق،, وانا, شخصيا, م, اذكر, اني, خالفت, اهلي, بشي, ومشيت, ع, رأيي, الا, واتسحف, وا…]
Name: cleaned_text, dtype: object

#### Removing stop words from tokens

In [135]:
len(iabuelkhair_stopwords)

1590

In [27]:
iabuelkhair_stopwords[:3]

['انها', 'اثناء', 'اجل']

In [127]:
def stopword_percentage(text_eda, text_eda_no_stops):
    increase = (text_eda_no_stops) - (text_eda)
    return (increase / text_eda) * 100

In [120]:
empty_list = []
for sentence in df['cleaned_text']:
    empty_list.append([word for word in sentence if word not in iabuelkhair_stopwords])

In [129]:
#texts = [[word for word in document if word not in iabuelkhair_stopwords] for document in df['cleaned_text']]

In [128]:
old_sum = 0
new_sum = 0

for old, new in zip(list(df['cleaned_text']), empty_list):
    old_sum += len(old)
    new_sum += len(new)
    
stopword_percentage(old_sum, new_sum)

-18.615655987951158

In [129]:
df['cleaned_text'] = empty_list

In [130]:
df["cleaned_text"][:4]

0    [أزاي, أقول, زمان, والماضي, الغيب, بكره, واللي, أحنا, دلوقت, كمان, ح, يفوت, ولا, ندري]                                      
1    [آراء, آه, بس, أزاي, أجويرو, منتهي, يعني, أمال, مكنش, الهداف, التاريخي, للسيتي, و, موسم, اقل, الاهداف…]                     
2    [صيني, أزاي, تقارن, شادي, بالخطيب, ألي, الوحيد, ألي, خد, الكورة, الذهبية, مصر, وبشهادت, الزملكاوي, الأ…]                    
4    [التوينز, اللي, معاها, الحلو, والوحش, والمصايب, والاكل, والفشل, مش, عارفه, هعيش, أزاي, ربنا, يخليكي, ليا, ومش, تويته, توصفك]
Name: cleaned_text, dtype: object

#### Pass `df["cleaned_text"]` with gensim's Dictionary representation:

In [28]:
dictionary = corpora.Dictionary(df["cleaned_text"])

2017-12-16 22:56:23,317 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-12-16 22:56:23,590 : INFO : adding document #10000 to Dictionary(34127 unique tokens: ['أحنا', 'أزاي', 'أقول', 'الغيب', 'بكره']...)
2017-12-16 22:56:23,829 : INFO : adding document #20000 to Dictionary(53319 unique tokens: ['أحنا', 'أزاي', 'أقول', 'الغيب', 'بكره']...)
2017-12-16 22:56:24,141 : INFO : adding document #30000 to Dictionary(69263 unique tokens: ['أحنا', 'أزاي', 'أقول', 'الغيب', 'بكره']...)
2017-12-16 22:56:24,472 : INFO : adding document #40000 to Dictionary(80421 unique tokens: ['أحنا', 'أزاي', 'أقول', 'الغيب', 'بكره']...)
2017-12-16 22:56:24,720 : INFO : adding document #50000 to Dictionary(93339 unique tokens: ['أحنا', 'أزاي', 'أقول', 'الغيب', 'بكره']...)
2017-12-16 22:56:24,939 : INFO : adding document #60000 to Dictionary(106599 unique tokens: ['أحنا', 'أزاي', 'أقول', 'الغيب', 'بكره']...)
2017-12-16 22:56:25,167 : INFO : adding document #70000 to Dictionary(117032 unique tokens

#### Use filter_extremes method to eliminate the 5000 most frequent words

In [48]:
dictionary.filter_extremes(no_below=1, no_above=1, keep_n=5000)

#### Alternatively, 

In [29]:
# Remove rare and common tokens.
# Filter out words that occur too frequently or too rarely.
max_freq = 0.5
min_wordcount = 20
dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)

_ = dictionary[0]  # This sort of "initializes" dictionary.id2token.

2017-12-16 22:56:49,971 : INFO : discarding 255877 tokens: [('والماضي', 6), ('يفوت', 9), ('آراء', 12), ('أجويرو', 2), ('أمال', 10), ('الاهداف…', 1), ('الهداف', 8), ('للسيتي', 3), ('ألي', 7), ('الذهبية', 10)]...
2017-12-16 22:56:49,979 : INFO : keeping 9802 tokens which were in no less than 20 and no more than 89816 (=50.0%) documents
2017-12-16 22:56:50,078 : INFO : resulting dictionary: Dictionary(9802 unique tokens: ['أحنا', 'أزاي', 'أقول', 'الغيب', 'بكره']...)


#### Compactify data to make up for any missing labels

In [30]:
dictionary.compactify()

#### Use doc2bow method to get bag of words representation (word_id, frequency) 

In [31]:
corpus = [dictionary.doc2bow(text) for text in df["cleaned_text"]]

In [None]:
#dictionary.doc2bow(dictionary, allow_update=True, return_missing=False)

#### Save corpus in Market Matrix format 

In [32]:
pwd

'/home/jovyan/capstone-52/Pickled_from_mongo'

In [34]:
cd ../market_matrix_files/

/home/jovyan/capstone-52/market_matrix_files


In [35]:
corpora.MmCorpus.serialize('../market_matrix_files/corpus_filter_2.mm', corpus)

2017-12-16 22:59:39,459 : INFO : storing corpus in Matrix Market format to ../market_matrix_files/corpus_filter_2.mm
2017-12-16 22:59:39,462 : INFO : saving sparse matrix to ../market_matrix_files/corpus_filter_2.mm
2017-12-16 22:59:39,463 : INFO : PROGRESS: saving document #0
2017-12-16 22:59:39,484 : INFO : PROGRESS: saving document #1000
2017-12-16 22:59:39,500 : INFO : PROGRESS: saving document #2000
2017-12-16 22:59:39,518 : INFO : PROGRESS: saving document #3000
2017-12-16 22:59:39,536 : INFO : PROGRESS: saving document #4000
2017-12-16 22:59:39,554 : INFO : PROGRESS: saving document #5000
2017-12-16 22:59:39,573 : INFO : PROGRESS: saving document #6000
2017-12-16 22:59:39,591 : INFO : PROGRESS: saving document #7000
2017-12-16 22:59:39,612 : INFO : PROGRESS: saving document #8000
2017-12-16 22:59:39,633 : INFO : PROGRESS: saving document #9000
2017-12-16 22:59:39,653 : INFO : PROGRESS: saving document #10000
2017-12-16 22:59:39,673 : INFO : PROGRESS: saving document #11000
2017-

2017-12-16 22:59:41,768 : INFO : PROGRESS: saving document #121000
2017-12-16 22:59:41,785 : INFO : PROGRESS: saving document #122000
2017-12-16 22:59:41,801 : INFO : PROGRESS: saving document #123000
2017-12-16 22:59:41,819 : INFO : PROGRESS: saving document #124000
2017-12-16 22:59:41,831 : INFO : PROGRESS: saving document #125000
2017-12-16 22:59:41,843 : INFO : PROGRESS: saving document #126000
2017-12-16 22:59:41,854 : INFO : PROGRESS: saving document #127000
2017-12-16 22:59:41,868 : INFO : PROGRESS: saving document #128000
2017-12-16 22:59:41,891 : INFO : PROGRESS: saving document #129000
2017-12-16 22:59:41,904 : INFO : PROGRESS: saving document #130000
2017-12-16 22:59:41,916 : INFO : PROGRESS: saving document #131000
2017-12-16 22:59:41,928 : INFO : PROGRESS: saving document #132000
2017-12-16 22:59:41,945 : INFO : PROGRESS: saving document #133000
2017-12-16 22:59:41,965 : INFO : PROGRESS: saving document #134000
2017-12-16 22:59:41,986 : INFO : PROGRESS: saving document #13

#### Load corpus iterator from Matrix Market file

In [36]:
corpus = corpora.MmCorpus('../market_matrix_files/corpus_filter_2.mm')

2017-12-16 22:59:58,594 : INFO : loaded corpus index from ../market_matrix_files/corpus_filter_2.mm.index
2017-12-16 22:59:58,600 : INFO : initializing corpus reader from ../market_matrix_files/corpus_filter_2.mm
2017-12-16 22:59:58,601 : INFO : accepted corpus with 179632 documents, 9802 features, 1192060 non-zero entries


#### Explore the transformed corpus

In [37]:
print(corpus[2])

[(1, 1.0), (26, 1.0), (32, 1.0), (33, 1.0), (34, 1.0), (35, 1.0), (36, 1.0), (37, 1.0), (38, 1.0), (39, 1.0), (40, 1.0)]


## LDA with stopwords

#### Initialize LDA (can only use bow with LDA)

In [38]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=50, id2word = dictionary)

2017-12-16 23:00:50,095 : INFO : using symmetric alpha at 0.02
2017-12-16 23:00:50,097 : INFO : using symmetric eta at 0.02
2017-12-16 23:00:50,100 : INFO : using serial LDA version on this node
2017-12-16 23:00:56,686 : INFO : running online (single-pass) LDA training, 50 topics, 1 passes over the supplied corpus of 179632 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
2017-12-16 23:00:56,750 : INFO : PROGRESS: pass 0, at document #2000/179632
2017-12-16 23:01:00,056 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 23:01:00,167 : INFO : topic #24 (0.020): 0.024*"ده" + 0.019*"اهه" + 0.017*"بس" + 0.016*"فـ" + 0.016*"اوي" + 0.016*"مش" + 0.016*"ايه" + 0.015*"والله" + 0.012*"هو" + 0.012*"الحمد"
2017-12-16 23:01:00,170 : INFO : topic #31 (0.020): 0.038*"بس" + 0.029*"في" + 0.027*"دي" + 0.024*"لا" + 0.019*"هههههههههههههههههههه" + 0.019*"على" + 0.014

2017-12-16 23:01:22,373 : INFO : topic #45 (0.020): 0.069*"كدا" + 0.044*"قبل" + 0.039*"جدا" + 0.031*"أنت" + 0.029*"اصلا" + 0.029*"ما" + 0.028*"يا" + 0.023*"النهارده" + 0.022*"عني" + 0.017*"في"
2017-12-16 23:01:22,375 : INFO : topic #12 (0.020): 0.212*"أنا" + 0.042*"إني" + 0.036*"في" + 0.027*"منه" + 0.025*"من" + 0.020*"حب" + 0.016*"الله" + 0.015*"أكبر" + 0.015*"اللي" + 0.015*"مش"
2017-12-16 23:01:22,378 : INFO : topic diff=0.300669, rho=0.377964
2017-12-16 23:01:22,418 : INFO : PROGRESS: pass 0, at document #16000/179632
2017-12-16 23:01:25,442 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 23:01:25,767 : INFO : topic #5 (0.020): 0.051*"نفس" + 0.034*"قلب" + 0.034*"دول" + 0.033*"اللي" + 0.033*"رمضان" + 0.027*"لكل" + 0.021*"منين" + 0.020*"قدام" + 0.020*"اخر" + 0.020*"فيا"
2017-12-16 23:01:25,770 : INFO : topic #43 (0.020): 0.050*"خالص" + 0.031*"بس" + 0.029*"شويه" + 0.027*"حصل" + 0.026*"مكان" + 0.023*"كان" + 0.021*"بينا" + 0.021*"علي" + 0.019*"لا" 

2017-12-16 23:01:51,368 : INFO : topic #12 (0.020): 0.119*"أنا" + 0.031*"حب" + 0.028*"منه" + 0.027*"وانتي" + 0.024*"إني" + 0.024*"ازاى" + 0.024*"باين" + 0.023*"في" + 0.021*"من" + 0.020*"بس"
2017-12-16 23:01:51,370 : INFO : topic #18 (0.020): 0.055*"اه" + 0.047*"مش" + 0.044*"احسن" + 0.035*"انا" + 0.035*"اللى" + 0.034*"لو" + 0.026*"بس" + 0.023*"انت" + 0.022*"لا" + 0.021*"حد"
2017-12-16 23:01:51,372 : INFO : topic #0 (0.020): 0.157*"الي" + 0.054*"ما" + 0.036*"على" + 0.036*"ل" + 0.036*"ا…" + 0.019*"طبيعي" + 0.019*"في" + 0.018*"من" + 0.017*"ح" + 0.016*"كل"
2017-12-16 23:01:51,375 : INFO : topic diff=0.224736, rho=0.267261
2017-12-16 23:01:51,409 : INFO : PROGRESS: pass 0, at document #30000/179632
2017-12-16 23:01:54,129 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 23:01:54,336 : INFO : topic #2 (0.020): 0.311*"عشان" + 0.077*"اى" + 0.044*"ب" + 0.034*"بئا" + 0.025*"الاول" + 0.022*"معرفش" + 0.022*"هما" + 0.018*"من" + 0.018*"بس" + 0.018*"الحمدلله"
20

2017-12-16 23:02:16,387 : INFO : topic #22 (0.020): 0.054*"وانت" + 0.043*"انك" + 0.041*"كل" + 0.040*"قد" + 0.033*"من" + 0.031*"طيب" + 0.028*"تكون" + 0.026*"احلى" + 0.026*"قلبك" + 0.025*"حاجات"
2017-12-16 23:02:16,390 : INFO : topic #24 (0.020): 0.116*"محدش" + 0.061*"واحده" + 0.042*"سنه" + 0.036*"احساس" + 0.034*"زى" + 0.025*"ههههه" + 0.025*"هى" + 0.023*"فـ" + 0.023*"روح" + 0.021*"ضحك"
2017-12-16 23:02:16,393 : INFO : topic #12 (0.020): 0.184*"أنا" + 0.039*"منه" + 0.035*"حب" + 0.032*"إني" + 0.027*"اذاكر" + 0.021*"وانتي" + 0.020*"من" + 0.019*"في" + 0.019*"باين" + 0.017*"مش"
2017-12-16 23:02:16,395 : INFO : topic #8 (0.020): 0.064*"حياتي" + 0.063*"عليا" + 0.047*"نفسك" + 0.038*"الواحد" + 0.036*"لما" + 0.035*"كان" + 0.034*"من" + 0.030*"لله" + 0.027*"الحمد" + 0.025*"حد"
2017-12-16 23:02:16,398 : INFO : topic #31 (0.020): 0.318*"ف" + 0.064*"وانا" + 0.048*"بالله" + 0.042*"انا" + 0.040*"قلبي" + 0.038*"خلاص" + 0.036*"اقسم" + 0.027*"مهما" + 0.017*"دي" + 0.013*"الكليه"
2017-12-16 23:02:16,400 : INF

2017-12-16 23:02:35,507 : INFO : topic diff=0.178135, rho=0.192450
2017-12-16 23:02:35,549 : INFO : PROGRESS: pass 0, at document #56000/179632
2017-12-16 23:02:38,283 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 23:02:38,420 : INFO : topic #39 (0.020): 0.071*"*" + 0.067*"رب" + 0.056*"حتى" + 0.045*"صح" + 0.032*"إنه" + 0.031*"اما" + 0.028*"’" + 0.026*"له" + 0.025*"حوار" + 0.024*"هه"
2017-12-16 23:02:38,423 : INFO : topic #3 (0.020): 0.159*"(" + 0.132*""" + 0.088*"…" + 0.075*"فيها" + 0.026*"أن" + 0.018*"انتو" + 0.015*"عارفين" + 0.015*"كل" + 0.014*"من" + 0.014*"في"
2017-12-16 23:02:38,425 : INFO : topic #19 (0.020): 0.137*"اي" + 0.056*"حاجه" + 0.040*"شباب" + 0.037*"تويتر" + 0.032*"عندك" + 0.028*"في" + 0.026*"اهو" + 0.025*"بقا" + 0.022*"شبه" + 0.022*"إحنا"
2017-12-16 23:02:38,428 : INFO : topic #23 (0.020): 0.154*"ولا" + 0.150*"ايه" + 0.087*"بقي" + 0.051*"انا" + 0.041*"مش" + 0.036*"ده" + 0.027*"الدنيا" + 0.026*"ي" + 0.022*"اللي" + 0.021*"م"
2017-

2017-12-16 23:02:59,382 : INFO : topic #9 (0.020): 0.163*"فى" + 0.115*"الناس" + 0.054*"كوباية" + 0.052*"من" + 0.047*"اكتر" + 0.029*"مع" + 0.025*"ياريت" + 0.024*"اللي" + 0.020*"حد" + 0.018*"لما"
2017-12-16 23:02:59,384 : INFO : topic #7 (0.020): 0.125*"الشتا" + 0.062*"اول" + 0.061*"اللهم" + 0.033*"لكن" + 0.030*"عمرو" + 0.027*"لان" + 0.023*"الوكر" + 0.022*"ساعة" + 0.018*"دايما" + 0.017*"انتى"
2017-12-16 23:02:59,387 : INFO : topic diff=0.143337, rho=0.171499
2017-12-16 23:02:59,436 : INFO : PROGRESS: pass 0, at document #70000/179632
2017-12-16 23:03:02,400 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 23:03:02,525 : INFO : topic #14 (0.020): 0.088*"تاني" + 0.041*"شئ" + 0.033*"من" + 0.033*"حد" + 0.025*"انا" + 0.024*"عملت" + 0.020*"ازيك" + 0.019*"الايام" + 0.019*"كل" + 0.018*"اللي"
2017-12-16 23:03:02,528 : INFO : topic #2 (0.020): 0.257*"عشان" + 0.048*"هما" + 0.041*"ب" + 0.031*"الاول" + 0.029*"اى" + 0.023*"معرفش" + 0.021*"رغم" + 0.021*"بس" + 0.0

2017-12-16 23:03:23,250 : INFO : topic #20 (0.020): 0.082*"فين" + 0.031*"التي" + 0.029*"يابنتي" + 0.029*"راح" + 0.027*"وربنا" + 0.025*"عايزين" + 0.025*"احمد" + 0.025*"الله" + 0.021*"تلاقي" + 0.021*"برضه"
2017-12-16 23:03:23,252 : INFO : topic #8 (0.020): 0.065*"لما" + 0.051*"كدة" + 0.050*"نفسك" + 0.043*"حياتي" + 0.039*"عليا" + 0.037*"من" + 0.035*"الواحد" + 0.033*"كان" + 0.030*"لله" + 0.028*"حد"
2017-12-16 23:03:23,254 : INFO : topic #44 (0.020): 0.277*"،" + 0.074*"نفسي" + 0.066*"عارف" + 0.042*"منك" + 0.034*"فشخ" + 0.026*"مش" + 0.021*"هناك" + 0.020*"انا" + 0.020*"وبعدين" + 0.014*"ينفع"
2017-12-16 23:03:23,257 : INFO : topic #14 (0.020): 0.075*"تاني" + 0.042*"حد" + 0.038*"من" + 0.037*"شئ" + 0.025*"كل" + 0.025*"انا" + 0.024*"اللي" + 0.021*"الايام" + 0.020*"عملت" + 0.019*"مش"
2017-12-16 23:03:23,260 : INFO : topic diff=0.086276, rho=0.156174
2017-12-16 23:03:23,301 : INFO : PROGRESS: pass 0, at document #84000/179632
2017-12-16 23:03:26,012 : INFO : merging changes from 2000 documents into

2017-12-16 23:03:42,961 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 23:03:43,095 : INFO : topic #29 (0.020): 0.088*"لازم" + 0.062*"خير" + 0.054*"يسعد" + 0.049*"الحياة" + 0.037*"كبير" + 0.031*"على" + 0.022*"كل" + 0.021*"انتوا" + 0.019*"مساك" + 0.018*"الكلية"
2017-12-16 23:03:43,098 : INFO : topic #15 (0.020): 0.102*"ال" + 0.093*"إن" + 0.065*"في" + 0.045*"العالم" + 0.025*"اروح" + 0.023*"اكل" + 0.017*"الجو" + 0.016*"الصبر" + 0.015*"مما" + 0.014*"من"
2017-12-16 23:03:43,101 : INFO : topic #31 (0.020): 0.168*"ف" + 0.085*"وانا" + 0.072*"قلبي" + 0.056*"خلاص" + 0.049*"بالله" + 0.031*"شنو" + 0.029*"مهما" + 0.023*"اقسم" + 0.020*"انا" + 0.013*"العظيم"
2017-12-16 23:03:43,103 : INFO : topic #32 (0.020): 0.074*"او" + 0.070*"ممكن" + 0.060*"مو" + 0.045*"لك" + 0.044*"يكون" + 0.033*"حق" + 0.026*"يبقى" + 0.026*"البيت" + 0.025*"فعلا" + 0.022*"يمكن"
2017-12-16 23:03:43,106 : INFO : topic #22 (0.020): 0.053*"طيب" + 0.048*"من" + 0.044*"كل" + 0.043*"وانت" + 0.036*

2017-12-16 23:04:04,006 : INFO : topic #19 (0.020): 0.151*"اي" + 0.062*"إلى" + 0.041*"هلا" + 0.037*"حاجه" + 0.035*"|" + 0.034*"في" + 0.032*"قلت" + 0.029*"كيف" + 0.024*"عندك" + 0.022*"حزب"
2017-12-16 23:04:04,009 : INFO : topic diff=0.093660, rho=0.136083
2017-12-16 23:04:04,061 : INFO : PROGRESS: pass 0, at document #110000/179632
2017-12-16 23:04:07,001 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 23:04:07,145 : INFO : topic #14 (0.020): 0.040*"من" + 0.035*"اهم" + 0.034*"وزير" + 0.032*"ادري" + 0.031*"تاني" + 0.025*"بـ" + 0.021*"كل" + 0.020*"في" + 0.018*"أبناء" + 0.017*"حد"
2017-12-16 23:04:07,148 : INFO : topic #11 (0.020): 0.157*"بعد" + 0.108*"من" + 0.093*"غير" + 0.070*"أو" + 0.026*"الشخص" + 0.024*"عنك" + 0.022*"مفيش" + 0.019*"القانون" + 0.012*"السابق" + 0.012*"وأنا"
2017-12-16 23:04:07,150 : INFO : topic #32 (0.020): 0.095*"مو" + 0.070*"او" + 0.046*"يكون" + 0.045*"لك" + 0.042*"حق" + 0.032*"ممكن" + 0.028*"ضد" + 0.022*"ان" + 0.020*"عن" + 0.0

2017-12-16 23:04:28,361 : INFO : topic #44 (0.020): 0.391*"،" + 0.048*"منك" + 0.042*"نفسي" + 0.030*"هناك" + 0.020*"شيئًا" + 0.017*"الشيخ" + 0.013*"م…" + 0.013*"وصلت" + 0.012*"واضح" + 0.011*"كل"
2017-12-16 23:04:28,364 : INFO : topic #9 (0.020): 0.143*"الناس" + 0.079*"من" + 0.048*"ومن" + 0.039*"اللهُمَ" + 0.035*"فى" + 0.031*"هذي" + 0.025*"اللي" + 0.025*"الأشياء" + 0.024*"على" + 0.023*"كل"
2017-12-16 23:04:28,366 : INFO : topic #41 (0.020): 0.081*"في" + 0.065*"أحد" + 0.057*"كانت" + 0.047*"لها" + 0.043*"اقول" + 0.028*"كلها" + 0.024*"بنت" + 0.020*"مصر" + 0.020*"جميلة" + 0.018*"لحظة"
2017-12-16 23:04:28,369 : INFO : topic diff=0.039822, rho=0.128037
2017-12-16 23:04:28,413 : INFO : PROGRESS: pass 0, at document #124000/179632
2017-12-16 23:04:31,089 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 23:04:31,225 : INFO : topic #47 (0.020): 0.067*"مثل" + 0.052*"من" + 0.047*"بكل" + 0.043*"معك" + 0.034*"بأن" + 0.034*"بلا" + 0.027*"لأن" + 0.026*"لكم" + 0.02

2017-12-16 23:04:47,001 : INFO : topic #12 (0.020): 0.071*"أنا" + 0.042*"منه" + 0.039*"من" + 0.036*"الكويت" + 0.032*"كم" + 0.027*"حب" + 0.026*"في" + 0.022*"الشعب" + 0.021*"أكبر" + 0.020*"آخر"
2017-12-16 23:04:47,003 : INFO : topic #40 (0.020): 0.081*"عليه" + 0.052*"شخص" + 0.044*"من" + 0.040*"محد" + 0.034*"ربي" + 0.031*"كما" + 0.031*"حين" + 0.028*"على" + 0.026*"وكل" + 0.024*"بنات"
2017-12-16 23:04:47,006 : INFO : topic #33 (0.020): 0.086*"الا" + 0.086*"الذي" + 0.059*"من" + 0.057*"هي" + 0.043*"اكثر" + 0.034*"الى" + 0.022*"على" + 0.021*"هههههه" + 0.019*"ياخي" + 0.019*"في"
2017-12-16 23:04:47,008 : INFO : topic #7 (0.020): 0.074*"اللهم" + 0.066*"لكن" + 0.045*"بدون" + 0.040*"اول" + 0.034*"من" + 0.033*"فيني" + 0.032*"تم" + 0.028*"منو" + 0.027*"لان" + 0.025*"النصر"
2017-12-16 23:04:47,011 : INFO : topic diff=0.104805, rho=0.121268
2017-12-16 23:04:47,068 : INFO : PROGRESS: pass 0, at document #138000/179632
2017-12-16 23:04:49,822 : INFO : merging changes from 2000 documents into a model of 1

2017-12-16 23:05:06,172 : INFO : topic diff=0.056751, rho=0.116248
2017-12-16 23:05:06,213 : INFO : PROGRESS: pass 0, at document #150000/179632
2017-12-16 23:05:08,843 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 23:05:08,970 : INFO : topic #17 (0.020): 0.111*"انا" + 0.076*"اني" + 0.061*"بس" + 0.055*"كنت" + 0.048*"ما" + 0.025*"اويلي" + 0.025*"ودي" + 0.023*"من" + 0.022*"منها" + 0.021*"احس"
2017-12-16 23:05:08,973 : INFO : topic #25 (0.020): 0.160*"هذا" + 0.095*"لي" + 0.064*"علي" + 0.041*"من" + 0.029*"به" + 0.029*"ترا" + 0.028*"ما" + 0.018*"مبروك" + 0.017*"في" + 0.015*"اشوف"
2017-12-16 23:05:08,976 : INFO : topic #42 (0.020): 0.524*"لا" + 0.056*"إلا" + 0.041*"إذا" + 0.020*"ذي" + 0.019*"ما" + 0.014*"️️" + 0.012*"انا" + 0.012*"من" + 0.011*"الحياه" + 0.011*"على"
2017-12-16 23:05:08,978 : INFO : topic #0 (0.020): 0.135*"الي" + 0.069*"على" + 0.063*"محمد" + 0.037*"من" + 0.032*"في" + 0.027*"ا…" + 0.027*"متى" + 0.023*"ما" + 0.016*"العربية" + 0.015*"ال

2017-12-16 23:05:30,945 : INFO : topic #34 (0.020): 0.048*"الاتحاد" + 0.044*"ثم" + 0.041*"قناة" + 0.040*"ولو" + 0.039*"الحين" + 0.038*"العرب" + 0.037*"نحن" + 0.035*"تقول" + 0.034*"عادي" + 0.031*"في"
2017-12-16 23:05:30,948 : INFO : topic #1 (0.020): 0.102*"صباح" + 0.090*"هذه" + 0.068*"عليك" + 0.060*"الخير" + 0.042*"عند" + 0.037*"فقط" + 0.035*"صباحك" + 0.027*"منهم" + 0.024*"جميل" + 0.020*"لمن"
2017-12-16 23:05:30,950 : INFO : topic diff=0.042443, rho=0.111111
2017-12-16 23:05:30,988 : INFO : PROGRESS: pass 0, at document #164000/179632
2017-12-16 23:05:33,460 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 23:05:33,580 : INFO : topic #37 (0.020): 0.117*"عن" + 0.070*"بل" + 0.052*"عندي" + 0.034*"فلا" + 0.033*"،،" + 0.025*"من" + 0.024*"منذ" + 0.023*"فقد" + 0.021*"شوي" + 0.017*"تاريخ"
2017-12-16 23:05:33,583 : INFO : topic #32 (0.020): 0.091*"لك" + 0.082*"مو" + 0.048*"او" + 0.038*"ضد" + 0.034*"يكون" + 0.026*"حق" + 0.022*"بخير" + 0.021*"يمكن" + 0.021*

2017-12-16 23:05:50,908 : INFO : topic #32 (0.020): 0.137*"لك" + 0.055*"مو" + 0.046*"او" + 0.041*"يكون" + 0.024*"ضد" + 0.022*"بخير" + 0.021*"يمكن" + 0.019*"من" + 0.019*"وهذا" + 0.018*"حق"
2017-12-16 23:05:50,910 : INFO : topic #29 (0.020): 0.088*"خير" + 0.087*"الحياة" + 0.038*"نعم" + 0.036*"لازم" + 0.031*"على" + 0.030*"كل" + 0.026*"صورة" + 0.024*"يسعد" + 0.020*"الأرض" + 0.020*"كبير"
2017-12-16 23:05:50,913 : INFO : topic #48 (0.020): 0.078*"في" + 0.070*"من" + 0.056*"قال" + 0.054*"الوطني" + 0.038*"ال…" + 0.036*"أجمل" + 0.033*"أكثر" + 0.024*"المجيد" + 0.019*"عيني" + 0.018*"القطري"
2017-12-16 23:05:50,916 : INFO : topic diff=0.031262, rho=0.106600
2017-12-16 23:05:50,956 : INFO : PROGRESS: pass 0, at document #178000/179632
2017-12-16 23:05:53,572 : INFO : merging changes from 2000 documents into a model of 179632 documents
2017-12-16 23:05:53,697 : INFO : topic #47 (0.020): 0.074*"مثل" + 0.052*"بكل" + 0.043*"من" + 0.036*"لكم" + 0.036*"معك" + 0.031*"الذين" + 0.030*"بلا" + 0.029*"بأن" + 0.

#### Print topics and the top terms associated with them 

In [142]:
print(ldamodel.print_topics(num_topics=500, num_words=3))

2017-12-16 21:57:18,559 : INFO : topic #0 (0.333): 0.055*"و" + 0.032*"يا" + 0.027*"الله"
2017-12-16 21:57:18,560 : INFO : topic #1 (0.333): 0.048*"من" + 0.041*"في" + 0.023*"لا"
2017-12-16 21:57:18,561 : INFO : topic #2 (0.333): 0.024*"انا" + 0.019*"من" + 0.017*"بس"


[(0, '0.055*"و" + 0.032*"يا" + 0.027*"الله"'), (1, '0.048*"من" + 0.041*"في" + 0.023*"لا"'), (2, '0.024*"انا" + 0.019*"من" + 0.017*"بس"')]


#### Save the LDA model

In [23]:
#ldamodel.save('../topic_modeling_experiments/lsa_lda/model_lda.pkl') 
lda = models.LdaModel.load('../topic_modeling_experiments/lsa_lda/model.lda')

2017-12-16 22:54:34,124 : INFO : loading LdaModel object from ../topic_modeling_experiments/lsa_lda/model.lda
2017-12-16 22:54:34,127 : INFO : loading expElogbeta from ../topic_modeling_experiments/lsa_lda/model.lda.expElogbeta.npy with mmap=None
2017-12-16 22:54:34,130 : INFO : setting ignored attribute dispatcher to None
2017-12-16 22:54:34,132 : INFO : setting ignored attribute state to None
2017-12-16 22:54:34,132 : INFO : setting ignored attribute id2word to None
2017-12-16 22:54:34,133 : INFO : loaded ../topic_modeling_experiments/lsa_lda/model.lda
2017-12-16 22:54:34,134 : INFO : loading LdaState object from ../topic_modeling_experiments/lsa_lda/model.lda.state
2017-12-16 22:54:34,141 : INFO : loaded ../topic_modeling_experiments/lsa_lda/model.lda.state


#### Initialize tfidf to use with LSA

In [77]:
tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model

2017-12-16 06:07:18,380 : INFO : collecting document frequencies
2017-12-16 06:07:18,381 : INFO : PROGRESS: processing document #0
2017-12-16 06:07:18,414 : INFO : PROGRESS: processing document #10000
2017-12-16 06:07:18,446 : INFO : PROGRESS: processing document #20000
2017-12-16 06:07:18,468 : INFO : PROGRESS: processing document #30000
2017-12-16 06:07:18,487 : INFO : PROGRESS: processing document #40000
2017-12-16 06:07:18,508 : INFO : PROGRESS: processing document #50000
2017-12-16 06:07:18,527 : INFO : PROGRESS: processing document #60000
2017-12-16 06:07:18,547 : INFO : PROGRESS: processing document #70000
2017-12-16 06:07:18,566 : INFO : PROGRESS: processing document #80000
2017-12-16 06:07:18,585 : INFO : PROGRESS: processing document #90000
2017-12-16 06:07:18,604 : INFO : PROGRESS: processing document #100000
2017-12-16 06:07:18,624 : INFO : PROGRESS: processing document #110000
2017-12-16 06:07:18,644 : INFO : PROGRESS: processing document #120000
2017-12-16 06:07:18,668 : 

#### Use the model to transform vectors

In [79]:
doc_bow = [(0, 2), (1, 1)]
print(tfidf[doc_bow])

[(0, 0.9029959618571598), (1, 0.42964903452662695)]


#### Save the tfidf model

In [16]:
#tfidf.save('../topic_modeling_experiments/lsa_lda/model_tfidf.pkl') 
tfidf = models.LsiModel.load('../topic_modeling_experiments/lsa_lda/model_tfidf.pkl')

NameError: name 'models' is not defined

#### Apply lsi transformation to the whole tfidf transformed corpus

tfidif corpus

In [15]:
corpus_tfidf = tfidf[corpus]

NameError: name 'tfidf' is not defined

lsi corpus trained on 500 topics

In [140]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=500) # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf]

2017-12-16 21:14:26,080 : INFO : using serial LSI version on this node
2017-12-16 21:14:26,082 : INFO : updating model with new documents
2017-12-16 21:14:26,425 : INFO : preparing a new chunk of documents
2017-12-16 21:14:26,526 : INFO : using 100 extra samples and 2 power iterations
2017-12-16 21:14:26,532 : INFO : 1st phase: constructing (5000, 600) action matrix
2017-12-16 21:14:27,231 : INFO : orthonormalizing (5000, 600) action matrix
2017-12-16 21:14:29,215 : INFO : 2nd phase: running dense svd on (600, 20000) matrix
2017-12-16 21:14:33,146 : INFO : computing the final decomposition
2017-12-16 21:14:33,153 : INFO : keeping 500 factors (discarding 5.697% of energy spectrum)
2017-12-16 21:14:33,283 : INFO : processed documents up to #20000
2017-12-16 21:14:33,286 : INFO : topic #0(16.672): 0.422*"و" + 0.284*"مش" + 0.270*"،" + 0.268*"يا" + 0.202*"ده" + 0.197*"في" + 0.193*"انا" + 0.175*"اللي" + 0.171*"من" + 0.152*"بس"
2017-12-16 21:14:33,289 : INFO : topic #1(12.675): 0.631*"،" + -0

2017-12-16 21:15:10,025 : INFO : topic #4(23.188): -0.768*"انا" + 0.366*"مش" + -0.242*"و" + 0.205*"لا" + 0.134*"ايه" + 0.128*"في" + 0.127*"ده" + -0.122*"كدا" + 0.112*"ولا" + 0.100*"دي"
2017-12-16 21:15:10,396 : INFO : preparing a new chunk of documents
2017-12-16 21:15:10,489 : INFO : using 100 extra samples and 2 power iterations
2017-12-16 21:15:10,494 : INFO : 1st phase: constructing (5000, 600) action matrix
2017-12-16 21:15:11,194 : INFO : orthonormalizing (5000, 600) action matrix
2017-12-16 21:15:13,177 : INFO : 2nd phase: running dense svd on (600, 20000) matrix
2017-12-16 21:15:17,408 : INFO : computing the final decomposition
2017-12-16 21:15:17,415 : INFO : keeping 500 factors (discarding 6.306% of energy spectrum)
2017-12-16 21:15:17,535 : INFO : merging projections: (5000, 500) + (5000, 500)
2017-12-16 21:15:19,151 : INFO : keeping 500 factors (discarding 7.657% of energy spectrum)
2017-12-16 21:15:19,375 : INFO : processed documents up to #120000
2017-12-16 21:15:19,383 :

#### Print results

In [137]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

In [141]:
pp.pprint(lsi.print_topics(500))

2017-12-16 21:15:58,024 : INFO : topic #0(41.518): 0.343*"و" + 0.319*"من" + 0.314*"يا" + 0.271*"لا" + 0.251*"في" + 0.250*"انا" + 0.217*"الله" + 0.170*"ما" + 0.163*"مش" + 0.154*"بس"
2017-12-16 21:15:58,025 : INFO : topic #1(33.439): -0.922*"يا" + 0.187*"و" + 0.176*"لا" + 0.159*"من" + 0.101*"في" + 0.093*"انا" + 0.054*"،" + 0.048*"اللي" + -0.046*"الله" + 0.042*"ما"
2017-12-16 21:15:58,026 : INFO : topic #2(30.922): -0.606*"الله" + 0.556*"انا" + -0.241*"من" + 0.199*"مش" + 0.175*"كدا" + 0.173*"بس" + 0.170*"والله" + -0.125*"في" + -0.106*"على" + -0.105*"لا"
2017-12-16 21:15:58,032 : INFO : topic #3(30.542): 0.796*"و" + -0.548*"لا" + -0.148*"من" + -0.114*"ولا" + -0.094*"انا" + -0.064*"والله" + -0.058*"بس" + 0.034*"الله" + -0.034*"على" + -0.033*"مش"
2017-12-16 21:15:58,033 : INFO : topic #4(29.819): 0.732*"لا" + -0.406*"من" + 0.401*"و" + -0.205*"في" + -0.113*"والله" + -0.104*"انا" + -0.095*"الله" + 0.094*"يا" + -0.083*"اللي" + -0.072*"على"
2017-12-16 21:15:58,034 : INFO : topic #5(29.029): -0.7

2017-12-16 21:15:58,108 : INFO : topic #44(17.113): -0.654*"اللهم" + -0.300*"الناس" + -0.248*"فى" + 0.240*"أن" + 0.215*"هذا" + -0.208*"ربنا" + 0.175*"دا" + -0.168*"الي" + 0.128*"يارب" + 0.128*"ان"
2017-12-16 21:15:58,109 : INFO : topic #45(17.020): 0.588*"ربنا" + -0.486*"حد" + 0.277*"أن" + -0.204*"الي" + 0.188*"فى" + 0.169*"الناس" + 0.167*"هذا" + -0.146*"عن" + 0.138*"…" + -0.137*"أنا"
2017-12-16 21:15:58,110 : INFO : topic #46(16.973): 0.534*"حد" + 0.435*"هذا" + 0.277*"أنا" + 0.274*"فى" + 0.263*"أن" + -0.222*"لما" + -0.179*"ان" + -0.137*"هو" + 0.131*"ربنا" + -0.130*"عن"
2017-12-16 21:15:58,120 : INFO : topic #47(16.850): 0.489*"الناس" + 0.413*"أن" + -0.399*"ربنا" + -0.367*"هذا" + 0.186*"أنا" + 0.167*"قلبي" + 0.162*"فى" + -0.145*"طيب" + -0.134*"حد" + -0.121*"عن"
2017-12-16 21:15:58,121 : INFO : topic #48(16.775): -0.514*"فى" + 0.409*"ربنا" + -0.284*"وانا" + 0.275*"الي" + 0.273*"أن" + -0.242*"هذا" + 0.175*"حد" + -0.161*"…" + -0.149*"طيب" + 0.136*"عليك"
2017-12-16 21:15:58,122 : INFO : to

2017-12-16 21:15:58,200 : INFO : topic #86(13.850): 0.711*"ناس" + 0.319*"كتير" + -0.292*"اني" + -0.188*"قبل" + -0.171*"بالله" + 0.158*"عادي" + 0.132*"جدا" + 0.126*"الا" + -0.123*"فيه" + 0.114*"او"
2017-12-16 21:15:58,201 : INFO : topic #87(13.763): -0.543*"نفسي" + 0.387*"اني" + 0.339*"ناس" + -0.336*"اذا" + -0.230*"عادي" + -0.228*"جدا" + 0.184*"قبل" + -0.170*"او" + -0.152*"ممكن" + 0.126*"كان"
2017-12-16 21:15:58,202 : INFO : topic #88(13.747): -0.814*"قبل" + -0.364*"م" + 0.203*"اني" + -0.201*"نفسي" + 0.108*"(" + -0.082*"عادي" + 0.079*"شي" + -0.077*"طب" + 0.069*"حتى" + -0.069*"ناس"
2017-12-16 21:15:58,203 : INFO : topic #89(13.708): 0.833*"عليه" + 0.243*"جدا" + 0.210*"كمان" + -0.207*"او" + -0.126*"اذا" + 0.116*"عادي" + -0.115*"بالله" + -0.108*"حلو" + -0.101*"(" + -0.087*"حتى"
2017-12-16 21:15:58,204 : INFO : topic #90(13.685): 0.583*"جدا" + -0.348*"عليه" + -0.348*"اذا" + 0.301*"عادي" + -0.268*"نفسي" + 0.262*"كمان" + -0.178*"ناس" + -0.131*"حاجة" + -0.118*"حلو" + 0.115*"ممكن"
2017-12-16 21

2017-12-16 21:15:58,267 : INFO : topic #128(12.195): -0.392*"علشان" + 0.367*"الوكر" + -0.360*"وش" + 0.304*"فولورز" + 0.265*"الشتا" + 0.253*"بقى" + 0.226*"محمد" + 0.201*"الدنيا" + -0.147*"فين" + 0.138*"الخير"
2017-12-16 21:15:58,268 : INFO : topic #129(12.161): -0.489*"الدنيا" + 0.355*"مفيش" + 0.303*"الخير" + -0.236*"منك" + 0.233*"زي" + 0.200*"الوكر" + -0.179*"صباح" + -0.169*"عايز" + 0.165*"فولورز" + -0.150*"محمد"
2017-12-16 21:15:58,269 : INFO : topic #130(12.148): 0.481*"منك" + -0.410*"وش" + 0.388*"مفيش" + -0.270*"علشان" + 0.244*"ليش" + -0.219*"الدنيا" + -0.190*"الوكر" + 0.170*"*" + -0.157*"فولورز" + -0.138*"الشتا"
2017-12-16 21:15:58,270 : INFO : topic #131(12.137): -0.516*"مصر" + 0.338*"*" + 0.326*"الدنيا" + 0.288*"العالم" + 0.260*"مفيش" + 0.255*"علشان" + -0.167*"له" + -0.140*"القلب" + 0.136*"زي" + 0.135*"بجد"
2017-12-16 21:15:58,271 : INFO : topic #132(12.085): 0.638*"*" + -0.286*"له" + -0.278*"منك" + 0.278*"بين" + 0.242*"مصر" + 0.210*"وش" + -0.182*"عايز" + -0.155*"العالم" + -0.132

2017-12-16 21:15:58,309 : INFO : topic #169(10.967): 0.540*"علينا" + 0.290*"يكون" + 0.247*"قد" + -0.232*"احلي" + -0.210*"هذي" + -0.183*"انه" + -0.180*"كلام" + -0.169*"الى" + 0.157*"️️" + -0.143*"إلا"
2017-12-16 21:15:58,310 : INFO : topic #170(10.939): 0.505*"خالص" + 0.417*"دول" + 0.330*"لكن" + -0.284*"الى" + 0.249*"نفس" + 0.245*"قطر" + 0.242*"احلى" + -0.132*"حبيبي" + -0.118*"شنو" + 0.109*"أو"
2017-12-16 21:15:58,311 : INFO : topic #171(10.916): 0.524*"حبيبي" + 0.347*"احلي" + -0.331*"كلها" + 0.290*"يكون" + 0.257*"قد" + -0.203*"️️" + -0.183*"حياتي" + -0.132*"الف" + 0.110*"هذي" + -0.109*"جميل"
2017-12-16 21:15:58,312 : INFO : topic #172(10.887): -0.736*"خالص" + 0.279*"لكن" + 0.220*"احلى" + 0.212*"قطر" + 0.168*"حلوه" + -0.163*"اهو" + -0.137*"الى" + -0.134*"شنو" + -0.131*"️️" + -0.128*"حبيبي"
2017-12-16 21:15:58,313 : INFO : topic #173(10.872): 0.540*"حبيبي" + 0.388*"️️" + -0.383*"علينا" + 0.234*"نفس" + -0.202*"احلي" + -0.166*"شيء" + 0.163*"كلها" + -0.133*"أو" + 0.129*"يكون" + 0.128*"ابو"


2017-12-16 21:15:58,397 : INFO : topic #208(10.086): -0.405*"معايا" + -0.345*"مره" + 0.332*"تويتر" + 0.314*"محدش" + 0.242*"عندك" + 0.234*"منه" + 0.224*"حق" + 0.195*"يمكن" + 0.163*"عم" + 0.145*"اول"
2017-12-16 21:15:58,398 : INFO : topic #209(10.071): 0.587*"ب" + 0.464*"بنت" + 0.259*"منه" + 0.178*"معايا" + -0.173*"فعلا" + 0.158*"لنا" + 0.149*"حلوة" + -0.147*"عندك" + 0.133*"اكيد" + -0.123*"الف"
2017-12-16 21:15:58,399 : INFO : topic #210(10.041): 0.371*"الحياة" + -0.364*"منه" + -0.364*"فعلا" + 0.336*"عندك" + 0.269*"حق" + -0.225*"أنت" + 0.218*"حب" + 0.201*"حلوة" + 0.187*"معايا" + 0.141*"اكيد"
2017-12-16 21:15:58,408 : INFO : topic #211(10.015): -0.339*"بنت" + 0.316*"حب" + -0.267*"اكيد" + -0.261*"الذي" + 0.257*"منه" + 0.225*"عمري" + 0.177*"طول" + 0.176*"حلوة" + 0.167*"الحمدلله" + 0.167*"راح"
2017-12-16 21:15:58,409 : INFO : topic #212(10.008): -0.500*"بنت" + 0.388*"ب" + 0.260*"الذي" + 0.231*"عم" + -0.227*"عمري" + 0.201*"معايا" + 0.192*"رب" + -0.181*"طول" + 0.147*"حب" + -0.137*"الحمدلله"
20

2017-12-16 21:15:58,652 : INFO : topic #250(9.011): 0.450*"مرة" + 0.391*"قلب" + 0.293*"الواحد" + -0.293*"معلش" + -0.267*"⁦️⁩" + 0.199*"شوية" + -0.154*"كلنا" + 0.145*"اكثر" + -0.139*"شخص" + 0.123*"عايزة"
2017-12-16 21:15:58,656 : INFO : topic #251(8.989): -0.470*"ربي" + -0.364*"شوية" + 0.333*"قلب" + 0.248*"معلش" + 0.182*"يسعد" + -0.159*"اكثر" + -0.157*"ريتويت" + -0.151*"كلنا" + -0.150*"بحب" + -0.130*"عليهم"
2017-12-16 21:15:58,657 : INFO : topic #252(8.975): 0.748*"⁦️⁩" + -0.288*"شوية" + 0.243*"مرة" + 0.236*"اكثر" + 0.234*"الواحد" + 0.134*"معلش" + 0.108*"ربي" + 0.102*"طول" + -0.089*"شخص" + 0.086*"اخر"
2017-12-16 21:15:58,660 : INFO : topic #253(8.969): 0.516*"⁦️⁩" + 0.453*"شوية" + -0.385*"معلش" + -0.263*"اكثر" + -0.248*"عاد" + 0.176*"قلب" + -0.154*"ربي" + 0.126*"يسعد" + -0.120*"الواحد" + -0.095*"صباحك"
2017-12-16 21:15:58,661 : INFO : topic #254(8.915): 0.457*"اكثر" + 0.443*"عاد" + -0.247*"شويه" + 0.243*"شوية" + -0.216*"الواحد" + 0.201*"زين" + -0.186*"التي" + -0.182*"ريتويت" + -0.175*"م

2017-12-16 21:15:58,737 : INFO : topic #291(8.279): -0.309*"هههههه" + 0.284*"انتو" + 0.254*"زمان" + 0.236*"اجل" + -0.229*"اخر" + -0.211*"ابي" + -0.198*"عيني" + 0.196*"•" + -0.181*"ههههه" + 0.170*"جماعه"
2017-12-16 21:15:58,738 : INFO : topic #292(8.259): -0.310*"تم" + 0.303*"زمان" + -0.270*"عيني" + -0.216*"جماعه" + -0.207*"مكان" + 0.202*"وانتي" + -0.183*"بخير" + -0.177*"ههههه" + -0.172*"إذا" + 0.157*"بـ"
2017-12-16 21:15:58,739 : INFO : topic #293(8.240): -0.365*"شلون" + 0.363*"هههههه" + -0.226*"مكان" + -0.214*"حرام" + -0.181*"وانتي" + 0.179*"إذا" + 0.173*"بخير" + 0.173*"ايش" + -0.172*"بلا" + -0.170*"بكل"
2017-12-16 21:15:58,741 : INFO : topic #294(8.218): 0.288*"عني" + -0.274*"حرام" + -0.192*"ابن" + -0.188*"ههههه" + 0.182*"لن" + 0.172*"يبقي" + 0.163*"الاتحاد" + 0.161*"ومن" + -0.154*"ياريت" + 0.153*"الجو"
2017-12-16 21:15:58,742 : INFO : topic #295(8.202): -0.323*"والفولورز" + -0.316*"بـ" + 0.255*"عيني" + 0.246*"فولورز" + -0.202*"الوكر" + 0.201*"ههههه" + 0.186*"الشتا" + -0.185*"تم" + -

2017-12-16 21:15:58,805 : INFO : topic #331(7.655): -0.495*"شر" + 0.351*"كفايه" + -0.185*"منهم" + 0.178*"وجهك" + 0.158*"مسا" + 0.155*"بلاش" + 0.148*"انها" + 0.146*"فولو" + 0.142*"؛" + -0.138*"معاه"
2017-12-16 21:15:58,806 : INFO : topic #332(7.623): 0.326*"انها" + 0.239*"أي" + -0.212*"بقيت" + -0.208*"اوى" + -0.198*"كفايه" + 0.198*"بدون" + 0.194*"ياخي" + -0.184*"الوقت" + -0.174*"ولله" + -0.173*"ثم"
2017-12-16 21:15:58,807 : INFO : topic #333(7.607): -0.434*"كفايه" + 0.275*"سنة" + 0.241*"؛" + 0.201*"كام" + -0.178*"شر" + 0.153*"تقريبا" + -0.152*"ماما" + -0.151*"يابني" + -0.141*"ولله" + -0.130*"بـ"
2017-12-16 21:15:58,808 : INFO : topic #334(7.587): -0.246*"ماما" + 0.227*"روحي" + -0.222*"ولو" + 0.218*"الصبح" + 0.213*"منهم" + -0.201*"تقول" + 0.195*"عندنا" + 0.163*"وربنا" + -0.158*"بلا" + -0.155*"اقول"
2017-12-16 21:15:58,812 : INFO : topic #335(7.568): -0.406*"بقيت" + 0.277*"ولاد" + 0.265*"عندنا" + -0.224*"؛" + -0.222*"معاه" + 0.183*"عنك" + -0.146*"ياخي" + 0.136*"كفاية" + 0.134*"ولو" + -0.1

2017-12-16 21:15:58,874 : INFO : topic #372(7.102): -0.293*"حياتك" + -0.226*"بيه" + 0.216*"البنات" + 0.201*"كوباية" + -0.189*"كويس" + 0.189*"عليكي" + -0.184*"صلاح" + 0.178*"يابني" + 0.157*"لـ" + 0.156*"الاول"
2017-12-16 21:15:58,875 : INFO : topic #373(7.084): 0.259*"منا" + -0.232*"الاول" + 0.213*"برضو" + 0.205*"انام" + 0.197*"كوباية" + 0.197*"صلاح" + -0.164*"انتوا" + -0.149*"هم" + -0.138*"كُل" + -0.126*"كويس"
2017-12-16 21:15:58,876 : INFO : topic #374(7.077): 0.386*"اها" + -0.291*"كوباية" + 0.215*"البنات" + 0.208*"برضو" + -0.182*"منو" + -0.168*"غلط" + 0.155*"تقول" + 0.155*"حياتك" + 0.152*"كسم" + 0.143*"صلاح"
2017-12-16 21:15:58,877 : INFO : topic #375(7.063): -0.603*"برضو" + 0.232*"حياتك" + 0.202*"الموضوع" + 0.184*"صلاح" + -0.151*"كُل" + 0.148*"|" + 0.130*"🤦‍️" + 0.107*"منا" + 0.099*"معاه" + 0.095*"لـ"
2017-12-16 21:15:58,878 : INFO : topic #376(7.047): -0.338*"عليكي" + -0.296*"كويس" + 0.204*"انتى" + 0.196*"الاول" + 0.182*"الموضوع" + 0.177*"انام" + 0.174*"طلعت" + -0.160*"ايوا" + 0.14

2017-12-16 21:15:58,946 : INFO : topic #412(6.607): -0.185*"تمام" + 0.177*"عارفة" + -0.161*"النهاردة" + -0.155*"باين" + 0.154*"مِن" + 0.151*"الـ" + -0.146*"امي" + 0.140*"صالح" + -0.134*"دايما" + 0.131*"نص"
2017-12-16 21:15:58,947 : INFO : topic #413(6.598): -0.386*"كسم" + 0.301*"عايزه" + -0.204*"واحدة" + 0.194*"زى" + -0.176*"بيها" + 0.156*"حياتك" + 0.150*"تعالي" + 0.141*"عمرو" + 0.140*"ينفع" + -0.134*"بقول"
2017-12-16 21:15:58,948 : INFO : topic #414(6.563): -0.283*"إني" + 0.253*"أيه" + 0.245*"خرا" + -0.199*"احمد" + 0.185*"عايزه" + 0.172*"مافي" + -0.166*"تعالي" + 0.141*"امي" + 0.138*"بلوك" + 0.132*"عنده"
2017-12-16 21:15:58,949 : INFO : topic #415(6.557): 0.327*"تمام" + -0.212*"زى" + 0.204*"أحبك" + 0.159*"تانى" + 0.157*"بقول" + 0.156*"حاضر" + -0.154*"ترا" + -0.154*"مثلا" + -0.152*"حصل" + 0.150*"تعالي"
2017-12-16 21:15:58,950 : INFO : topic #416(6.544): 0.213*"بلوك" + -0.202*"بالك" + 0.190*"عنده" + -0.187*"قلت" + -0.176*"أيه" + -0.143*"خد" + 0.137*"الموضوع" + 0.134*"طلع" + -0.132*"خرا" 

2017-12-16 21:15:59,012 : INFO : topic #451(6.186): 0.211*"التقاعد" + 0.209*"انتا" + 0.200*"سن" + -0.198*"تانى" + 0.192*"بقت" + 0.191*"ينفع" + 0.185*"النهاردة" + -0.161*"معرفش" + 0.159*"مطلب" + 0.156*"شعبي"
2017-12-16 21:15:59,013 : INFO : topic #452(6.169): -0.331*"نيك" + 0.242*"هوا" + -0.229*"اية" + -0.165*"فاهم" + 0.160*"طلع" + 0.148*"عايزين" + -0.139*"حصل" + 0.137*"عارفة" + 0.130*"مغسلة" + 0.117*"اسم"
2017-12-16 21:15:59,015 : INFO : topic #453(6.144): 0.216*"نيك" + 0.193*"ليا" + 0.188*"عايزه" + -0.177*"هوا" + -0.161*"طلع" + -0.146*"اية" + -0.145*"حبيبى" + 0.143*"اما" + -0.135*"قول" + -0.127*"زى"
2017-12-16 21:15:59,016 : INFO : topic #454(6.142): 0.208*"عارفة" + -0.207*"تانى" + 0.167*"نيك" + 0.164*"حصل" + 0.159*"مغسلة" + 0.150*"طلع" + 0.145*"مالك" + -0.139*"الشتا" + 0.138*"اللَّهَ" + -0.131*"امبارح"
2017-12-16 21:15:59,017 : INFO : topic #455(6.118): -0.285*"مغسلة" + 0.271*"امتي" + -0.210*"اكل" + -0.173*"يَ" + -0.166*"ينفع" + -0.158*"مالك" + -0.151*"البرد" + -0.150*"ها" + -0.145*"

2017-12-16 21:15:59,079 : INFO : topic #491(5.664): -0.253*"اللهُمَ" + -0.216*"الكراش" + 0.190*"للدعم" + -0.181*"أزاي" + -0.176*"الوطني" + -0.154*"رسالة" + 0.141*"اليمني" + -0.134*"فيلم" + 0.134*"احا" + -0.133*"فاكر"
2017-12-16 21:15:59,080 : INFO : topic #492(5.653): -0.222*"عيد" + 0.187*"اللهُمَ" + -0.156*"انى" + -0.137*"احا" + 0.131*"الخطيب" + 0.130*"وحش" + 0.130*"محمود" + 0.127*"عبد" + 0.125*"امبارح" + -0.122*"بفولورز"
2017-12-16 21:15:59,081 : INFO : topic #493(5.638): -0.333*"أزاي" + -0.241*"هى" + 0.224*"ناديك" + -0.206*"لوحدي" + -0.181*"اللَّهِ" + -0.176*"أكتر" + 0.165*"وحش" + 0.151*"اللَّهُ" + 0.140*"النهارده" + -0.135*"هههه"
2017-12-16 21:15:59,087 : INFO : topic #494(5.633): -0.425*"أزاي" + 0.294*"هى" + 0.217*"اللهُمَ" + 0.175*"هههه" + -0.147*"لوحدي" + 0.143*"انى" + 0.133*"للدعم" + -0.129*"الوطني" + -0.114*"احا" + 0.111*"فاكر"
2017-12-16 21:15:59,088 : INFO : topic #495(5.613): -0.260*"اللَّهُ" + -0.254*"اللهُمَ" + 0.235*"بيبي" + 0.234*"النهارده" + 0.224*"ناديك" + -0.166*"رسا

[   (   0,
        '0.343*"و" + 0.319*"من" + 0.314*"يا" + 0.271*"لا" + 0.251*"في" + '
        '0.250*"انا" + 0.217*"الله" + 0.170*"ما" + 0.163*"مش" + 0.154*"بس"'),
    (   1,
        '-0.922*"يا" + 0.187*"و" + 0.176*"لا" + 0.159*"من" + 0.101*"في" + '
        '0.093*"انا" + 0.054*"،" + 0.048*"اللي" + -0.046*"الله" + 0.042*"ما"'),
    (   2,
        '-0.606*"الله" + 0.556*"انا" + -0.241*"من" + 0.199*"مش" + 0.175*"كدا" '
        '+ 0.173*"بس" + 0.170*"والله" + -0.125*"في" + -0.106*"على" + '
        '-0.105*"لا"'),
    (   3,
        '0.796*"و" + -0.548*"لا" + -0.148*"من" + -0.114*"ولا" + -0.094*"انا" + '
        '-0.064*"والله" + -0.058*"بس" + 0.034*"الله" + -0.034*"على" + '
        '-0.033*"مش"'),
    (   4,
        '0.732*"لا" + -0.406*"من" + 0.401*"و" + -0.205*"في" + -0.113*"والله" + '
        '-0.104*"انا" + -0.095*"الله" + 0.094*"يا" + -0.083*"اللي" + '
        '-0.072*"على"'),
    (   5,
        '-0.723*"الله" + 0.419*"من" + -0.407*"انا" + 0.191*"في" + 0.133*"يا" + '
        '-0.125

    (   180,
        '-0.435*"اهو" + -0.338*"دلوقتي" + 0.323*"️️" + -0.306*"الف" + '
        '-0.287*"حلوه" + 0.237*"لكن" + -0.227*"لحد" + -0.209*"مبروك" + '
        '-0.199*"شيء" + 0.179*"كلام"'),
    (   181,
        '0.402*"اهو" + 0.332*"كلها" + 0.332*"️️" + -0.322*"ابو" + -0.273*"الف" '
        '+ -0.245*"ام" + -0.194*"مبروك" + 0.192*"حلوه" + 0.188*"دلوقتي" + '
        '-0.174*"حياتي"'),
    (   182,
        '0.300*"دلوقتي" + -0.281*"اهو" + 0.270*"لكن" + -0.251*"ام" + '
        '-0.247*"نفس" + 0.229*"الف" + 0.219*"يكون" + -0.201*"️️" + 0.195*"لحد" '
        '+ -0.193*"أو"'),
    (   183,
        '-0.526*"كلها" + 0.378*"دلوقتي" + 0.317*"ام" + 0.260*"لحد" + '
        '0.216*"️️" + 0.200*"لكن" + -0.199*"نفس" + 0.196*"حياتي" + 0.188*"مثل" '
        '+ 0.118*"ازاي"'),
    (   184,
        '0.546*"اهو" + -0.372*"دلوقتي" + -0.311*"لحد" + -0.244*"أو" + '
        '0.215*"مثل" + 0.207*"لكن" + 0.146*"يكون" + -0.140*"نفس" + '
        '-0.135*"قطر" + 0.133*"دول"'),
    (   185,
        '0.621*"

In [143]:
for doc in corpus_lsi[:10]: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
     print(doc)

[(0, 0.05841395731105091), (1, 0.013593496850437047), (2, 0.0061198794851644155), (3, -0.01447414259572152), (4, -0.008280608132771549), (5, 0.0062367087523176254), (6, 0.019139969357567066), (7, -0.004245542112921984), (8, 0.026377850842343099), (9, 0.024736619650756523), (10, -0.014630799469579311), (11, -0.011473861209234625), (12, 0.073250223740509476), (13, 0.0019999731996024994), (14, -0.040414410407985497), (15, -0.069994613999086824), (16, 0.009938185185321264), (17, 0.0079632052033616549), (18, -0.022690446588878102), (19, -0.034415570454015942), (20, -0.019107042127638425), (21, -0.015157446632334216), (22, -0.013310254149504578), (23, 0.046365859035252324), (24, 0.015283900047049546), (25, -0.0093278465456094212), (26, -0.0095293213105807021), (27, -0.011169261217395818), (28, -0.037211522886500384), (29, -0.012092098143138075), (30, 0.013252940352785871), (31, -0.010093771190151198), (32, 0.0072232140844472586), (33, -0.0057486302046392103), (34, 0.012675806737329952), (35,

Notice how 0,1 now represent `num_topics=2` (dimensions) and not `word_id` as was the case with doctobow. 

#### Save the model for later use. 

In [96]:
lsi.save('../topic_modeling_experiments/lsa_lda/model_lsi.pkl') # same for tfidf, lda, ...
#lsi = models.LsiModel.load('../topic_modeling_experiments/lsa_lda/model_lsi.pkl')

2017-12-16 06:57:53,392 : INFO : saving Projection object under ../topic_modeling_experiments/lsa_lda/model.lsi.projection, separately None
2017-12-16 06:57:53,395 : INFO : saved ../topic_modeling_experiments/lsa_lda/model.lsi.projection
2017-12-16 06:57:53,396 : INFO : saving LsiModel object under ../topic_modeling_experiments/lsa_lda/model.lsi, separately None
2017-12-16 06:57:53,397 : INFO : not storing attribute projection
2017-12-16 06:57:53,398 : INFO : not storing attribute dispatcher
2017-12-16 06:57:53,401 : INFO : saved ../topic_modeling_experiments/lsa_lda/model.lsi


#### Find cosine similarities between lsi topics and a query

In [97]:
index = similarities.MatrixSimilarity(corpus_lsi) # index lsi corpus

2017-12-16 07:32:16,284 : INFO : creating matrix with 179632 documents and 2 features


In [98]:
index.save('../topic_modeling_experiments/lsa_lda/lsi_eg_gulf.index')
#index = similarities.MatrixSimilarity.load('../topic_modeling_experiments/lsa_lda/lsi_eg_gulf.index')

2017-12-16 07:36:03,789 : INFO : saving MatrixSimilarity object under ../topic_modeling_experiments/lsa_lda/lsi_eg_gulf.index, separately None
2017-12-16 07:36:03,809 : INFO : saved ../topic_modeling_experiments/lsa_lda/lsi_eg_gulf.index


#### Turn query into bow and use lsi to project it onto a 2-d vector space (topics)

In [99]:
doc = "مليش دعوه بحد بصراحه ولا نادى ليه دعوه بصفقات حد السوق"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow] # convert the query to LSI space
print(vec_lsi)

[(0, 0.25610017004210817), (1, -0.065892777552506498)]


In [101]:
sims = index[vec_lsi]

In [115]:
sims.shape

(179632,)

In [107]:
sims = index[vec_lsi] # perform a similarity query against the corpus
pp.pprint(list(enumerate(sims[:10]))) # print (document_number, document_similarity) 2-tuples

[   (0, 0.97553933),
    (1, 0.979222),
    (2, 0.99163806),
    (3, 0.97607386),
    (4, 0.99301887),
    (5, 0.96669626),
    (6, 0.98321593),
    (7, 0.56111413),
    (8, 0.56111413),
    (9, 0.99779427)]


The first 10 documents (tweets) are labelled with class `EG` and the query itself is written in `EG` dialect. The results show high similarity with most documents but two possible misclassifications. 

In [109]:
sims[:10]

array([ 0.97553933,  0.979222  ,  0.99163806,  0.97607386,  0.99301887,
        0.96669626,  0.98321593,  0.56111413,  0.56111413,  0.99779427], dtype=float32)

#### Find the transformation from tf-idf to latent space with Random Projections, RP

In [119]:
rp = models.RpModel(corpus_tfidf, num_topics=2)

2017-12-16 17:54:56,429 : INFO : no word id mapping provided; initializing from corpus, assuming identity
2017-12-16 17:54:59,474 : INFO : constructing (2, 5000) random matrix


In [None]:
rp.save('../topic_modeling_experiments/lsa_lda/model_rp.pkl') 
#rp = models.RpModel.load('../topic_modeling_experiments/lsa_lda/model_rp.pkl')

In [120]:
corpus_rp = rp[corpus_tfidf]

In [121]:
for doc in corpus_rp[:4]: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
     print(doc)

[(0, -1.9482136964797974), (1, -0.67955482006073)]
[(0, 0.05010877549648285), (1, -0.4178534150123596)]
[(0, -1.1827576160430908), (1, 0.3307190239429474)]
[(0, -1.1580445766448975), (1, 1.347662091255188)]


In [122]:
hdp = models.HdpModel(corpus, id2word=dictionary)

2017-12-16 18:19:09,617 : INFO : (0, '0.009*، + 0.004*مش + 0.004*من + 0.002*فـ + 0.002*أنا + 0.002*بس + 0.002*في + 0.002*ده + 0.002*هو + 0.002*رسمي')
2017-12-16 18:19:09,625 : INFO : (1, '0.003*في + 0.002*مِنَ + 0.002*ضمن + 0.002*الخبر + 0.002*تعبت + 0.002*يهتم + 0.002*أنه + 0.001*كده + 0.001*حادث + 0.001*منذ')
2017-12-16 18:19:09,632 : INFO : (2, '0.002*في + 0.002*اسوأ + 0.002*أصل + 0.001*داهيه + 0.001*وصلت + 0.001*يلعن + 0.001*المشهد + 0.001*اقسم + 0.001*نيك + 0.001*فقدت')
2017-12-16 18:19:09,639 : INFO : (3, '0.002*لسا + 0.002*بني + 0.002*، + 0.002*وقعت + 0.001*أنتا + 0.001*بلدي + 0.001*“ + 0.001*عظيم + 0.001*اغير + 0.001*الساعة')
2017-12-16 18:19:09,646 : INFO : (4, '0.002*جدع + 0.002*لكم + 0.001*عندنا + 0.001*نفرح + 0.001*الحاجة + 0.001*اختار + 0.001*يناير + 0.001*دولة + 0.001*التويتر + 0.001*و')
2017-12-16 18:19:09,652 : INFO : (5, '0.002*، + 0.001*وشكرا + 0.001*في + 0.001*الخليجيه + 0.001*يا + 0.001*يبقي + 0.001*🤗 + 0.001*دي + 0.001*عشانك + 0.001*واجعل')
2017-12-16 18:19:09,659 

2017-12-16 18:20:05,400 : INFO : (0, '0.011*، + 0.010*من + 0.010*مش + 0.009*و + 0.007*انا + 0.006*كدا + 0.006*في + 0.005*بس + 0.005*اللي + 0.004*حد')
2017-12-16 18:20:05,407 : INFO : (1, '0.003*في + 0.002*مِنَ + 0.002*و + 0.002*من + 0.002*ضمن + 0.002*مش + 0.002*الخبر + 0.002*تعبت + 0.002*كده + 0.002*يهتم')
2017-12-16 18:20:05,414 : INFO : (2, '0.002*في + 0.002*من + 0.002*و + 0.002*، + 0.002*اسوأ + 0.002*أصل + 0.002*مش + 0.001*داهيه + 0.001*وصلت + 0.001*يلعن')
2017-12-16 18:20:05,421 : INFO : (3, '0.002*، + 0.002*لسا + 0.002*بني + 0.002*و + 0.002*في + 0.002*وقعت + 0.002*من + 0.001*ما + 0.001*يا + 0.001*أنتا')
2017-12-16 18:20:05,428 : INFO : (4, '0.003*، + 0.003*من + 0.002*و + 0.002*كل + 0.002*في + 0.001*ده + 0.001*يملك + 0.001*مش + 0.001*كيفك + 0.001*أن')
2017-12-16 18:20:05,435 : INFO : (5, '0.002*و + 0.002*من + 0.002*في + 0.002*جدع + 0.001*لكم + 0.001*عندنا + 0.001*نفرح + 0.001*الحاجة + 0.001*اختار + 0.001*،')
2017-12-16 18:20:05,442 : INFO : (6, '0.002*في + 0.002*، + 0.002*و + 0.002

2017-12-16 18:21:01,114 : INFO : (2, '0.003*كدا + 0.003*في + 0.002*و + 0.002*من + 0.002*مش + 0.002*، + 0.002*اسوأ + 0.002*يا + 0.002*أصل + 0.001*الله')
2017-12-16 18:21:01,121 : INFO : (3, '0.003*كدا + 0.003*من + 0.003*، + 0.003*و + 0.002*في + 0.002*مش + 0.002*كل + 0.002*يا + 0.002*انا + 0.002*لا')
2017-12-16 18:21:01,128 : INFO : (4, '0.003*كدا + 0.002*و + 0.002*، + 0.002*من + 0.002*في + 0.002*يا + 0.002*لسا + 0.002*بني + 0.002*ما + 0.002*وقعت')
2017-12-16 18:21:01,135 : INFO : (5, '0.004*كدا + 0.003*و + 0.003*من + 0.003*في + 0.002*يا + 0.002*مش + 0.002*بس + 0.002*اللي + 0.002*، + 0.002*لو')
2017-12-16 18:21:01,141 : INFO : (6, '0.003*كدا + 0.003*و + 0.003*في + 0.002*من + 0.002*لا + 0.002*اللي + 0.002*بس + 0.002*مش + 0.002*، + 0.002*"')
2017-12-16 18:21:01,148 : INFO : (7, '0.003*كدا + 0.003*من + 0.002*و + 0.002*كريم + 0.002*في + 0.002*ف + 0.002*مش + 0.002*، + 0.002*يا + 0.002*مع')
2017-12-16 18:21:01,155 : INFO : (8, '0.003*كدا + 0.002*و + 0.002*في + 0.002*من + 0.002*انا + 0.002*يا +

2017-12-16 18:22:54,333 : INFO : (0, '0.020*كدا + 0.019*و + 0.017*مش + 0.016*انا + 0.016*من + 0.013*في + 0.011*بس + 0.010*، + 0.009*اللي + 0.009*ف')
2017-12-16 18:22:54,340 : INFO : (1, '0.004*في + 0.004*و + 0.003*من + 0.003*كدا + 0.003*مش + 0.003*انا + 0.003*يا + 0.002*اللي + 0.002*مِنَ + 0.002*ولا')
2017-12-16 18:22:54,346 : INFO : (2, '0.004*من + 0.004*و + 0.003*كدا + 0.003*، + 0.003*في + 0.003*مش + 0.003*انا + 0.003*يا + 0.002*كل + 0.002*لا')
2017-12-16 18:22:54,353 : INFO : (3, '0.004*في + 0.004*و + 0.003*من + 0.003*كدا + 0.003*مش + 0.003*انا + 0.003*يا + 0.002*، + 0.002*لا + 0.002*كل')
2017-12-16 18:22:54,360 : INFO : (4, '0.004*و + 0.004*من + 0.004*كدا + 0.004*يا + 0.003*في + 0.003*مش + 0.003*انا + 0.002*اللي + 0.002*بس + 0.002*لا')
2017-12-16 18:22:54,366 : INFO : (5, '0.004*و + 0.004*في + 0.003*كدا + 0.003*من + 0.003*مش + 0.003*لا + 0.003*اللي + 0.003*انا + 0.003*يا + 0.003*بس')
2017-12-16 18:22:54,373 : INFO : (6, '0.004*من + 0.003*و + 0.003*كدا + 0.003*في + 0.003*مش + 0.003*

2017-12-16 18:23:52,924 : INFO : (0, '0.021*و + 0.017*مش + 0.017*كدا + 0.017*من + 0.016*انا + 0.014*في + 0.011*بس + 0.010*اللي + 0.009*، + 0.008*يا')
2017-12-16 18:23:52,932 : INFO : (1, '0.005*في + 0.004*و + 0.004*من + 0.003*مش + 0.003*كدا + 0.003*يا + 0.003*انا + 0.003*اللي + 0.002*بس + 0.002*ولا')
2017-12-16 18:23:52,938 : INFO : (2, '0.005*من + 0.005*و + 0.004*في + 0.003*، + 0.003*كدا + 0.003*مش + 0.003*يا + 0.003*انا + 0.003*كل + 0.002*لا')
2017-12-16 18:23:52,945 : INFO : (3, '0.005*و + 0.004*من + 0.004*يا + 0.004*في + 0.004*مش + 0.004*كدا + 0.003*انا + 0.003*اللي + 0.002*بس + 0.002*لا')
2017-12-16 18:23:52,952 : INFO : (4, '0.004*في + 0.004*و + 0.004*من + 0.003*مش + 0.003*كدا + 0.003*يا + 0.003*انا + 0.002*، + 0.002*لا + 0.002*اللي')
2017-12-16 18:23:52,958 : INFO : (5, '0.005*و + 0.004*في + 0.004*من + 0.003*كدا + 0.003*مش + 0.003*يا + 0.003*اللي + 0.003*انا + 0.003*لا + 0.003*بس')
2017-12-16 18:23:52,965 : INFO : (6, '0.004*من + 0.004*و + 0.004*في + 0.003*كدا + 0.003*مش + 0.003

2017-12-16 18:24:53,155 : INFO : (18, '0.005*من + 0.004*و + 0.004*في + 0.003*يا + 0.003*مش + 0.003*انا + 0.003*كدا + 0.002*ولا + 0.002*بس + 0.002*ما')
2017-12-16 18:24:53,162 : INFO : (19, '0.004*و + 0.004*في + 0.004*من + 0.004*كدا + 0.004*انا + 0.003*مش + 0.003*يا + 0.003*اللي + 0.002*لا + 0.002*بس')
2017-12-16 18:24:53,164 : INFO : PROGRESS: finished document 69888 of 179632
2017-12-16 18:25:53,640 : INFO : (0, '0.021*و + 0.019*من + 0.018*مش + 0.018*انا + 0.016*في + 0.015*كدا + 0.011*بس + 0.010*اللي + 0.009*، + 0.009*يا')
2017-12-16 18:25:53,648 : INFO : (1, '0.006*في + 0.005*من + 0.005*و + 0.004*مش + 0.004*يا + 0.004*انا + 0.003*كدا + 0.003*اللي + 0.003*لا + 0.003*بس')
2017-12-16 18:25:53,655 : INFO : (2, '0.006*من + 0.005*و + 0.004*في + 0.004*يا + 0.004*مش + 0.004*انا + 0.004*، + 0.003*كدا + 0.003*كل + 0.003*لا')
2017-12-16 18:25:53,661 : INFO : (3, '0.006*من + 0.005*و + 0.005*يا + 0.005*في + 0.004*مش + 0.004*انا + 0.003*كدا + 0.003*اللي + 0.003*بس + 0.003*لا')
2017-12-16 18:25:53,

2017-12-16 18:28:00,334 : INFO : (0, '0.022*و + 0.019*من + 0.019*مش + 0.018*انا + 0.017*في + 0.013*كدا + 0.011*بس + 0.011*اللي + 0.009*يا + 0.009*،')
2017-12-16 18:28:00,341 : INFO : (1, '0.007*في + 0.006*من + 0.006*و + 0.005*يا + 0.004*مش + 0.004*انا + 0.003*اللي + 0.003*لا + 0.003*كدا + 0.003*بس')
2017-12-16 18:28:00,352 : INFO : (2, '0.007*من + 0.006*و + 0.005*في + 0.005*يا + 0.004*مش + 0.004*انا + 0.004*، + 0.003*لا + 0.003*كل + 0.003*كدا')
2017-12-16 18:28:00,359 : INFO : (3, '0.007*من + 0.006*و + 0.006*في + 0.006*يا + 0.005*مش + 0.004*انا + 0.003*كدا + 0.003*اللي + 0.003*بس + 0.003*لا')
2017-12-16 18:28:00,366 : INFO : (4, '0.006*من + 0.006*في + 0.006*و + 0.004*يا + 0.004*مش + 0.004*انا + 0.003*لا + 0.003*كدا + 0.003*اللي + 0.003*الله')
2017-12-16 18:28:00,372 : INFO : (5, '0.007*من + 0.006*و + 0.005*في + 0.005*يا + 0.004*مش + 0.004*انا + 0.004*، + 0.003*بس + 0.003*كدا + 0.003*اللي')
2017-12-16 18:28:00,379 : INFO : (6, '0.007*و + 0.007*من + 0.006*في + 0.005*يا + 0.004*انا + 0.00

2017-12-16 18:30:12,586 : INFO : (2, '0.008*في + 0.008*من + 0.006*و + 0.005*يا + 0.004*انا + 0.004*مش + 0.004*لا + 0.003*اللي + 0.003*بس + 0.003*ما')
2017-12-16 18:30:12,593 : INFO : (3, '0.008*من + 0.007*و + 0.007*في + 0.005*يا + 0.004*انا + 0.004*لا + 0.004*مش + 0.004*، + 0.003*اللي + 0.003*على')
2017-12-16 18:30:12,600 : INFO : (4, '0.008*من + 0.007*و + 0.007*في + 0.006*يا + 0.004*مش + 0.004*انا + 0.004*لا + 0.003*اللي + 0.003*بس + 0.003*على')
2017-12-16 18:30:12,607 : INFO : (5, '0.008*من + 0.007*و + 0.006*في + 0.005*يا + 0.004*انا + 0.004*، + 0.004*مش + 0.004*لا + 0.003*على + 0.003*بس')
2017-12-16 18:30:12,613 : INFO : (6, '0.008*من + 0.007*في + 0.006*و + 0.005*يا + 0.004*ما + 0.004*انا + 0.004*مش + 0.004*لا + 0.003*اللي + 0.003*كل')
2017-12-16 18:30:12,620 : INFO : (7, '0.008*من + 0.007*في + 0.006*و + 0.005*يا + 0.004*مش + 0.004*انا + 0.004*لا + 0.003*على + 0.003*ما + 0.003*بس')
2017-12-16 18:30:12,626 : INFO : (8, '0.007*من + 0.007*في + 0.006*و + 0.004*يا + 0.004*مش + 0.004*انا 

2017-12-16 18:32:23,051 : INFO : (5, '0.009*من + 0.008*في + 0.006*و + 0.005*يا + 0.005*ما + 0.004*انا + 0.004*لا + 0.004*الله + 0.004*مش + 0.004*على')
2017-12-16 18:32:23,058 : INFO : (6, '0.010*من + 0.007*في + 0.007*و + 0.005*يا + 0.005*، + 0.004*انا + 0.004*لا + 0.004*على + 0.004*مش + 0.004*بس')
2017-12-16 18:32:23,064 : INFO : (7, '0.009*من + 0.007*في + 0.006*و + 0.005*يا + 0.004*لا + 0.004*انا + 0.004*مش + 0.004*على + 0.004*ما + 0.003*بس')
2017-12-16 18:32:23,071 : INFO : (8, '0.009*من + 0.008*في + 0.006*و + 0.004*يا + 0.004*لا + 0.004*انا + 0.004*مش + 0.004*الله + 0.003*، + 0.003*كل')
2017-12-16 18:32:23,078 : INFO : (9, '0.009*من + 0.008*في + 0.007*و + 0.005*لا + 0.005*يا + 0.004*انا + 0.004*اللي + 0.004*ما + 0.004*مش + 0.004*بس')
2017-12-16 18:32:23,084 : INFO : (10, '0.010*من + 0.008*في + 0.007*و + 0.005*يا + 0.004*انا + 0.004*لا + 0.004*مش + 0.004*الله + 0.004*على + 0.004*كل')
2017-12-16 18:32:23,091 : INFO : (11, '0.008*من + 0.007*في + 0.007*و + 0.005*انا + 0.005*يا + 0.004*ل

2017-12-16 18:34:37,449 : INFO : (15, '0.011*من + 0.008*في + 0.007*و + 0.005*يا + 0.005*لا + 0.004*انا + 0.004*ما + 0.004*على + 0.004*مش + 0.004*الله')
2017-12-16 18:34:37,456 : INFO : (16, '0.010*من + 0.008*في + 0.006*و + 0.005*يا + 0.005*لا + 0.004*انا + 0.004*ما + 0.004*على + 0.004*الله + 0.003*مش')
2017-12-16 18:34:37,462 : INFO : (17, '0.010*من + 0.009*في + 0.007*و + 0.005*يا + 0.005*لا + 0.005*انا + 0.004*ما + 0.004*مش + 0.004*على + 0.004*الله')
2017-12-16 18:34:37,469 : INFO : (18, '0.010*من + 0.009*في + 0.007*و + 0.006*يا + 0.005*لا + 0.004*ما + 0.004*انا + 0.004*الله + 0.004*مش + 0.004*،')
2017-12-16 18:34:37,476 : INFO : (19, '0.010*من + 0.008*في + 0.006*و + 0.005*يا + 0.005*انا + 0.005*لا + 0.004*على + 0.004*ما + 0.004*الله + 0.004*اللي')
2017-12-16 18:34:37,478 : INFO : PROGRESS: finished document 159744 of 179632
2017-12-16 18:35:45,425 : INFO : (0, '0.022*و + 0.021*من + 0.018*في + 0.017*انا + 0.017*مش + 0.012*كدا + 0.012*اللي + 0.011*بس + 0.010*، + 0.009*ما')
2017-12-16 1

2017-12-16 18:36:56,563 : INFO : (3, '0.011*من + 0.010*في + 0.007*و + 0.005*لا + 0.005*يا + 0.005*الله + 0.004*انا + 0.004*على + 0.004*ما + 0.004*اللي')
2017-12-16 18:36:56,570 : INFO : (4, '0.012*من + 0.009*في + 0.007*و + 0.006*يا + 0.005*لا + 0.005*على + 0.004*ما + 0.004*انا + 0.004*الله + 0.004*اللي')
2017-12-16 18:36:56,577 : INFO : (5, '0.012*من + 0.009*في + 0.007*و + 0.005*، + 0.005*لا + 0.005*على + 0.005*يا + 0.005*الله + 0.005*ما + 0.004*انا')
2017-12-16 18:36:56,584 : INFO : (6, '0.012*من + 0.010*في + 0.007*و + 0.005*ما + 0.005*لا + 0.005*يا + 0.005*الله + 0.005*على + 0.004*انا + 0.004*،')
2017-12-16 18:36:56,591 : INFO : (7, '0.011*من + 0.009*في + 0.007*و + 0.005*لا + 0.005*يا + 0.005*انا + 0.005*الله + 0.004*على + 0.004*ما + 0.004*بس')
2017-12-16 18:36:56,598 : INFO : (8, '0.012*من + 0.010*في + 0.007*و + 0.006*يا + 0.005*لا + 0.005*الله + 0.005*على + 0.005*انا + 0.004*ما + 0.004*كل')
2017-12-16 18:36:56,604 : INFO : (9, '0.011*من + 0.009*في + 0.007*و + 0.005*لا + 0.005*على +

In [123]:
#print hdp results

In [54]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeClassifier

In [49]:
le = LabelEncoder()
df['class_numerical'] = le.fit_transform(df['class'])

In [50]:
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'], df['class_numerical'], random_state = 42)

In [39]:
from gensim import sklearn_api

In [67]:
sk_ld = gensim.sklearn_api.ldamodel.
#ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=50, id2word = dictionary)

In [68]:
type(sk_ld)

module

In [78]:
sk_ld_pipe = Pipeline([
                        ('sk', sk_ld)
                        ('clf', RidgeClassifier())
                    ])

sk_ld_pipe.fit(X_train, y_train)

TypeError: 'tuple' object is not callable

In [84]:
from __future__ import division

import graphlab as gl
import pandas as pd
import pyLDAvis
import pyLDAvis.graphlab

pyLDAvis.enable_notebook()

ModuleNotFoundError: No module named 'graphlab'

In [71]:
type(corpus)

gensim.corpora.mmcorpus.MmCorpus

In [93]:
vis_data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.save_html(vis_data, 'data/lda_75_lem_5_pass.html')

NameError: name 'sims' is not defined

In [77]:
from IPython.display import display

words_data =  pyLDAvis.gensim.prepare(lda,corpus, dictionary)
display(pyLDAvis.display(words_data))


IndexError: index 5000 is out of bounds for axis 1 with size 5000

In [40]:
from gensim.summarization.bm25 import get_bm25_weights

In [112]:
bm25_result = get_bm25_weights(corpus_tfidf)

KeyboardInterrupt: 

For later use

In [None]:
# class MyCorpus(object):
#     def __iter__(self):
#         for line in open('datasets/mycorpus.txt'):
#             # assume there's one document per line, tokens separated by whitespace
#             yield dictionary.doc2bow(line.lower().split())

In [None]:
#corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix)