In [None]:
import pandas as pd
import numpy as np 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE

import pyLDAvis.sklearn

In [3]:
# Load the Skytrax review table; the path is relative to the notebook's working directory
reviews_csv = "p_skytrax.csv"
df = pd.read_csv(reviews_csv)

In [4]:
# Preview the first five rows to sanity-check the loaded columns
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,author_review_count,rating,title,date,location,body,Seat Type,Aircraft Type,Seat Layout,...,airline_slug,airline,Viewing Tv Screen,Power Supply,Sleep Comfort,Sitting Comfort,Seat/bed Width,Seat/bed Length,Seat Privacy,cleaned_body
0,0,0.0,0.3,"""seat was not comfortable""",2019-07-22,Malaysia,With their newer A320 aircraft leg room is awf...,Economy Class,A320,3x3,...,airasia,AirAsia,,,,,,,,newer aircraft leg room awful seat comfortable...
1,1,0.0,0.6,"""Check in was easy""",2019-04-06,Australia,Check in was easy and boarding I was in zone o...,Economy Class,A320,3x3,...,airasia,AirAsia,,,,,,,,check easy boarding zone think paid check kg l...
2,2,5.0,0.7,"""A solid experience""",2019-04-01,Australia,"A solid experience from start to finish, espec...",Economy Class,A320,3x3,...,airasia,AirAsia,,,,,,,,solid experience start finish especially given...
3,3,0.0,0.1,"""my 4yr old and I allocated different seats""",2018-01-14,Australia,I discovered upon boarding that my four-year-o...,Economy Class,A330-300,3x3x3,...,airasia,AirAsia,,,,,,,,discovered upon boarding four year old allocat...
4,4,1.0,0.7,"""space between was relatively good""",2017-08-24,Indonesia,"I got 14A seat, near emergency exit door. So t...",Economy Class,A320,3x3,...,airasia,AirAsia,,,,,,,,got seat near emergency exit different standar...


In [7]:
# Count Vectorizer
vect = CountVectorizer()  
vectors = vect.fit_transform(df.cleaned_body)

In [13]:
# Hyperparameter search: try 2, 4 and 6 topics and let GridSearchCV keep the
# topic count with the best mean cross-validated score.
grid_params = {'n_components': list(range(2, 8, 2))}
# LDA model. The fit starts from a random initialisation, so the seed is pinned
# to make the search (and the selected model) repeatable across kernel restarts.
lda = LatentDirichletAllocation(random_state=42)
lda_model = GridSearchCV(lda, param_grid=grid_params, verbose=3)
lda_model.fit(vectors)
# Best estimator found by the search (GridSearchCV refits it on the full matrix by default)
lda_model1 = lda_model.best_estimator_


Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END .................................n_components=2; total time=  49.4s
[CV 2/5] END .................................n_components=2; total time=  57.4s
[CV 3/5] END .................................n_components=2; total time=  36.2s
[CV 4/5] END .................................n_components=2; total time=  55.8s
[CV 5/5] END .................................n_components=2; total time=  59.3s
[CV 1/5] END .................................n_components=4; total time=  40.3s
[CV 2/5] END .................................n_components=4; total time=  36.6s
[CV 3/5] END .................................n_components=4; total time=  46.8s
[CV 4/5] END .................................n_components=4; total time=  45.7s
[CV 5/5] END .................................n_components=4; total time=  56.0s
[CV 1/5] END .................................n_components=6; total time=  41.0s
[CV 2/5] END .................................n_c

NameError: name 'document_term_matrix' is not defined

In [14]:
# Summary of the count-matrix grid search: winning parameters, the mean
# cross-validated score, and the refit model's perplexity on the full matrix.
count_search_report = [
    ("Best LDA model's params", lda_model.best_params_),
    ("Best log likelihood Score for the LDA model", lda_model.best_score_),
    ("LDA model Perplexity on train data", lda_model1.perplexity(vectors)),
]
for label, value in count_search_report:
    print(label, value)

Best LDA model's params {'n_components': 2}
Best log likelihood Score for the LDA model -179934.286061876
LDA model Perplexity on train data 1149.5330035965799


In [23]:
# Interactive topic map for the count-based model, rendered inline in the
# notebook; t-SNE is used to lay out the inter-topic distances.
pyLDAvis.enable_notebook()
count_topic_panel = pyLDAvis.sklearn.prepare(lda_model1, vectors, vect, mds='tsne')
count_topic_panel

  and should_run_async(code)


With just two topics the model does not work well: the two topics are not clearly distinct. I also think it could work better with a TF-IDF matrix instead of a simple count matrix.

# Try with more hyperparameter tuning

In [None]:
# Finer search: every topic count from 2 to 5 on the same count matrix.
grid_params = {'n_components': list(range(2, 6))}
# LDA model. The fit starts from a random initialisation, so the seed is pinned
# to make the search (and the selected model) repeatable across kernel restarts.
lda = LatentDirichletAllocation(random_state=42)
lda_model = GridSearchCV(lda, param_grid=grid_params, verbose=5)
lda_model.fit(vectors)
# Best estimator found by the search (GridSearchCV refits it on the full matrix by default)
lda_model2 = lda_model.best_estimator_

  and should_run_async(code)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END .................................n_components=2; total time=  45.7s
[CV 2/5] END .................................n_components=2; total time=  53.7s
[CV 3/5] END .................................n_components=2; total time=  49.3s
[CV 4/5] END .................................n_components=2; total time=  57.1s
[CV 5/5] END .................................n_components=2; total time=  44.0s
[CV 1/5] END .................................n_components=3; total time=  35.6s
[CV 2/5] END .................................n_components=3; total time=  44.7s
[CV 3/5] END .................................n_components=3; total time=  35.5s


In [None]:
# Summary of the finer count-matrix search: winning parameters, the mean
# cross-validated score, and the refit model's perplexity on the full matrix.
fine_search_report = [
    ("Best LDA model's params", lda_model.best_params_),
    ("Best log likelihood Score for the LDA model", lda_model.best_score_),
    ("LDA model Perplexity on train data", lda_model2.perplexity(vectors)),
]
for label, value in fine_search_report:
    print(label, value)

In [None]:
# Interactive topic map for the model picked by the finer search, rendered
# inline in the notebook; t-SNE is used to lay out the inter-topic distances.
pyLDAvis.enable_notebook()
fine_topic_panel = pyLDAvis.sklearn.prepare(lda_model2, vectors, vect, mds='tsne')
fine_topic_panel

# Try using a TF-IDF matrix instead of word counts

In [None]:
# Count Vectorizer
tfidf = TfidfVectorizer()  
tfidf_matrix = tfidf.fit_transform(df.cleaned_body)

In [None]:
# Same 2-to-5 topic search as before, this time on the TF-IDF matrix.
grid_params = {'n_components': list(range(2, 6))}
# LDA model. The fit starts from a random initialisation, so the seed is pinned
# to make the search (and the selected model) repeatable across kernel restarts.
lda = LatentDirichletAllocation(random_state=42)
lda_model = GridSearchCV(lda, param_grid=grid_params, verbose=5)
lda_model.fit(tfidf_matrix)
# Best estimator found by the search (GridSearchCV refits it on the full matrix by default)
lda_model3 = lda_model.best_estimator_

In [None]:
# Summary of the TF-IDF grid search. Perplexity is evaluated on tfidf_matrix,
# the matrix lda_model3 was fitted on, not on the raw-count matrix `vectors`;
# otherwise the "train data" figure describes data the model never saw.
print("Best LDA model's params", lda_model.best_params_)
print("Best log likelihood Score for the LDA model", lda_model.best_score_)
print("LDA model Perplexity on train data", lda_model3.perplexity(tfidf_matrix))

In [None]:
# Interactive topic map for the TF-IDF-based model, rendered inline in the
# notebook; t-SNE is used to lay out the inter-topic distances.
pyLDAvis.enable_notebook()
tfidf_topic_panel = pyLDAvis.sklearn.prepare(lda_model3, tfidf_matrix, tfidf, mds='tsne')
tfidf_topic_panel