#### What words come up most frequently in positive reviews? and in negative reviews? 

In [1]:
from sklearn.model_selection import cross_val_score 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import warnings

from gensim import corpora, models, matutils
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from collections import defaultdict

from nltk.corpus import stopwords
import nltk

%config InlineBackend.figure_format = 'retina'
%matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
# we open the file for reading
fileObject1 = open("df_air.pickle",'rb')
fileObject2 = open("df_main.pickle",'rb')
fileObject3 = open("star.pickle",'rb')
fileObject4 = open("budget.pickle",'rb')

# load the object from the file into variables
df_air = pickle.load(fileObject1)  
df = pickle.load(fileObject2) 
star = pickle.load(fileObject3) 
budget = pickle.load(fileObject4) 

In [3]:
# Use stop word from NLTK
nltk.download('stopwords')
nltk_stops = stopwords.words()

# Use stop word from SKLearn
custom_stop_words = list(ENGLISH_STOP_WORDS)
custom_stop_words.append('flight')
custom_stop_words.append("flights")
custom_stop_words.append("fly")
custom_stop_words.append("flew")
custom_stop_words.append("airline")
custom_stop_words.append("airlines")
custom_stop_words.append("just")
custom_stop_words.append("did")
custom_stop_words.append("told")

# Fit and Transform using sklearn stop word.
vectorizer = CountVectorizer(stop_words=custom_stop_words)
X = vectorizer.fit_transform(df['content'])

# Use Vocabulary - parameter of CountVectorizer
# `.vocabulary_` attribute of the vectorizer contains a dictionary of terms. 
vectorizer.vocabulary_
vectorizer.get_feature_names()

# Get counts of tokens.
docs = pd.DataFrame(X.todense(), 
                    columns=vectorizer.get_feature_names())
docs.sum().sort_values(ascending=False).head(25)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


service          22501
time             21456
good             20292
food             19782
seats            19099
seat             16700
crew             15643
staff            15384
plane            13139
class            12650
cabin            12532
return           12153
check            11589
hours            10912
business         10421
airport           9761
passengers        8742
entertainment     8711
hour              8674
economy           8668
air               8138
friendly          7895
new               7809
comfortable       7596
boarding          7506
dtype: int64

print(len(vectorizer.vocabulary_))
print(vectorizer.vocabulary_)
print(len(vectorizer.get_feature_names()))
print(vectorizer.get_feature_names())

In [4]:
# This is the fastest way to swap a dictionary key / value order.  
# This is the format gensim LDA expects it's vocabulary.
vocab = {v: k for k, v in vectorizer.vocabulary_.items()}
print(len(vocab))
print(vocab)

35326


In [5]:
frequency = defaultdict(int)
for text in df['content']:
    for token in text.split():
        frequency[token] += 1        
frequency

defaultdict(int,
            {'Outbound': 559,
             'flight': 48563,
             'FRA/PRN': 1,
             'A319.': 120,
             '2': 6401,
             'hours': 8833,
             '10': 2119,
             'min': 684,
             'flight.': 8848,
             'I': 73124,
             'thought': 1050,
             'drinks/snacks': 11,
             'for': 50489,
             'sale': 122,
             'but': 30662,
             'sandwich': 859,
             'soft': 585,
             'drinks': 4221,
             'were': 46530,
             'served': 4294,
             'complimentary.': 35,
             'Inbound': 162,
             'flights': 12275,
             'SKP/LJU/FRA': 1,
             'CRJ900.': 12,
             'each': 1958,
             '1': 2543,
             'hour': 7631,
             '30': 2057,
             'Skyshop': 1,
             'menu': 794,
             'was': 103466,
             'in': 64406,
             'a': 91418,
             'seat': 13155,
         

In [6]:
texts = [[token for token in text.split() if frequency[token] > 1 and token not in nltk_stops]
          for text in df['content']]
texts

[['Outbound',
  'flight',
  'A319.',
  '2',
  'hours',
  '10',
  'flight.',
  'I',
  'thought',
  'drinks/snacks',
  'sandwich',
  'soft',
  'drinks',
  'served',
  'complimentary.',
  'Inbound',
  'flights',
  'CRJ900.',
  '1',
  'hour',
  '30',
  'flight.',
  'menu',
  'seat',
  'pocket',
  'drinks/snacks',
  'sale.',
  'All',
  'flight',
  'crews',
  'friendly.',
  'Security',
  'check',
  'Ljubljana',
  'airport',
  'transit',
  'passengers',
  'chaos',
  'however',
  'possible',
  'go',
  'gate',
  'within',
  '30min.'],
 ['Two',
  'short',
  'hops',
  'Very',
  'fast',
  'CRJ',
  '90.',
  'Seats',
  'comfortable',
  'crew',
  'fine.',
  'Ground',
  'services',
  'good',
  'lounge',
  'quite',
  'nice.'],
 ['Flew',
  'newish',
  'CRJ900.',
  'Flight',
  'almost',
  'full',
  'departure',
  'time.',
  'Service',
  'board',
  '2',
  'pleasant',
  'friendly',
  'female',
  'flight',
  'attendants.',
  'I',
  'bought',
  'coffee',
  'mineral',
  'water',
  'nice',
  'fresh',
  'filled

In [7]:
# Create gensim dictionary object
dictionary = corpora.Dictionary(texts)
dictionary

<gensim.corpora.dictionary.Dictionary at 0x12f932ae320>

In [8]:
# Create corpus matrix
corpus = [dictionary.doc2bow(text) for text in texts]

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 2),
  (20, 2),
  (21, 2),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1)],
 [(41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1)],
 [(2, 1),
  (7, 1),
  (8, 1),
  (20, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1)],
 [(10, 1),
  (20, 1),
  (35, 1),
  (47, 1),
  (51, 1)

In [74]:
lda = models.LdaModel(
    matutils.Sparse2Corpus(X, documents_columns=False),
    num_topics  =  10,
    passes      =  50,
    id2word     =  vocab
)

In [75]:
for topic in lda.print_topics():
    print(topic[1])
    print('')

0.032*"check" + 0.019*"bag" + 0.017*"pay" + 0.016*"boarding" + 0.015*"luggage" + 0.013*"extra" + 0.012*"baggage" + 0.012*"checked" + 0.011*"bags" + 0.011*"online"

0.027*"good" + 0.026*"time" + 0.021*"crew" + 0.016*"friendly" + 0.016*"cabin" + 0.014*"check" + 0.013*"service" + 0.013*"staff" + 0.011*"food" + 0.011*"return"

0.023*"hours" + 0.017*"airport" + 0.017*"delayed" + 0.016*"plane" + 0.015*"time" + 0.013*"hour" + 0.012*"minutes" + 0.011*"late" + 0.011*"passengers" + 0.010*"gate"

0.054*"london" + 0.046*"gatwick" + 0.044*"manchester" + 0.043*"ryanair" + 0.025*"heathrow" + 0.025*"lisbon" + 0.022*"journey" + 0.022*"return" + 0.022*"barcelona" + 0.018*"rome"

0.030*"class" + 0.024*"business" + 0.023*"good" + 0.023*"food" + 0.021*"service" + 0.017*"economy" + 0.012*"seats" + 0.011*"cabin" + 0.011*"seat" + 0.010*"excellent"

0.019*"2014" + 0.017*"time" + 0.016*"good" + 0.016*"great" + 0.015*"staff" + 0.014*"food" + 0.014*"trip" + 0.012*"lax" + 0.012*"jfk" + 0.011*"united"

0.036*"singa

In [76]:
for topic,score in lda.get_document_topics(dictionary.doc2bow(texts[1])):
    print(topic,score)

0 0.099999234
1 0.09999865
2 0.09999853
3 0.100004084
4 0.09999854
5 0.099999994
6 0.10000332
7 0.09999989
8 0.099998996
9 0.09999876


In [77]:
topics_labels = {
   0: "Bag", 
   1: "Duration", 
   2: "City",
   3: "Class",
   4: "Year",
   5: "?",
    6: "Food",
    7: "Service",
    8: "Seat"
}

In [78]:
[lda.get_document_topics(doc) for doc in corpus[:10000]]

[[(0, 0.014278865), (1, 0.014278176), (2, 0.60155463), (3, 0.14433475), (4, 0.15416107), (5, 0.014278214), (6, 0.014279704), (7, 0.014278212), (8, 0.014278186), (9, 0.01427818)], [(0, 0.099999234), (1, 0.09999865), (2, 0.09999853), (3, 0.100004084), (4, 0.09999854), (5, 0.099999994), (6, 0.10000332), (7, 0.09999989), (8, 0.099998996), (9, 0.09999876)], [(0, 0.049333986), (1, 0.049333815), (2, 0.049342867), (3, 0.049335416), (4, 0.049333785), (5, 0.55598295), (6, 0.0493352), (7, 0.049334183), (8, 0.049333915), (9, 0.049333848)], [(0, 0.046571538), (1, 0.046571296), (2, 0.04657124), (3, 0.580855), (4, 0.04657125), (5, 0.046571862), (6, 0.046573244), (7, 0.046571825), (8, 0.046571437), (9, 0.04657134)], [(0, 0.0108529525), (1, 0.010853212), (2, 0.4598299), (3, 0.4533435), (4, 0.010852932), (5, 0.010855338), (6, 0.010853305), (7, 0.010852973), (8, 0.010852946), (9, 0.0108529385)], [(0, 0.016391244), (1, 0.016391223), (2, 0.016394509), (3, 0.528399), (4, 0.17533247), (5, 0.18152611), (6, 0.

In [79]:
doc_topics = [lda.get_document_topics(doc) for doc in corpus[:10000]]
topic_data = []

for document_id, topics in enumerate(doc_topics):   
    document_topics = []  
    for topic, probability in topics:  
        topic_data.append({
            'document_id':  document_id,
            'topic_id':     topic,
            'topic':        topics_labels[topic],
            'probability':  probability
        })
topics_df = pd.DataFrame(topic_data)
topics_df.pivot_table(values="probability", index=["document_id", "topic"]).T

KeyError: 9

In [None]:
import pprint
pp = pprint.PrettyPrinter(depth=6)
pp.pprint(topics_df)

-----