In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Read only the five columns this notebook works with from the reviews CSV.
wanted_columns = [
    'HelpfulnessNumerator',
    'HelpfulnessDenominator',
    'Score',
    'Summary',
    'Text',
]
reviews_df = pd.read_csv('../input/Reviews.csv', usecols=wanted_columns)
print(reviews_df.shape)
reviews_df.head()

In [None]:
# Report how many summaries are missing, drop every row that contains a NaN
# in any column, then report again.
print(reviews_df['Summary'].isnull().sum())
reviews_df = reviews_df.dropna(axis=0, how='any')
print(reviews_df['Summary'].isnull().sum())

In [None]:
# Label a review 'useful' when more than 80% of its helpfulness votes were positive.
# A review with no votes gives 0/0 = NaN, which fails the comparison and is labelled 'useless'.
helpful_ratio = reviews_df['HelpfulnessNumerator'] / reviews_df['HelpfulnessDenominator']
reviews_df['Usefulness'] = helpful_ratio.apply(lambda ratio: 'useful' if ratio > 0.8 else 'useless')
# Drop the raw vote counts by name instead of by position (iloc[:, 2:]), so the
# result does not depend on the column order of the CSV file.
reviews_df = reviews_df.drop(['HelpfulnessNumerator', 'HelpfulnessDenominator'], axis=1)
# Remove rows whose Score is exactly 3.
reviews_df = reviews_df[reviews_df.Score != 3]

In [None]:
# Collapse the numeric score into a binary label: above 3 -> 'pos', otherwise 'neg'.
reviews_df['Score'] = reviews_df['Score'].map(
    lambda score: 'pos' if score > 3 else 'neg'
)
reviews_df.head()

In [None]:
# Bar chart of how many reviews carry each Score label.
reviews_df['Score'].value_counts().plot.bar()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# Bag of 1- to 4-grams, fitted on the first two summaries only as a small demo.
vectorizer = CountVectorizer(ngram_range=(1, 4), min_df=1)
X_train_counts = vectorizer.fit_transform(reviews_df['Summary'].iloc[:2])
# Re-weight the raw n-gram counts with TF-IDF.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [None]:
# Display the sparse n-gram count matrix in dense form.
X_train_counts.todense()

In [None]:
# List the n-grams the vectorizer learned.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() — confirm which version this kernel runs.
vectorizer.get_feature_names()

In [None]:
# The vocabulary size and the number of columns of the count matrix should agree.
print(len(vectorizer.get_feature_names()))
print(X_train_counts.shape[1])

In [None]:
# Vectorize one further summary with the already-fitted vocabulary.
# Positional .iloc is used on purpose: after dropna() and the Score filter the
# index has gaps, so the label lookup Summary[3] can raise KeyError, and it is
# inconsistent with the positional slices (Summary[0:2], Summary[3:4]) used elsewhere.
X_test = vectorizer.transform([reviews_df.Summary.iloc[3]])
X_test.toarray()

In [None]:
import gensim
# Display the installed gensim version.
# NOTE(review): later cells use `size=` and `wv.vocab`, which gensim 4.0 renamed/removed
# (`vector_size`, `wv.key_to_index`) — confirm this kernel runs gensim 3.x.
gensim.__version__

In [None]:
import re
def preprocessing(content):
    """Lower-case `content` and blank out everything that is not a letter a-z.

    Every character outside the lowercase ASCII letters (digits, punctuation,
    whitespace, ...) is replaced by one space each; leading and trailing
    whitespace is then stripped. Runs of replaced characters are NOT collapsed,
    so the result may contain consecutive spaces.
    """
    lowered = content.lower()
    # re.sub(pattern, replacement, input_string)
    letters_only = re.sub('[^a-z]', ' ', lowered)
    return letters_only.strip()

In [None]:
from nltk.tokenize import sent_tokenize, wordpunct_tokenize
def getSentences(docs):
    """Yield one list of lowercase word tokens per sentence found in `docs`.

    Parameters
    ----------
    docs : iterable of str
        Raw documents (e.g. review summaries).

    Yields
    ------
    list of str
        Tokens of a single cleaned sentence. Sentences that contain no
        letters after cleaning are skipped, so no empty lists are yielded.
    """
    for doc in docs:
        # Split into sentences BEFORE cleaning: preprocessing() replaces every
        # non-letter (including sentence-ending punctuation) with a space, so
        # tokenizing the cleaned text could never find a sentence boundary.
        for sent in sent_tokenize(doc):
            tokens = wordpunct_tokenize(preprocessing(sent))
            if tokens:
                yield tokens
# Show the first two summaries, then the tokenized form of the first ten.
# .iloc is positional: the index has gaps after dropna() and the Score filter,
# so label lookups such as Summary[0] can raise KeyError and would not match the
# positional slice Summary[:10] used below.
print(reviews_df.Summary.iloc[0],'\n', reviews_df.Summary.iloc[1])
print('-'*10)
for sent in getSentences(reviews_df.Summary[:10]):
    print(sent)

In [None]:
from gensim.models import Word2Vec
# Hyper-parameters forwarded to Word2Vec below under the same keyword names.
min_count = 1
size = 5
window = 3
# Train a tiny demo model on the tokenized first two summaries only.
sentences = list(getSentences(reviews_df.Summary[:2]))
# NOTE(review): `size=` and `wv.vocab` are gensim 3.x API; gensim 4.0 uses
# `vector_size=` and `wv.key_to_index` — confirm against the installed version.
model = Word2Vec(sentences, min_count=min_count, size=size, window=window)
# Display the words the model learned.
model.wv.vocab.keys()

In [None]:
# Update the existing model with one more summary (positional slice 3:4):
# first extend the vocabulary, then continue training on the new sentences.
new_sentences = list(getSentences(reviews_df.Summary[3:4]))
model.build_vocab(new_sentences, update=True)
model.train(new_sentences, total_examples=model.corpus_count, epochs=model.epochs)


In [None]:
# Display the vocabulary again to see the words added by the update.
model.wv.vocab.keys()

In [None]:
from gensim.models.phrases import Phrases, Phraser
# Tokenized first two summaries, used as the demo corpus for phrase detection.
sentences = list(getSentences(reviews_df.Summary[:2]))

# Collocation detector scored with NPMI; min_count and threshold are the
# acceptance limits handed to gensim.
phrases = Phrases(sentences,
                  min_count=2,
                  threshold = 0.5, scoring = 'npmi')
# Phraser wraps the fitted Phrases object for applying it to sentences.
phrase_model = Phraser(phrases)
# Display the vocabulary collected by the detector.
phrases.vocab

In [None]:
# Get all phrases and their scores in the model
sent_phrases = phrases.export_phrases(sentences)
for p in sent_phrases:
    print(p)

In [None]:
# Updating the Phraser model
#new_sentences = list(getSentences(reviews_df.Summary[3:4]))
# The SAME demo sentences are added a second time here.
# NOTE(review): presumably this raises their counts enough to pass min_count=2 — confirm.
phrases.add_vocab(sentences)
# Rebuild the Phraser so it reflects the updated Phrases object.
phrase_model = Phraser(phrases)
# Get all phrases and their scores in the model
sent_phrases = phrases.export_phrases(sentences)
for p in sent_phrases:
    print(p)

In [None]:
def get_phrased_sentences(phrases_model, tokened_sentences):
    """Lazily apply `phrases_model` to every sentence.

    Each item of `tokened_sentences` is looked up with ``phrases_model[item]``
    and the result is yielded in the same order.
    """
    yield from (phrases_model[tokens] for tokens in tokened_sentences)

# Print every demo sentence after phrase substitution.
for s in get_phrased_sentences(phrase_model, sentences):
    print(s)

In [None]:
# phrase2vec model
min_count = 1
size = 5
window = 3
# The phraser works on token lists, so tokenize the summaries first. The
# original passed the raw strings, which are iterated character by character,
# so no phrase token such as 'good_quality' could ever enter the vocabulary.
sentences = list(get_phrased_sentences(phrase_model, getSentences(reviews_df.Summary[:2])))
phrase2vec_model = Word2Vec(sentences, min_count=min_count, size=size, window=window)
# Display the tokens (words and joined phrases) the model learned.
phrase2vec_model.wv.vocab.keys()

In [None]:
# Look up the learned vector of the phrase token 'good_quality'.
# NOTE(review): this only works if 'good_quality' is in the model's vocabulary — verify.
phrase2vec_model.wv.get_vector('good_quality')

# Phrase2vec model for whole corpus

In [None]:
# Tokenize every summary in the data set; the result is materialized as a list
# because it is reused by several later cells.
tokened_sentences = list(getSentences(reviews_df.Summary))

In [None]:
def get_phrases(sentences, min_count=2, threshold=0.5, scoring='npmi'):
    """Fit a gensim ``Phrases`` collocation detector on tokenized sentences.

    Parameters
    ----------
    sentences : iterable of list of str
        Tokenized sentences to learn phrases from.
    min_count : int, optional
        Forwarded to ``Phrases`` as ``min_count`` (default 2).
    threshold : float, optional
        Forwarded to ``Phrases`` as ``threshold`` (default 0.5).
    scoring : str, optional
        Forwarded to ``Phrases`` as ``scoring`` (default 'npmi').

    Returns
    -------
    Phrases
        The fitted detector. The defaults reproduce the previously
        hard-coded settings, so existing one-argument calls are unchanged.
    """
    return Phrases(sentences,
                   min_count=min_count,
                   threshold=threshold,
                   scoring=scoring)

In [None]:
# Fit the phrase detector on the whole tokenized corpus and wrap it in a
# Phraser for applying it to sentences.
phrases_full = get_phrases(tokened_sentences)
phrasemodel_full  = Phraser(phrases_full)

In [None]:
# Print the phrases (with scores) detected in the first ten tokenized sentences.
for p in phrases_full.export_phrases(tokened_sentences[:10]):
    print(p)

In [None]:
# Show the same ten tokenized sentences for comparison with the phrases above.
tokened_sentences[:10]

In [None]:
# Apply the full-corpus phraser to every tokenized sentence and keep the result in memory.
phrased_sentences = list(get_phrased_sentences(phrasemodel_full, tokened_sentences))

In [None]:
# Train the phrase-aware embedding on the whole corpus.
min_count = 1
size = 5
window = 3
phrase2vec_full = Word2Vec(phrased_sentences, min_count=min_count, size=size, window=window)
# The full-corpus vocabulary is far too large to dump as cell output, so
# report its size and show only a small sample of tokens.
print(len(phrase2vec_full.wv.vocab))
list(phrase2vec_full.wv.vocab.keys())[:20]


# WordCloud

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib as mpl

# Words the word cloud should ignore.
stopwords = set(STOPWORDS)

#mpl.rcParams['figure.figsize']=(8.0,6.0)    #(6.0,4.0)
# Global matplotlib settings used by the word-cloud figure.
mpl.rcParams.update({
    'font.size': 12,                #10
    'savefig.dpi': 100,             #72
    'figure.subplot.bottom': .1,
})


def show_wordcloud(data, title = None):
    """Draw a word cloud of `data` with matplotlib.

    Parameters
    ----------
    data : str or iterable
        A single text, or an iterable of texts (e.g. a pandas Series). The
        items of an iterable are converted with ``str`` and joined by spaces.
        Any other object falls back to ``str(data)``.
    title : str, optional
        Figure title; omitted when falsy.
    """
    # Imported locally because the notebook only imports `matplotlib as mpl`;
    # the original body used `plt` without it ever being defined (NameError).
    import matplotlib.pyplot as plt

    # str() of a pandas Series is its truncated repr (index labels, "...",
    # "Name: ..., dtype: ..."), not the text it holds, so join the entries.
    if isinstance(data, str):
        text = data
    else:
        try:
            text = ' '.join(str(item) for item in data)
        except TypeError:
            # Not iterable: keep the original str() behaviour.
            text = str(data)

    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40,
        scale=3,
        random_state=1 # chosen at random by flipping a coin; it was heads
    ).generate(text)

    fig = plt.figure(1, figsize=(8, 8))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()
    
# Word cloud over the review summaries (no title).
show_wordcloud(reviews_df.Summary)