In [None]:
import re
import itertools
from __future__ import print_function
import pandas as pd
import datetime as dt
import numpy as np
import nltk
import string
import scipy.sparse as sparse
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

%matplotlib inline
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases removed the old names, so use the new name whenever it is available.
plt.style.use('seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white')

re_spaces = re.compile(r'\s+')

from mpl_toolkits.mplot3d import axes3d
from nltk.corpus import stopwords

In [None]:
def cleaned_reviews(x):
    """Return ``x`` with every character that is not an ASCII letter or underscore replaced by a space."""
    # re.sub already produces a str, so the substituted text is returned as-is.
    blanked = re.sub('[^a-zA-Z_]', ' ', x)
    return blanked

In [None]:
import spacy
import en_core_web_sm

nlp = en_core_web_sm.load()

In [None]:
from nltk.tokenize import RegexpTokenizer
def get_bigram_likelihood(statements, freq_filter=3, nbest=200):
    """
    Score the bi-grams of a text by likelihood ratio and return the best ones.

    :param  statements: text to analyse, given as one string (it is handed
                        straight to ``RegexpTokenizer.tokenize``)
    :param freq_filter: minimum number of occurrences for a bi-gram to be kept
    :param       nbest: maximum number of top-scoring bi-grams to return
    :return: list of ``((word1, word2), score)`` tuples, best score first
    """
    # Imported locally so the function does not depend on the later notebook
    # cell that imports BigramCollocationFinder having been run first.
    from nltk.collocations import BigramCollocationFinder

    # keep only runs of word characters (drops punctuation)
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(statements)

    bigram_measures = nltk.collocations.BigramAssocMeasures()
    bigram_finder = BigramCollocationFinder.from_words(words)

    # only bi-grams that appear n+ times
    bigram_finder.apply_freq_filter(freq_filter)

    # TODO: use custom stop words
    # Load the stop-word list once instead of once per token inside the filter.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    bigram_finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in stop_words)

    # score_ngrams() returns the scored bi-grams ordered best-first, so slicing
    # applies the `nbest` limit while keeping the (bigram, score) return shape.
    scored_bigrams = bigram_finder.score_ngrams(bigram_measures.likelihood_ratio)
    return scored_bigrams[:nbest]

In [None]:
table_data = pd.read_csv('All_Data')

In [None]:
df2 = table_data.copy()

In [None]:
df2 = df2.rename(index=str, columns={"Unnamed: 0": "ID"})

In [None]:
# Keep the first whitespace-separated token of each rating and parse it as a number in
# one vectorized pass; blank or non-numeric entries become NaN instead of raising.
df2['Ratings'] = pd.to_numeric(df2['Ratings'].map(str).str.split().str[0], errors='coerce')

In [None]:
df2['Ratings'].sort_values().reset_index(drop=True).dropna().plot()

In [None]:
# Coerce every entry to text first, then blank out anything that is not a letter or underscore.
df2['Reviews'] = df2['Reviews'].apply(str).apply(cleaned_reviews)

In [None]:
df2['Reviews']

In [None]:
len(df2)

In [None]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

In [None]:
# word_tokenize already returns a flat list of tokens per review; wrapping that list in a
# 2-D NumPy array just to concatenate it back to 1-D added work without changing the tokens.
df2['Comments'] = df2['Reviews'].apply(word_tokenize)

In [None]:
df2['Comments']

In [None]:
def reattach_contractions(wordlist):
    """
    Glue split contraction suffixes back onto the token that precedes them.

    A token that starts with an apostrophe (e.g. ``'s``) or equals ``n't`` is
    appended to the previous word; every other token is kept unchanged.

    :param wordlist: iterable of string tokens
    :return: new list of tokens; the input is not modified
    """
    words = []
    for word in wordlist:
        # startswith() is safe for empty tokens, unlike indexing word[0].
        is_suffix = word.startswith("'") or word == "n't"
        # A suffix can only be attached when a previous word exists; a leading
        # suffix is kept as its own token instead of raising IndexError.
        if is_suffix and words:
            words[-1] = words[-1] + word
        else:
            words.append(word)
    return words

In [None]:
# Merge contraction suffixes back into their preceding tokens, row by row.
df2['Comments'] = df2['Comments'].apply(reattach_contractions)

In [None]:
df2['Comments']

In [None]:
def bigramify(words, freq_filter=3, nbest=3):
    """
    Return the strongest bi-gram collocations found in ``words``, ranked by PMI.

    :param       words: sequence of tokens
    :param freq_filter: minimum number of occurrences for a bi-gram to be kept (default 3)
    :param       nbest: maximum number of bi-grams to return (default 3)
    :return: list of at most ``nbest`` ``(word1, word2)`` tuples
    """
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(words)
    # Drop bi-grams seen fewer than `freq_filter` times before ranking them.
    finder.apply_freq_filter(freq_filter)
    return finder.nbest(bigram_measures.pmi, nbest)

In [None]:
# Replace each row's token list with its top PMI bi-grams.
df2['Comments'] = df2['Comments'].apply(bigramify)

In [None]:
df2['Comments']

In [None]:
def sample_reviews(id):
    """
    Build highlighted review texts for the rows of ``df2`` whose ``ID`` equals ``id``.

    Every bigram stored in the ``Comments`` column of the matching rows is joined with a
    single space and searched for in the ``Reviews`` texts of those same rows. The first
    review containing it is returned with each occurrence wrapped in ``****`` markers.

    :param id: value compared against ``df2['ID']`` (the name shadows the builtin but is
               kept so existing callers keep working)
    :return: list of highlighted review strings, one per bigram that was found; when the
             matching rows hold no bigrams at all, the plain review texts are returned
    """
    matching_rows = df2[df2['ID'] == id]  # filter the frame once, reuse for both columns
    review_texts = matching_rows['Reviews'].values
    bigrams = [
        bigram
        for row_bigrams in matching_rows['Comments'].values.tolist()
        for bigram in row_bigrams
    ]
    if not bigrams:
        return list(review_texts)

    highlighted_samples = []
    for bigram in bigrams:
        phrase = " ".join(bigram)
        first_match = next((txt for txt in review_texts if phrase in txt), None)
        if first_match is None:
            continue
        highlighted = first_match.replace(phrase, "****" + phrase + "****")
        start_index = highlighted.index("****")
        window = len(highlighted)
        # Clamp the lower bound at 0: `start_index - window` is always negative, and a
        # negative slice start wraps around, which silently dropped all text that came
        # before the highlighted bigram.
        highlighted_samples.append(highlighted[max(0, start_index - window): start_index + window])
    return highlighted_samples

In [None]:
# sample_reviews reads the bigrams currently stored in df2['Comments'], and this cell then
# overwrites that column with the sampled texts, so it is not safe to re-run on its own.
df2['Comments'] = df2['ID'].apply(sample_reviews)

In [None]:
df2['Comments']

In [None]:
# Collapse each row's list of sampled texts into one space-separated string.
df2['Comments'] = df2['Comments'].apply(' '.join)

In [None]:
df2['Comments']

In [None]:
df2

# Sentiment Analysis of reviews

In [None]:
import nltk
nltk.download('vader_lexicon')

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [None]:
def review_sentiment(string):
    """Return the polarity scores that the module-level VADER ``analyser`` computes for ``string``."""
    return analyser.polarity_scores(string)

In [None]:
df_preCol = df2.copy()

In [None]:
df_posCol = df2.copy()

In [None]:
# Score the cleaned full review text of every row.
df_preCol['Sentiments'] = df_preCol['Reviews'].apply(review_sentiment)

In [None]:
# Score the bigram-based sample text of every row.
df_posCol['Sentiments'] = df_posCol['Comments'].apply(review_sentiment)

In [None]:
# Expand the per-row score dicts into one column per score. Building the frame once from
# the list of dicts avoids constructing a separate Series for every row, which is what
# `.apply(pd.Series)` does. The cell consumes the 'Sentiments' column, so it cannot be
# re-run without recomputing that column first.
df_preCol = pd.concat(
    [
        df_preCol.drop(['Sentiments'], axis=1),
        pd.DataFrame(df_preCol['Sentiments'].tolist(), index=df_preCol.index),
    ],
    axis=1,
)

In [None]:
# Expand the per-row score dicts into one column per score. Building the frame once from
# the list of dicts avoids constructing a separate Series for every row, which is what
# `.apply(pd.Series)` does. The cell consumes the 'Sentiments' column, so it cannot be
# re-run without recomputing that column first.
df_posCol = pd.concat(
    [
        df_posCol.drop(['Sentiments'], axis=1),
        pd.DataFrame(df_posCol['Sentiments'].tolist(), index=df_posCol.index),
    ],
    axis=1,
)

In [None]:
df_preCol

In [None]:
df_posCol

In [None]:
# Gather the sample text and its sentiment scores from df_posCol under descriptive names.
scored_reviews = pd.DataFrame(
    {
        'review': df_posCol['Comments'],
        'compound': df_posCol['compound'],
        'negativity': df_posCol['neg'],
        'neutrality': df_posCol['neu'],
        'positivity': df_posCol['pos'],
    }
)

In [None]:
(scored_reviews['neutrality']).plot(kind='hist')

In [None]:
(scored_reviews['positivity']).plot(kind='hist')

In [None]:
(scored_reviews['negativity']).plot(kind='hist')

In [None]:
scored_reviews

In [None]:
scored_reviews.query('negativity > 0')

In [None]:
scored_reviews.query('negativity > positivity').query('negativity > 0.1')

In [None]:
scored_reviews.query('negativity > positivity').query('compound < -0.2')