1: You don't need to replicate every step we did with the new dataset. Just roughly follow the steps in the lab: Data Preparation, Data Transformation, Data Preprocessing, Data Exploration. Pretend you're preparing to actually use that dataset. Make sure you include the steps you need to include.


# Data Preparation

In [70]:
# necessary for when working with external scripts
%load_ext autoreload
%autoreload 2

In [183]:
import pandas as pd
import helpers.data_mining_helpers as dmh

reviews_yelp = pd.read_csv('sentiment labelled sentences/yelp_labelled.txt', sep="\t", names=['Sentence', 'Sentiment'])
reviews_yelp['Source'] = 'Yelp'

reviews_imdb = pd.read_csv('sentiment labelled sentences/imdb_labelled.txt', sep="\t", names=['Sentence', 'Sentiment'])
reviews_imdb['Source'] = 'IMDB'

reviews_amazon = pd.read_csv('sentiment labelled sentences/amazon_cells_labelled.txt', sep="\t", names=['Sentence', 'Sentiment'])
reviews_amazon['Source'] = 'Amazon'

reviews = pd.concat([reviews_yelp, reviews_imdb, reviews_amazon]).reset_index(drop=True)

reviews['Sentiment']= reviews['Sentiment'].astype(str)
reviews

Unnamed: 0,Sentence,Sentiment,Source
0,Wow... Loved this place.,1,Yelp
1,Crust is not good.,0,Yelp
2,Not tasty and the texture was just nasty.,0,Yelp
3,Stopped by during the late May bank holiday of...,1,Yelp
4,The selection on the menu was great and so wer...,1,Yelp
...,...,...,...
2743,The screen does get smudged easily because it ...,0,Amazon
2744,What a piece of junk.. I lose more calls on th...,0,Amazon
2745,Item Does Not Match Picture.,0,Amazon
2746,The only thing that disappoint me is the infra...,0,Amazon


# Data Transformation

In [None]:
# dataframe and relevant columns already converted in above step
# doing some queries to become familiar with data

In [66]:
# query one random record
reviews.loc[1800]

Sentence     A usable keyboard actually turns a PDA into a ...
Sentiment                                                    1
Source                                                  Amazon
Name: 1800, dtype: object

In [67]:
# see whole sentence
reviews.loc[1800, 'Sentence']

'A usable keyboard actually turns a PDA into a real-world useful machine instead of just a neat gadget.'

In [59]:
# show every 100th record, to verify all sources are there
reviews.iloc[::100]

Unnamed: 0,Sentence,Sentiment,Source
0,Wow... Loved this place.,1,Yelp
100,Our server was fantastic and when he found out...,1,Yelp
200,"I had heard good things about this place, but ...",1,Yelp
300,Good beer & drink selection and good food sele...,1,Yelp
400,This one is simply a disappointment.,0,Yelp
500,I also had to taste my Mom's multi-grain pumpk...,1,Yelp
600,I miss it and wish they had one in Philadelphia!,1,Yelp
700,The chips that came out were dripping with gre...,0,Yelp
800,I'm super pissd.,0,Yelp
900,Spend your money elsewhere.,0,Yelp


In [68]:
# return every 5 sentences, show top 10
reviews.loc[::5, "Sentence"][:10]

0                              Wow... Loved this place.
5        Now I am getting angry and I want my damn pho.
10                             Service was very prompt.
15    I was shocked because no signs indicate cash o...
20                                  The Burrittos Blah!
25    That's right....the red velvet cake.....ohhh t...
30    Also there are combos like a burger, fries, an...
35    The only redeeming quality of the restaurant w...
40                         The shrimp tender and moist.
45    The only thing I did like was the prime rib an...
Name: Sentence, dtype: object

In [69]:
# check for missing values
reviews.isnull()

Unnamed: 0,Sentence,Sentiment,Source
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
2743,False,False,False
2744,False,False,False
2745,False,False,False
2746,False,False,False


In [72]:
reviews.isnull().apply(lambda x: dmh.check_missing_values(x))

Sentence     (The amoung of missing records is: , 0)
Sentiment    (The amoung of missing records is: , 0)
Source       (The amoung of missing records is: , 0)
dtype: object

In [73]:
# check for duplicates
reviews.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
2743    False
2744    False
2745    False
2746    False
2747    False
Length: 2748, dtype: bool

In [74]:
sum(reviews.duplicated())

17

In [77]:
# show duplicate records
# based on the output, I have decided to KEEP duplicates as it is plausible for different people to give similar reviews.
# Dropping duplicates could affect the integrity of the data if it is 2 different people giving similarly worded reviews
reviews[reviews.duplicated(keep=False)]

Unnamed: 0,Sentence,Sentiment,Source
334,I love this place.,1,Yelp
380,I won't be back.,0,Yelp
383,The food was terrible.,0,Yelp
505,I would not recommend this place.,0,Yelp
814,I love this place.,1,Yelp
816,The food was terrible.,0,Yelp
843,I won't be back.,0,Yelp
846,I would not recommend this place.,0,Yelp
1029,Definitely worth checking out.,1,IMDB
1064,10/10,1,IMDB


In [79]:
reviews.to_csv('reviews.csv')

# Data Preprocessing

In [124]:
# cleaning data
import string
import re
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords


def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]


# lowercase
reviews['Sentence']=reviews['Sentence'].str.lower()

# remove symbols
reviews['Sentence']=reviews['Sentence'].replace('[^a-zA-Z0-9 ]', '', regex=True)

reviews.to_csv('old.csv')

# lemmatize
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer=WordNetLemmatizer()

reviews['Sentence'] = reviews['Sentence'].apply(lemmatize_text)
reviews['Sentence'] = reviews['Sentence'].apply(' '.join)

reviews.to_csv('new.csv')

# remove stop words
stop_words = set(stopwords.words('english'))
reviews['Sentence'] = reviews['Sentence'].apply(lambda x: [item for item in x.split() if item not in stop_words])
reviews['Sentence'] = reviews['Sentence'].apply(' '.join)

reviews

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tonigarcia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tonigarcia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Sentence,Sentiment,Source
0,wow loved place,1,Yelp
1,crust good,0,Yelp
2,tasty texture wa nasty,0,Yelp
3,stopped late may bank holiday rick steve recom...,1,Yelp
4,selection menu wa great price,1,Yelp
...,...,...,...
2743,screen doe get smudged easily touch ear face,0,Amazon
2744,piece junk lose call phone,0,Amazon
2745,item doe match picture,0,Amazon
2746,thing disappoint infra red port irda,0,Amazon


In [125]:
# Feature Creation
import nltk
from sklearn.feature_extraction.text import CountVectorizer

reviews['Unigrams'] = reviews['Sentence'].apply(lambda x: dmh.tokenize_text(x))

count_vect = CountVectorizer()
reviews_counts = count_vect.fit_transform(reviews['Sentence'])
analyze = count_vect.build_analyzer()

reviews['Unigrams']

0                                     [wow, loved, place]
1                                           [crust, good]
2                             [tasty, texture, wa, nasty]
3       [stopped, late, may, bank, holiday, rick, stev...
4                     [selection, menu, wa, great, price]
                              ...                        
2743    [screen, doe, get, smudged, easily, touch, ear...
2744                     [piece, junk, lose, call, phone]
2745                          [item, doe, match, picture]
2746          [thing, disappoint, infra, red, port, irda]
2747                  [answer, call, unit, never, worked]
Name: Unigrams, Length: 2748, dtype: object

In [126]:
# Feature Subset Selection: Create Document-Term Matrix
matrix = pd.DataFrame(reviews_counts.toarray(), 
                      columns=count_vect.get_feature_names())

matrix

Unnamed: 0,010,0a,0again,0an,0and,0another,0as,0at,0avoid,0bad,...,yukon,yum,yummy,yun,z500a,zero,zillion,zombie,zombiestudents,zombiez
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2743,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2744,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2745,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2746,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [177]:
# Atrribute Transformation / Aggregation
import numpy as np

# getting term frequency of documents
term_frequencies = []
for j in range(0,reviews_counts.shape[1]):
    term_frequencies.append(sum(reviews_counts[:,j].toarray()))
    
term_frequencies = np.asarray(reviews_counts.sum(axis=0))[0]

termfreq = pd.DataFrame( {'term':count_vect.get_feature_names(), 'freq': term_frequencies })
termfreq

Unnamed: 0,term,freq
0,010,2
1,0a,1
2,0again,1
3,0an,1
4,0and,3
...,...,...
4933,zero,6
4934,zillion,1
4935,zombie,1
4936,zombiestudents,1


In [176]:
# check for duplicates
sum(termfreq.duplicated())

0

# Data Exploration

In [185]:
# Visualization 1: positive negative scatter word

# import necessary libraries
import scattertext as st
import spacy
from pprint import pprint
from IPython.core.display import display, HTML
from IPython.display import IFrame


# Turn it into a Scattertext Corpus 
# takes a few minutes
nlp = spacy.load('en')
corpus = st.CorpusFromPandas(reviews, 
                            category_col='Sentiment', 
                              text_col='Sentence',
                              nlp=nlp).build()

# create visualization
html = st.produce_scattertext_explorer(corpus,
          category="1", category_name='Positive', not_category_name='Negative', term_scorer = st.CredTFIDF(corpus))

open("New_Data.html", 'wb').write(html.encode('utf-8'))
IFrame(src='New_Data.html', width = 1000, height=700)
# Please open New_Data.html if you want to use search.

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  corpus_unigram_freq = corpus_freq_df.ix[[term for term


In [None]:
# Visualization 2: bar chart of sources

In [None]:
# Visualization 3: term freq
# https://www.data-to-viz.com/story/OneNumOneCat.html
    
# use sampling